diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 08909f8b28..7f6d6b0fb9 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -124,6 +124,10 @@ #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX struct VZeroUpperGuard { +#ifdef __GNUC__ + __attribute__((always_inline)) +#endif + inline VZeroUpperGuard() { _mm256_zeroupper(); } #ifdef __GNUC__ __attribute__((always_inline)) #endif diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp index 1ea8c28643..c3f5b87267 100644 --- a/modules/core/include/opencv2/core/private.hpp +++ b/modules/core/include/opencv2/core/private.hpp @@ -796,9 +796,9 @@ CV_EXPORTS InstrNode* getCurrentNode(); #endif #ifdef __CV_AVX_GUARD -#define CV_INSTRUMENT_REGION(); __CV_AVX_GUARD CV_INSTRUMENT_REGION_(); +#define CV_INSTRUMENT_REGION() __CV_AVX_GUARD CV_INSTRUMENT_REGION_(); #else -#define CV_INSTRUMENT_REGION(); CV_INSTRUMENT_REGION_(); +#define CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION_(); #endif namespace cv { diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index 1caadbbbad..6232aa5fab 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,3 +1,6 @@ set(the_description "Image Processing") ocv_add_dispatched_file(accum SSE4_1 AVX AVX2) +ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2) ocv_define_module(imgproc opencv_core WRAP java python js) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 38d35c014d..8f268e07e0 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -3,6 +3,7 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" +#include "opencl_kernels_imgproc.hpp" #include "color.hpp" namespace cv diff --git a/modules/imgproc/src/color.hpp b/modules/imgproc/src/color.hpp index 70e7844277..8c1f19fa8a 100644 --- a/modules/imgproc/src/color.hpp +++ b/modules/imgproc/src/color.hpp @@ -3,59 +3,17 @@ // of this distribution and at http://opencv.org/license.html #include "opencv2/imgproc.hpp" -#include "opencv2/core/utility.hpp" -#include -#include "opencl_kernels_imgproc.hpp" #include "hal_replacement.hpp" -#include "opencv2/core/hal/intrin.hpp" -#include "opencv2/core/softfloat.hpp" -#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) - -namespace cv -{ - -//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601 -const float B2YF = 0.114f; -const float G2YF = 0.587f; -const float R2YF = 0.299f; - -enum -{ - yuv_shift = 14, - xyz_shift = 12, - R2Y = 4899, // == R2YF*16384 - G2Y = 9617, // == G2YF*16384 - B2Y = 1868, // == B2YF*16384 - BLOCK_SIZE = 256 -}; - -template struct ColorChannel -{ - typedef float worktype_f; - static _Tp max() { return std::numeric_limits<_Tp>::max(); } - static _Tp half() { return (_Tp)(max()/2 + 1); } -}; - -template<> struct ColorChannel -{ - typedef float worktype_f; - static float max() { return 1.f; } - static float half() { return 0.5f; } -}; - -/*template<> struct ColorChannel -{ - typedef double worktype_f; - static double max() { return 1.; } - static double half() { return 0.5; } -};*/ +namespace cv { // // Helper functions // -namespace { +namespace impl { + +#include "color.simd_helpers.hpp" inline bool isHSV(int code) { @@ -209,40 +167,9 @@ inline int uIndex(int code) } } // namespace:: +using namespace impl; -template -struct Set -{ - static bool contains(int i) - { - return (i == i0 || i == i1 || i == i2); - } -}; - -template -struct Set -{ - static bool contains(int i) - { - return (i == i0 || i == i1); - } -}; - -template -struct Set -{ - static bool contains(int i) - { - return (i == i0); - } -}; - -enum SizePolicy -{ - TO_YUV, FROM_YUV, NONE -}; - -template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE > +/*template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE > struct CvtHelper { CvtHelper(InputArray _src, OutputArray _dst, int dcn) @@ -282,7 +209,7 @@ struct CvtHelper Mat src, dst; int depth, scn; Size dstSz; -}; +};*/ #ifdef HAVE_OPENCL @@ -380,49 +307,7 @@ struct OclHelper #endif -///////////////////////////// Top-level template function //////////////////////////////// - -template -class CvtColorLoop_Invoker : public ParallelLoopBody -{ - typedef typename Cvt::channel_type _Tp; -public: - - CvtColorLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt) : - ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), - width(width_), cvt(_cvt) - { - } - - virtual void operator()(const Range& range) const CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - - const uchar* yS = src_data + static_cast(range.start) * src_step; - uchar* yD = dst_data + static_cast(range.start) * dst_step; - for( int i = range.start; i < range.end; ++i, yS += src_step, yD += dst_step ) - cvt(reinterpret_cast(yS), reinterpret_cast<_Tp*>(yD), width); - } - -private: - const uchar * src_data; - const size_t src_step; - uchar * dst_data; - const size_t dst_step; - const int width; - const Cvt& cvt; - - const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&); -}; - -template -void CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt) -{ - parallel_for_(Range(0, height), - CvtColorLoop_Invoker(src_data, src_step, dst_data, dst_step, width, cvt), - (width * height) / static_cast(1<<16)); -} #if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) # define NEED_IPP 1 diff --git a/modules/imgproc/src/color.simd_helpers.hpp b/modules/imgproc/src/color.simd_helpers.hpp index 70e7844277..343491f2c6 100644 --- a/modules/imgproc/src/color.simd_helpers.hpp +++ b/modules/imgproc/src/color.simd_helpers.hpp @@ -2,23 +2,14 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html -#include "opencv2/imgproc.hpp" -#include "opencv2/core/utility.hpp" -#include -#include "opencl_kernels_imgproc.hpp" -#include "hal_replacement.hpp" -#include "opencv2/core/hal/intrin.hpp" -#include "opencv2/core/softfloat.hpp" - #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) -namespace cv -{ +namespace { //constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601 -const float B2YF = 0.114f; -const float G2YF = 0.587f; -const float R2YF = 0.299f; +static const float B2YF = 0.114f; +static const float G2YF = 0.587f; +static const float R2YF = 0.299f; enum { @@ -33,15 +24,15 @@ enum template struct ColorChannel { typedef float worktype_f; - static _Tp max() { return std::numeric_limits<_Tp>::max(); } - static _Tp half() { return (_Tp)(max()/2 + 1); } + static inline _Tp max() { return std::numeric_limits<_Tp>::max(); } + static inline _Tp half() { return (_Tp)(max()/2 + 1); } }; template<> struct ColorChannel { typedef float worktype_f; - static float max() { return 1.f; } - static float half() { return 0.5f; } + static inline float max() { return 1.f; } + static inline float half() { return 0.5f; } }; /*template<> struct ColorChannel @@ -51,169 +42,11 @@ template<> struct ColorChannel static double half() { return 0.5; } };*/ -// -// Helper functions -// - -namespace { - -inline bool isHSV(int code) -{ - switch(code) - { - case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: - case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL: - return true; - default: - return false; - } -} - -inline bool isLab(int code) -{ - switch (code) - { - case COLOR_Lab2BGR: case COLOR_Lab2RGB: case COLOR_Lab2LBGR: case COLOR_Lab2LRGB: - case COLOR_BGR2Lab: case COLOR_RGB2Lab: case COLOR_LBGR2Lab: case COLOR_LRGB2Lab: - return true; - default: - return false; - } -} - -inline bool is_sRGB(int code) -{ - switch (code) - { - case COLOR_BGR2Lab: case COLOR_RGB2Lab: case COLOR_BGR2Luv: case COLOR_RGB2Luv: - case COLOR_Lab2BGR: case COLOR_Lab2RGB: case COLOR_Luv2BGR: case COLOR_Luv2RGB: - return true; - default: - return false; - } -} - -inline bool swapBlue(int code) -{ - switch (code) - { - case COLOR_BGR2BGRA: case COLOR_BGRA2BGR: - case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: - case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: - case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY: - case COLOR_BGR2YCrCb: case COLOR_BGR2YUV: - case COLOR_YCrCb2BGR: case COLOR_YUV2BGR: - case COLOR_BGR2XYZ: case COLOR_XYZ2BGR: - case COLOR_BGR2HSV: case COLOR_BGR2HLS: case COLOR_BGR2HSV_FULL: case COLOR_BGR2HLS_FULL: - case COLOR_YUV2BGR_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2BGRA_IYUV: - case COLOR_YUV2BGR_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2BGRA_NV12: - case COLOR_Lab2BGR: case COLOR_Luv2BGR: case COLOR_Lab2LBGR: case COLOR_Luv2LBGR: - case COLOR_BGR2Lab: case COLOR_BGR2Luv: case COLOR_LBGR2Lab: case COLOR_LBGR2Luv: - case COLOR_HSV2BGR: case COLOR_HLS2BGR: case COLOR_HSV2BGR_FULL: case COLOR_HLS2BGR_FULL: - case COLOR_YUV2BGR_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2BGR_YUY2: - case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2BGRA_YVYU: - case COLOR_BGR2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: case COLOR_BGR2YUV_YV12: case COLOR_BGRA2YUV_YV12: - return false; - default: - return true; - } -} - -inline bool isFullRangeHSV(int code) -{ - switch (code) - { - case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL: - case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL: - return true; - default: - return false; - } -} - -inline int dstChannels(int code) -{ - switch( code ) - { - case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2RGBA: - case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA: - case COLOR_GRAY2BGRA: - case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12: - case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV: - case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU: - case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: - - return 4; - - case COLOR_BGRA2BGR: case COLOR_RGBA2BGR: case COLOR_RGB2BGR: - case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB: - case COLOR_GRAY2BGR: - case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12: - case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: - case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: - case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: - - return 3; - - default: - return 0; - } -} - -inline int greenBits(int code) -{ - switch( code ) - { - case COLOR_BGR2BGR565: case COLOR_RGB2BGR565: case COLOR_BGRA2BGR565: case COLOR_RGBA2BGR565: - case COLOR_BGR5652BGR: case COLOR_BGR5652RGB: case COLOR_BGR5652BGRA: case COLOR_BGR5652RGBA: - case COLOR_BGR5652GRAY: case COLOR_GRAY2BGR565: - - return 6; - - case COLOR_BGR2BGR555: case COLOR_RGB2BGR555: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR555: - case COLOR_BGR5552BGR: case COLOR_BGR5552RGB: case COLOR_BGR5552BGRA: case COLOR_BGR5552RGBA: - case COLOR_BGR5552GRAY: case COLOR_GRAY2BGR555: - - return 5; - - default: - return 0; - } -} - -inline int uIndex(int code) -{ - switch( code ) - { - case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12: - - return 2; - - case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU: - case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: - case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: - case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12: - - return 1; - - case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12: - case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV: - case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: - case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: - - return 0; - - default: - return -1; - } -} - -} // namespace:: template struct Set { - static bool contains(int i) + static inline bool contains(int i) { return (i == i0 || i == i1 || i == i2); } @@ -222,7 +55,7 @@ struct Set template struct Set { - static bool contains(int i) + static inline bool contains(int i) { return (i == i0 || i == i1); } @@ -231,7 +64,7 @@ struct Set template struct Set { - static bool contains(int i) + static inline bool contains(int i) { return (i == i0); } @@ -284,101 +117,6 @@ struct CvtHelper Size dstSz; }; -#ifdef HAVE_OPENCL - -template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE > -struct OclHelper -{ - OclHelper( InputArray _src, OutputArray _dst, int dcn) : - nArgs(0) - { - src = _src.getUMat(); - Size sz = src.size(), dstSz; - int scn = src.channels(); - int depth = src.depth(); - - CV_Assert( VScn::contains(scn) && VDcn::contains(dcn) && VDepth::contains(depth) ); - switch (sizePolicy) - { - case TO_YUV: - CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 ); - dstSz = Size(sz.width, sz.height / 2 * 3); - break; - case FROM_YUV: - CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 ); - dstSz = Size(sz.width, sz.height * 2 / 3); - break; - case NONE: - default: - dstSz = sz; - break; - } - - _dst.create(dstSz, CV_MAKETYPE(depth, dcn)); - dst = _dst.getUMat(); - } - - bool createKernel(cv::String name, ocl::ProgramSource& source, cv::String options) - { - ocl::Device dev = ocl::Device::getDefault(); - int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1; - int pxPerWIx = 1; - - cv::String baseOptions = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ", - src.depth(), src.channels(), pxPerWIy); - - switch (sizePolicy) - { - case TO_YUV: - if (dev.isIntel() && - src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 && - dst.step % 4 == 0 && dst.offset % 4 == 0) - { - pxPerWIx = 2; - } - globalSize[0] = (size_t)dst.cols/(2*pxPerWIx); - globalSize[1] = ((size_t)dst.rows/3 + pxPerWIy - 1) / pxPerWIy; - baseOptions += format("-D PIX_PER_WI_X=%d ", pxPerWIx); - break; - case FROM_YUV: - globalSize[0] = (size_t)dst.cols/2; - globalSize[1] = ((size_t)dst.rows/2 + pxPerWIy - 1) / pxPerWIy; - break; - case NONE: - default: - globalSize[0] = (size_t)src.cols; - globalSize[1] = ((size_t)src.rows + pxPerWIy - 1) / pxPerWIy; - break; - } - - k.create(name.c_str(), source, baseOptions + options); - - if(k.empty()) - return false; - - nArgs = k.set(0, ocl::KernelArg::ReadOnlyNoSize(src)); - nArgs = k.set(nArgs, ocl::KernelArg::WriteOnly(dst)); - return true; - } - - bool run() - { - return k.run(2, globalSize, NULL, false); - } - - template - void setArg(const T& arg) - { - nArgs = k.set(nArgs, arg); - } - - UMat src, dst; - ocl::Kernel k; - size_t globalSize[2]; - int nArgs; -}; - -#endif ///////////////////////////// Top-level template function //////////////////////////////// @@ -413,261 +151,17 @@ private: const int width; const Cvt& cvt; - const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&); + CvtColorLoop_Invoker(const CvtColorLoop_Invoker&); // = delete; + const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&); // = delete; }; -template +template static inline void CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt) { + CV_AVX_GUARD parallel_for_(Range(0, height), CvtColorLoop_Invoker(src_data, src_step, dst_data, dst_step, width, cvt), (width * height) / static_cast(1<<16)); } -#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) -# define NEED_IPP 1 -#else -# define NEED_IPP 0 -#endif - -#if NEED_IPP - -#define MAX_IPP8u 255 -#define MAX_IPP16u 65535 -#define MAX_IPP32f 1.0 - -typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *); -typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize); -typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *); - -template -class CvtColorIPPLoop_Invoker : - public ParallelLoopBody -{ -public: - - CvtColorIPPLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt, bool *_ok) : - ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), width(width_), cvt(_cvt), ok(_ok) - { - *ok = true; - } - - virtual void operator()(const Range& range) const CV_OVERRIDE - { - const void *yS = src_data + src_step * range.start; - void *yD = dst_data + dst_step * range.start; - if( !cvt(yS, static_cast(src_step), yD, static_cast(dst_step), width, range.end - range.start) ) - *ok = false; - else - { - CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); - } - } - -private: - const uchar * src_data; - const size_t src_step; - uchar * dst_data; - const size_t dst_step; - const int width; - const Cvt& cvt; - bool *ok; - - const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&); -}; - - -template -bool CvtColorIPPLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt) -{ - bool ok; - parallel_for_(Range(0, height), CvtColorIPPLoop_Invoker(src_data, src_step, dst_data, dst_step, width, cvt, &ok), (width * height)/(double)(1<<16) ); - return ok; -} - - -template -bool CvtColorIPPLoopCopy(const uchar * src_data, size_t src_step, int src_type, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt) -{ - Mat temp; - Mat src(Size(width, height), src_type, const_cast(src_data), src_step); - Mat source = src; - if( src_data == dst_data ) - { - src.copyTo(temp); - source = temp; - } - bool ok; - parallel_for_(Range(0, source.rows), - CvtColorIPPLoop_Invoker(source.data, source.step, dst_data, dst_step, - source.cols, cvt, &ok), - source.total()/(double)(1<<16) ); - return ok; -} - - -struct IPPGeneralFunctor -{ - IPPGeneralFunctor(ippiGeneralFunc _func) : ippiColorConvertGeneral(_func){} - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - return ippiColorConvertGeneral ? CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0 : false; - } -private: - ippiGeneralFunc ippiColorConvertGeneral; -}; - - -struct IPPReorderFunctor -{ - IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : ippiColorConvertReorder(_func) - { - order[0] = _order0; - order[1] = _order1; - order[2] = _order2; - order[3] = 3; - } - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - return ippiColorConvertReorder ? CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0 : false; - } -private: - ippiReorderFunc ippiColorConvertReorder; - int order[4]; -}; - - -struct IPPReorderGeneralFunctor -{ - IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) : - ippiColorConvertReorder(_func1), ippiColorConvertGeneral(_func2), depth(_depth) - { - order[0] = _order0; - order[1] = _order1; - order[2] = _order2; - order[3] = 3; - } - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - if (ippiColorConvertReorder == 0 || ippiColorConvertGeneral == 0) - return false; - - Mat temp; - temp.create(rows, cols, CV_MAKETYPE(depth, 3)); - if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) < 0) - return false; - return CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0; - } -private: - ippiReorderFunc ippiColorConvertReorder; - ippiGeneralFunc ippiColorConvertGeneral; - int order[4]; - int depth; -}; - - -struct IPPGeneralReorderFunctor -{ - IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) : - ippiColorConvertGeneral(_func1), ippiColorConvertReorder(_func2), depth(_depth) - { - order[0] = _order0; - order[1] = _order1; - order[2] = _order2; - order[3] = 3; - } - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - if (ippiColorConvertGeneral == 0 || ippiColorConvertReorder == 0) - return false; - - Mat temp; - temp.create(rows, cols, CV_MAKETYPE(depth, 3)); - if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0) - return false; - return CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0; - } -private: - ippiGeneralFunc ippiColorConvertGeneral; - ippiReorderFunc ippiColorConvertReorder; - int order[4]; - int depth; -}; - -extern ippiReorderFunc ippiSwapChannelsC3C4RTab[8]; -extern ippiReorderFunc ippiSwapChannelsC4C3RTab[8]; -extern ippiReorderFunc ippiSwapChannelsC3RTab[8]; - -#endif - -#ifdef HAVE_OPENCL - -bool oclCvtColorBGR2Luv( InputArray _src, OutputArray _dst, int bidx, bool srgb ); -bool oclCvtColorBGR2Lab( InputArray _src, OutputArray _dst, int bidx, bool srgb ); -bool oclCvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb); -bool oclCvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb); -bool oclCvtColorBGR2XYZ( InputArray _src, OutputArray _dst, int bidx ); -bool oclCvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx ); - -bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ); -bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ); -bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full ); -bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full ); - -bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse ); -bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits ); -bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits ); -bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits ); -bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits ); -bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx ); -bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn ); -bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst ); -bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst ); - -bool oclCvtColorBGR2YCrCb( InputArray _src, OutputArray _dst, int bidx); -bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx); -bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx ); -bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx ); - -bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx ); -bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ); -bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ); -bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx ); -bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst ); - -#endif - -void cvtColorBGR2Lab( InputArray _src, OutputArray _dst, bool swapb, bool srgb); -void cvtColorBGR2Luv( InputArray _src, OutputArray _dst, bool swapb, bool srgb); -void cvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb ); -void cvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb ); -void cvtColorBGR2XYZ( InputArray _src, OutputArray _dst, bool swapb ); -void cvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb ); - -void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, bool crcb); -void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb); - -void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn); -void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx ); -void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx ); -void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx ); -void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx); -void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst ); -void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi ); - -void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ); -void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ); -void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange); -void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange); - -void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb); -void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits); -void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits); -void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb); -void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn); -void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits); -void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits); -void cvtColorRGBA2mRGBA(InputArray _src, OutputArray _dst); -void cvtColormRGBA2RGBA(InputArray _src, OutputArray _dst); - -} //namespace cv +} //namespace diff --git a/modules/imgproc/src/color_hsv.dispatch.cpp b/modules/imgproc/src/color_hsv.dispatch.cpp index f0a4c87558..f1678f5deb 100644 --- a/modules/imgproc/src/color_hsv.dispatch.cpp +++ b/modules/imgproc/src/color_hsv.dispatch.cpp @@ -3,1194 +3,15 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" -#include "color.hpp" - -namespace cv -{ - -////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// - - -struct RGB2HSV_b -{ - typedef uchar channel_type; - - RGB2HSV_b(int _srccn, int _blueIdx, int _hrange) - : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) - { - CV_Assert( hrange == 180 || hrange == 256 ); - } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int i, bidx = blueIdx, scn = srccn; - const int hsv_shift = 12; - - static int sdiv_table[256]; - static int hdiv_table180[256]; - static int hdiv_table256[256]; - static volatile bool initialized = false; - - int hr = hrange; - const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256; - n *= 3; - - if( !initialized ) - { - sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0; - for( i = 1; i < 256; i++ ) - { - sdiv_table[i] = saturate_cast((255 << hsv_shift)/(1.*i)); - hdiv_table180[i] = saturate_cast((180 << hsv_shift)/(6.*i)); - hdiv_table256[i] = saturate_cast((256 << hsv_shift)/(6.*i)); - } - initialized = true; - } - - for( i = 0; i < n; i += 3, src += scn ) - { - int b = src[bidx], g = src[1], r = src[bidx^2]; - int h, s, v = b; - int vmin = b; - int vr, vg; - - CV_CALC_MAX_8U( v, g ); - CV_CALC_MAX_8U( v, r ); - CV_CALC_MIN_8U( vmin, g ); - CV_CALC_MIN_8U( vmin, r ); - - uchar diff = saturate_cast(v - vmin); - vr = v == r ? -1 : 0; - vg = v == g ? -1 : 0; - - s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift; - h = (vr & (g - b)) + - (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); - h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift; - h += h < 0 ? hr : 0; - - dst[i] = saturate_cast(h); - dst[i+1] = (uchar)s; - dst[i+2] = (uchar)v; - } - } - - int srccn, blueIdx, hrange; -}; - - -struct RGB2HSV_f -{ - typedef float channel_type; - - RGB2HSV_f(int _srccn, int _blueIdx, float _hrange) - : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) { - #if CV_SIMD128 - hasSIMD = hasSIMD128(); - #endif - } - - #if CV_SIMD128 - inline void process(v_float32x4& v_r, v_float32x4& v_g, - v_float32x4& v_b, float hscale) const - { - v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b); - v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b); - - v_float32x4 v_eps = v_setall_f32(FLT_EPSILON); - v_float32x4 v_diff = v_max_rgb - v_min_rgb; - v_float32x4 v_s = v_diff / (v_abs(v_max_rgb) + v_eps); - - v_float32x4 v_r_eq_max = v_r == v_max_rgb; - v_float32x4 v_g_eq_max = v_g == v_max_rgb; - v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b, - v_select(v_g_eq_max, v_b - v_r, v_r - v_g)); - v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f), - v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f))); - v_float32x4 v_rev_diff = v_setall_f32(60.0f) / (v_diff + v_eps); - v_r = v_muladd(v_h, v_rev_diff, v_res) * v_setall_f32(hscale); - - v_g = v_s; - v_b = v_max_rgb; - } - #endif - - void operator()(const float* src, float* dst, int n) const - { - int i = 0, bidx = blueIdx, scn = srccn; - float hscale = hrange*(1.f/360.f); - n *= 3; - - #if CV_SIMD128 - if (hasSIMD) - { - if (scn == 3) { - if (bidx) { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_load_deinterleave(src, v_r, v_g, v_b); - process(v_r, v_g, v_b, hscale); - v_store_interleave(dst + i, v_r, v_g, v_b); - } - } else { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_load_deinterleave(src, v_r, v_g, v_b); - process(v_b, v_g, v_r, hscale); - v_store_interleave(dst + i, v_b, v_g, v_r); - } - } - } else { // scn == 4 - if (bidx) { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_float32x4 v_a; - v_load_deinterleave(src, v_r, v_g, v_b, v_a); - process(v_r, v_g, v_b, hscale); - v_store_interleave(dst + i, v_r, v_g, v_b); - } - } else { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_float32x4 v_a; - v_load_deinterleave(src, v_r, v_g, v_b, v_a); - process(v_b, v_g, v_r, hscale); - v_store_interleave(dst + i, v_b, v_g, v_r); - } - } - } - } - #endif - - for( ; i < n; i += 3, src += scn ) - { - float b = src[bidx], g = src[1], r = src[bidx^2]; - float h, s, v; - - float vmin, diff; - - v = vmin = r; - if( v < g ) v = g; - if( v < b ) v = b; - if( vmin > g ) vmin = g; - if( vmin > b ) vmin = b; - - diff = v - vmin; - s = diff/(float)(fabs(v) + FLT_EPSILON); - diff = (float)(60./(diff + FLT_EPSILON)); - if( v == r ) - h = (g - b)*diff; - else if( v == g ) - h = (b - r)*diff + 120.f; - else - h = (r - g)*diff + 240.f; - - if( h < 0 ) h += 360.f; - - dst[i] = h*hscale; - dst[i+1] = s; - dst[i+2] = v; - } - } - - int srccn, blueIdx; - float hrange; - #if CV_SIMD128 - bool hasSIMD; - #endif -}; - - -#if CV_SIMD128 -inline void HSV2RGB_simd(v_float32x4& v_h, v_float32x4& v_s, v_float32x4& v_v, float hscale) -{ - v_h = v_h * v_setall_f32(hscale); - v_float32x4 v_pre_sector = v_cvt_f32(v_trunc(v_h)); - v_h = v_h - v_pre_sector; - v_float32x4 v_tab0 = v_v; - v_float32x4 v_one = v_setall_f32(1.0f); - v_float32x4 v_tab1 = v_v * (v_one - v_s); - v_float32x4 v_tab2 = v_v * (v_one - (v_s * v_h)); - v_float32x4 v_tab3 = v_v * (v_one - (v_s * (v_one - v_h))); - - v_float32x4 v_one_sixth = v_setall_f32(1.0f / 6.0f); - v_float32x4 v_sector = v_pre_sector * v_one_sixth; - v_sector = v_cvt_f32(v_trunc(v_sector)); - v_float32x4 v_six = v_setall_f32(6.0f); - v_sector = v_pre_sector - (v_sector * v_six); - - v_float32x4 v_two = v_setall_f32(2.0f); - v_h = v_tab1 & (v_sector < v_two); - v_h = v_h | (v_tab3 & (v_sector == v_two)); - v_float32x4 v_three = v_setall_f32(3.0f); - v_h = v_h | (v_tab0 & (v_sector == v_three)); - v_float32x4 v_four = v_setall_f32(4.0f); - v_h = v_h | (v_tab0 & (v_sector == v_four)); - v_h = v_h | (v_tab2 & (v_sector > v_four)); - - v_s = v_tab3 & (v_sector < v_one); - v_s = v_s | (v_tab0 & (v_sector == v_one)); - v_s = v_s | (v_tab0 & (v_sector == v_two)); - v_s = v_s | (v_tab2 & (v_sector == v_three)); - v_s = v_s | (v_tab1 & (v_sector > v_three)); - - v_v = v_tab0 & (v_sector < v_one); - v_v = v_v | (v_tab2 & (v_sector == v_one)); - v_v = v_v | (v_tab1 & (v_sector == v_two)); - v_v = v_v | (v_tab1 & (v_sector == v_three)); - v_v = v_v | (v_tab3 & (v_sector == v_four)); - v_v = v_v | (v_tab0 & (v_sector > v_four)); -} -#endif - - -inline void HSV2RGB_native(const float* src, float* dst, const float hscale, const int bidx) -{ - float h = src[0], s = src[1], v = src[2]; - float b, g, r; - - if( s == 0 ) - b = g = r = v; - else - { - static const int sector_data[][3]= - {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - float tab[4]; - int sector; - h *= hscale; - if( h < 0 ) - do h += 6; while( h < 0 ); - else if( h >= 6 ) - do h -= 6; while( h >= 6 ); - sector = cvFloor(h); - h -= sector; - if( (unsigned)sector >= 6u ) - { - sector = 0; - h = 0.f; - } - - tab[0] = v; - tab[1] = v*(1.f - s); - tab[2] = v*(1.f - s*h); - tab[3] = v*(1.f - s*(1.f - h)); - - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; - } - - dst[bidx] = b; - dst[1] = g; - dst[bidx^2] = r; -} - -struct HSV2RGB_f -{ - typedef float channel_type; - - HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange) - : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) { - #if CV_SIMD128 - hasSIMD = hasSIMD128(); - #endif - } - - void operator()(const float* src, float* dst, int n) const - { - int i = 0, bidx = blueIdx, dcn = dstcn; - n *= 3; - - if (dcn == 3) - { - #if CV_SIMD128 - if (hasSIMD) - { - for (; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_src[3]; - v_load_deinterleave(src + i, v_src[0], v_src[1], v_src[2]); - HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale); - v_store_interleave(dst, v_src[bidx], v_src[1], v_src[bidx^2]); - } - } - #endif - for( ; i < n; i += 3, dst += dcn ) - { - HSV2RGB_native(src + i, dst, hscale, bidx); - } - } else { // dcn == 4 - float alpha = ColorChannel::max(); - #if CV_SIMD128 - if (hasSIMD) - { - for (; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_src[3]; - v_load_deinterleave(src + i, v_src[0], v_src[1], v_src[2]); - HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale); - v_float32x4 v_a = v_setall_f32(alpha); - v_store_interleave(dst, v_src[bidx], v_src[1], v_src[bidx^2], v_a); - } - } - #endif - for( ; i < n; i += 3, dst += dcn ) - { - HSV2RGB_native(src + i, dst, hscale, bidx); - dst[3] = alpha; - } - } - } - - int dstcn, blueIdx; - float hscale; - #if CV_SIMD128 - bool hasSIMD; - #endif -}; - - -struct HSV2RGB_b -{ - typedef uchar channel_type; - - HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange) - : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.0f / _hrange) - { - #if CV_SIMD128 - hasSIMD = hasSIMD128(); - #endif - } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int j = 0, dcn = dstcn; - uchar alpha = ColorChannel::max(); - - #if CV_SIMD128 - if (hasSIMD) - { - for (j = 0; j <= (n - 16) * 3; j += 48, dst += dcn * 16) - { - v_uint8x16 h_b, s_b, v_b; - v_uint16x8 h_w[2], s_w[2], v_w[2]; - v_uint32x4 h_u[4], s_u[4], v_u[4]; - v_load_deinterleave(src + j, h_b, s_b, v_b); - v_expand(h_b, h_w[0], h_w[1]); - v_expand(s_b, s_w[0], s_w[1]); - v_expand(v_b, v_w[0], v_w[1]); - v_expand(h_w[0], h_u[0], h_u[1]); - v_expand(h_w[1], h_u[2], h_u[3]); - v_expand(s_w[0], s_u[0], s_u[1]); - v_expand(s_w[1], s_u[2], s_u[3]); - v_expand(v_w[0], v_u[0], v_u[1]); - v_expand(v_w[1], v_u[2], v_u[3]); - - v_int32x4 b_i[4], g_i[4], r_i[4]; - v_float32x4 v_coeff0 = v_setall_f32(1.0f / 255.0f); - v_float32x4 v_coeff1 = v_setall_f32(255.0f); - - for( int k = 0; k < 4; k++ ) - { - v_float32x4 v_src[3]; - v_src[0] = v_cvt_f32(v_reinterpret_as_s32(h_u[k])); - v_src[1] = v_cvt_f32(v_reinterpret_as_s32(s_u[k])); - v_src[2] = v_cvt_f32(v_reinterpret_as_s32(v_u[k])); - - v_src[1] *= v_coeff0; - v_src[2] *= v_coeff0; - HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale); - - v_src[0] *= v_coeff1; - v_src[1] *= v_coeff1; - v_src[2] *= v_coeff1; - b_i[k] = v_trunc(v_src[0]); - g_i[k] = v_trunc(v_src[1]); - r_i[k] = v_trunc(v_src[2]); - } - - v_uint16x8 r_w[2], g_w[2], b_w[2]; - v_uint8x16 r_b, g_b, b_b; - - r_w[0] = v_pack_u(r_i[0], r_i[1]); - r_w[1] = v_pack_u(r_i[2], r_i[3]); - r_b = v_pack(r_w[0], r_w[1]); - g_w[0] = v_pack_u(g_i[0], g_i[1]); - g_w[1] = v_pack_u(g_i[2], g_i[3]); - g_b = v_pack(g_w[0], g_w[1]); - b_w[0] = v_pack_u(b_i[0], b_i[1]); - b_w[1] = v_pack_u(b_i[2], b_i[3]); - b_b = v_pack(b_w[0], b_w[1]); - - if( dcn == 3 ) - { - if( blueIdx == 0 ) - v_store_interleave(dst, b_b, g_b, r_b); - else - v_store_interleave(dst, r_b, g_b, b_b); - } - else - { - v_uint8x16 alpha_b = v_setall_u8(alpha); - if( blueIdx == 0 ) - v_store_interleave(dst, b_b, g_b, r_b, alpha_b); - else - v_store_interleave(dst, r_b, g_b, b_b, alpha_b); - } - } - } - #endif - for( ; j < n * 3; j += 3, dst += dcn ) - { - float buf[6]; - buf[0] = src[j]; - buf[1] = src[j+1] * (1.0f / 255.0f); - buf[2] = src[j+2] * (1.0f / 255.0f); - HSV2RGB_native(buf, buf + 3, hscale, blueIdx); - dst[0] = saturate_cast(buf[3] * 255.0f); - dst[1] = saturate_cast(buf[4] * 255.0f); - dst[2] = saturate_cast(buf[5] * 255.0f); - if( dcn == 4 ) - dst[3] = alpha; - } - } - - int dstcn; - int blueIdx; - float hscale; - #if CV_SIMD128 - bool hasSIMD; - #endif -}; - - -///////////////////////////////////// RGB <-> HLS //////////////////////////////////////// - -struct RGB2HLS_f -{ - typedef float channel_type; - - RGB2HLS_f(int _srccn, int _blueIdx, float _hrange) - : srccn(_srccn), blueIdx(_blueIdx), hscale(_hrange/360.f) { - #if CV_SIMD128 - hasSIMD = hasSIMD128(); - #endif - } - - #if CV_SIMD128 - inline void process(v_float32x4& v_r, v_float32x4& v_g, - v_float32x4& v_b, v_float32x4& v_hscale) const - { - v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b); - v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b); - - v_float32x4 v_diff = v_max_rgb - v_min_rgb; - v_float32x4 v_sum = v_max_rgb + v_min_rgb; - v_float32x4 v_half = v_setall_f32(0.5f); - v_float32x4 v_l = v_sum * v_half; - - v_float32x4 v_s = v_diff / v_select(v_l < v_half, v_sum, v_setall_f32(2.0f) - v_sum); - - v_float32x4 v_r_eq_max = v_max_rgb == v_r; - v_float32x4 v_g_eq_max = v_max_rgb == v_g; - v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b, - v_select(v_g_eq_max, v_b - v_r, v_r - v_g)); - v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f), - v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f))); - v_float32x4 v_rev_diff = v_setall_f32(60.0f) / v_diff; - v_h = v_muladd(v_h, v_rev_diff, v_res) * v_hscale; - - v_float32x4 v_diff_gt_eps = v_diff > v_setall_f32(FLT_EPSILON); - v_r = v_diff_gt_eps & v_h; - v_g = v_l; - v_b = v_diff_gt_eps & v_s; - } - #endif - - void operator()(const float* src, float* dst, int n) const - { - int i = 0, bidx = blueIdx, scn = srccn; - n *= 3; - - #if CV_SIMD128 - if (hasSIMD) - { - v_float32x4 v_hscale = v_setall_f32(hscale); - if (scn == 3) { - if (bidx) { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_load_deinterleave(src, v_r, v_g, v_b); - process(v_r, v_g, v_b, v_hscale); - v_store_interleave(dst + i, v_r, v_g, v_b); - } - } else { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_load_deinterleave(src, v_r, v_g, v_b); - process(v_b, v_g, v_r, v_hscale); - v_store_interleave(dst + i, v_b, v_g, v_r); - } - } - } else { // scn == 4 - if (bidx) { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_float32x4 v_a; - v_load_deinterleave(src, v_r, v_g, v_b, v_a); - process(v_r, v_g, v_b, v_hscale); - v_store_interleave(dst + i, v_r, v_g, v_b); - } - } else { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_float32x4 v_a; - v_load_deinterleave(src, v_r, v_g, v_b, v_a); - process(v_b, v_g, v_r, v_hscale); - v_store_interleave(dst + i, v_b, v_g, v_r); - } - } - } - } - #endif - - for( ; i < n; i += 3, src += scn ) - { - float b = src[bidx], g = src[1], r = src[bidx^2]; - float h = 0.f, s = 0.f, l; - float vmin, vmax, diff; - - vmax = vmin = r; - if( vmax < g ) vmax = g; - if( vmax < b ) vmax = b; - if( vmin > g ) vmin = g; - if( vmin > b ) vmin = b; - - diff = vmax - vmin; - l = (vmax + vmin)*0.5f; - - if( diff > FLT_EPSILON ) - { - s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin); - diff = 60.f/diff; - - if( vmax == r ) - h = (g - b)*diff; - else if( vmax == g ) - h = (b - r)*diff + 120.f; - else - h = (r - g)*diff + 240.f; - - if( h < 0.f ) h += 360.f; - } - - dst[i] = h*hscale; - dst[i+1] = l; - dst[i+2] = s; - } - } +#include "opencl_kernels_imgproc.hpp" - int srccn, blueIdx; - float hscale; - #if CV_SIMD128 - bool hasSIMD; - #endif -}; - - -struct RGB2HLS_b -{ - typedef uchar channel_type; - - RGB2HLS_b(int _srccn, int _blueIdx, int _hrange) - : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange) - { - #if CV_NEON - v_scale_inv = vdupq_n_f32(1.f/255.f); - v_scale = vdupq_n_f32(255.f); - v_alpha = vdup_n_u8(ColorChannel::max()); - #elif CV_SSE2 - v_scale_inv = _mm_set1_ps(1.f/255.f); - v_zero = _mm_setzero_si128(); - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); - #endif - } - - #if CV_SSE2 - void process(const float * buf, - __m128 & v_coeffs, uchar * dst) const - { - __m128 v_l0f = _mm_load_ps(buf); - __m128 v_l1f = _mm_load_ps(buf + 4); - __m128 v_u0f = _mm_load_ps(buf + 8); - __m128 v_u1f = _mm_load_ps(buf + 12); - - v_l0f = _mm_mul_ps(v_l0f, v_coeffs); - v_u1f = _mm_mul_ps(v_u1f, v_coeffs); - v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92)); - v_u0f = _mm_mul_ps(v_u0f, v_coeffs); - v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92)); - v_l1f = _mm_mul_ps(v_l1f, v_coeffs); - - __m128i v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f)); - __m128i v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f)); - __m128i v_l0 = _mm_packus_epi16(v_l, v_u); - - _mm_storeu_si128((__m128i *)(dst), v_l0); - } - #endif - - void operator()(const uchar* src, uchar* dst, int n) const - { - int i, j, scn = srccn; - float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; - #if CV_SSE2 - __m128 v_coeffs = _mm_set_ps(1.f, 255.f, 255.f, 1.f); - #endif - - for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 ) - { - int dn = std::min(n - i, (int)BLOCK_SIZE); - j = 0; - - #if CV_NEON - for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn) - { - uint16x8_t v_t0, v_t1, v_t2; - - if (scn == 3) - { - uint8x8x3_t v_src = vld3_u8(src); - v_t0 = vmovl_u8(v_src.val[0]); - v_t1 = vmovl_u8(v_src.val[1]); - v_t2 = vmovl_u8(v_src.val[2]); - } - else - { - uint8x8x4_t v_src = vld4_u8(src); - v_t0 = vmovl_u8(v_src.val[0]); - v_t1 = vmovl_u8(v_src.val[1]); - v_t2 = vmovl_u8(v_src.val[2]); - } - - float32x4x3_t v_dst; - v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv); - v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); - v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); - vst3q_f32(buf + j, v_dst); - - v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv); - v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); - v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); - vst3q_f32(buf + j + 12, v_dst); - } - #elif CV_SSE2 - if (scn == 3 && haveSIMD) - { - for ( ; j <= (dn * 3 - 16); j += 16, src += 16) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)src); - - __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); - _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); - _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); - - v_src_p = _mm_unpackhi_epi8(v_src, v_zero); - _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); - _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); - } - - int jr = j % 3; - if (jr) - src -= jr, j -= jr; - } - else if (scn == 4 && haveSIMD) - { - for ( ; j <= (dn * 3 - 12); j += 12, src += 16) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)src); - - __m128i v_src_lo = _mm_unpacklo_epi8(v_src, v_zero); - __m128i v_src_hi = _mm_unpackhi_epi8(v_src, v_zero); - _mm_storeu_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_lo, v_zero)), v_scale_inv)); - _mm_storeu_ps(buf + j + 3, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_lo, v_zero)), v_scale_inv)); - _mm_storeu_ps(buf + j + 6, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_hi, v_zero)), v_scale_inv)); - float tmp = buf[j + 8]; - _mm_storeu_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_unpackhi_epi16(v_src_hi, v_zero), 0x90)), v_scale_inv)); - buf[j + 8] = tmp; - } - - int jr = j % 3; - if (jr) - src -= jr, j -= jr; - } - #endif - for( ; j < dn*3; j += 3, src += scn ) - { - buf[j] = src[0]*(1.f/255.f); - buf[j+1] = src[1]*(1.f/255.f); - buf[j+2] = src[2]*(1.f/255.f); - } - cvt(buf, buf, dn); - - j = 0; - #if CV_NEON - for ( ; j <= (dn - 8) * 3; j += 24) - { - float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); - - uint8x8x3_t v_dst; - v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])), - vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0])))); - v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), - vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); - v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), - vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); - vst3_u8(dst + j, v_dst); - } - #elif CV_SSE2 - if (haveSIMD) - { - for ( ; j <= (dn - 16) * 3; j += 48) - { - process(buf + j, - v_coeffs, dst + j); - - process(buf + j + 16, - v_coeffs, dst + j + 16); - - process(buf + j + 32, - v_coeffs, dst + j + 32); - } - } - #endif - for( ; j < dn*3; j += 3 ) - { - dst[j] = saturate_cast(buf[j]); - dst[j+1] = saturate_cast(buf[j+1]*255.f); - dst[j+2] = saturate_cast(buf[j+2]*255.f); - } - } - } - - int srccn; - RGB2HLS_f cvt; - #if CV_NEON - float32x4_t v_scale, v_scale_inv; - uint8x8_t v_alpha; - #elif CV_SSE2 - __m128 v_scale_inv; - __m128i v_zero; - bool haveSIMD; - #endif -}; - - -struct HLS2RGB_f -{ - typedef float channel_type; - - HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange) - : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) { - #if CV_SIMD128 - hasSIMD = hasSIMD128(); - #endif - } - - #if CV_SIMD128 - inline void process(v_float32x4& v_h, v_float32x4& v_l, v_float32x4& v_s) const - { - v_float32x4 v_one = v_setall_f32(1.0f); - - v_float32x4 v_l_le_half = v_l <= v_setall_f32(0.5f); - v_float32x4 v_ls = v_l * v_s; - v_float32x4 v_elem0 = v_select(v_l_le_half, v_ls, v_s - v_ls); - - v_float32x4 v_hs_raw = v_h * v_setall_f32(hscale); - v_float32x4 v_pre_hs = v_cvt_f32(v_trunc(v_hs_raw)); - v_float32x4 v_hs = v_hs_raw - v_pre_hs; - v_float32x4 v_sector = v_pre_hs - v_setall_f32(6.0f) * v_cvt_f32(v_trunc(v_hs_raw * v_setall_f32(1.0f / 6.0f))); - v_float32x4 v_elem1 = v_hs + v_hs; - - v_float32x4 v_tab0 = v_l + v_elem0; - v_float32x4 v_tab1 = v_l - v_elem0; - v_float32x4 v_tab2 = v_l + v_elem0 - v_elem0 * v_elem1; - v_float32x4 v_tab3 = v_l - v_elem0 + v_elem0 * v_elem1; - - v_float32x4 v_two = v_setall_f32(2.0f); - v_float32x4 v_four = v_setall_f32(4.0f); - - v_h = v_select(v_sector < v_two , v_tab1, - v_select(v_sector <= v_two , v_tab3, - v_select(v_sector <= v_four, v_tab0, v_tab2))); - - v_l = v_select(v_sector < v_one , v_tab3, - v_select(v_sector <= v_two , v_tab0, - v_select(v_sector < v_four, v_tab2, v_tab1))); - - v_s = v_select(v_sector < v_one , v_tab0, - v_select(v_sector < v_two , v_tab2, - v_select(v_sector < v_four, v_tab1, - v_select(v_sector <= v_four, v_tab3, v_tab0)))); - } - #endif - - void operator()(const float* src, float* dst, int n) const - { - int i = 0, bidx = blueIdx, dcn = dstcn; - float alpha = ColorChannel::max(); - n *= 3; - - #if CV_SIMD128 - if (hasSIMD) - { - if (dcn == 3) - { - if (bidx) - { - for (; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_h; - v_float32x4 v_l; - v_float32x4 v_s; - v_load_deinterleave(src + i, v_h, v_l, v_s); - process(v_h, v_l, v_s); - v_store_interleave(dst, v_s, v_l, v_h); - } - } else { - for (; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_h; - v_float32x4 v_l; - v_float32x4 v_s; - v_load_deinterleave(src + i, v_h, v_l, v_s); - process(v_h, v_l, v_s); - v_store_interleave(dst, v_h, v_l, v_s); - } - } - } else { // dcn == 4 - if (bidx) - { - for (; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_h; - v_float32x4 v_l; - v_float32x4 v_s; - v_load_deinterleave(src + i, v_h, v_l, v_s); - process(v_h, v_l, v_s); - v_float32x4 v_a = v_setall_f32(alpha); - v_store_interleave(dst, v_s, v_l, v_h, v_a); - } - } else { - for (; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_h; - v_float32x4 v_l; - v_float32x4 v_s; - v_load_deinterleave(src + i, v_h, v_l, v_s); - process(v_h, v_l, v_s); - v_float32x4 v_a = v_setall_f32(alpha); - v_store_interleave(dst, v_h, v_l, v_s, v_a); - } - } - } - } - #endif - - for( ; i < n; i += 3, dst += dcn ) - { - float h = src[i], l = src[i+1], s = src[i+2]; - float b, g, r; - - if( s == 0 ) - b = g = r = l; - else - { - static const int sector_data[][3]= - {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - float tab[4]; - int sector; - - float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s; - float p1 = 2*l - p2; - - h *= hscale; - if( h < 0 ) - do h += 6; while( h < 0 ); - else if( h >= 6 ) - do h -= 6; while( h >= 6 ); - - assert( 0 <= h && h < 6 ); - sector = cvFloor(h); - h -= sector; - - tab[0] = p2; - tab[1] = p1; - tab[2] = p1 + (p2 - p1)*(1-h); - tab[3] = p1 + (p2 - p1)*h; - - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; - } - - dst[bidx] = b; - dst[1] = g; - dst[bidx^2] = r; - if( dcn == 4 ) - dst[3] = alpha; - } - } - - int dstcn, blueIdx; - float hscale; - #if CV_SIMD128 - bool hasSIMD; - #endif -}; - - -struct HLS2RGB_b -{ - typedef uchar channel_type; - - HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange) - : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange) - { - #if CV_NEON - v_scale_inv = vdupq_n_f32(1.f/255.f); - v_scale = vdupq_n_f32(255.f); - v_alpha = vdup_n_u8(ColorChannel::max()); - #elif CV_SSE2 - v_scale = _mm_set1_ps(255.f); - v_alpha = _mm_set1_ps(ColorChannel::max()); - v_zero = _mm_setzero_si128(); - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); - #endif - } - - #if CV_SSE2 - void process(__m128i v_r, __m128i v_g, __m128i v_b, - const __m128& v_coeffs_, - float * buf) const - { - __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero)); - __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero)); - __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero)); - - __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero)); - __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero)); - __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero)); - - __m128 v_coeffs = v_coeffs_; - - v_r0 = _mm_mul_ps(v_r0, v_coeffs); - v_g1 = _mm_mul_ps(v_g1, v_coeffs); - - v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49)); - - v_r1 = _mm_mul_ps(v_r1, v_coeffs); - v_b0 = _mm_mul_ps(v_b0, v_coeffs); - - v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49)); - - v_g0 = _mm_mul_ps(v_g0, v_coeffs); - v_b1 = _mm_mul_ps(v_b1, v_coeffs); - - _mm_store_ps(buf, v_r0); - _mm_store_ps(buf + 4, v_r1); - _mm_store_ps(buf + 8, v_g0); - _mm_store_ps(buf + 12, v_g1); - _mm_store_ps(buf + 16, v_b0); - _mm_store_ps(buf + 20, v_b1); - } - #endif - - void operator()(const uchar* src, uchar* dst, int n) const - { - int i, j, dcn = dstcn; - uchar alpha = ColorChannel::max(); - float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; - #if CV_SSE2 - __m128 v_coeffs = _mm_set_ps(1.f, 1.f/255.f, 1.f/255.f, 1.f); - #endif - - for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) - { - int dn = std::min(n - i, (int)BLOCK_SIZE); - j = 0; - - #if CV_NEON - for ( ; j <= (dn - 8) * 3; j += 24) - { - uint8x8x3_t v_src = vld3_u8(src + j); - uint16x8_t v_t0 = vmovl_u8(v_src.val[0]), - v_t1 = vmovl_u8(v_src.val[1]), - v_t2 = vmovl_u8(v_src.val[2]); - - float32x4x3_t v_dst; - v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))); - v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); - v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); - vst3q_f32(buf + j, v_dst); - - v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))); - v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); - v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); - vst3q_f32(buf + j + 12, v_dst); - } - #elif CV_SSE2 - if (haveSIMD) - { - for ( ; j <= (dn - 8) * 3; j += 24) - { - __m128i v_src0 = _mm_loadu_si128((__m128i const *)(src + j)); - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src + j + 16)); - - process(_mm_unpacklo_epi8(v_src0, v_zero), - _mm_unpackhi_epi8(v_src0, v_zero), - _mm_unpacklo_epi8(v_src1, v_zero), - v_coeffs, - buf + j); - } - } - #endif - for( ; j < dn*3; j += 3 ) - { - buf[j] = src[j]; - buf[j+1] = src[j+1]*(1.f/255.f); - buf[j+2] = src[j+2]*(1.f/255.f); - } - cvt(buf, buf, dn); - - j = 0; - #if CV_NEON - for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8) - { - float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); - uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), - vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); - uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), - vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); - uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), - vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); - - if (dcn == 4) - { - uint8x8x4_t v_dst; - v_dst.val[0] = v_dst0; - v_dst.val[1] = v_dst1; - v_dst.val[2] = v_dst2; - v_dst.val[3] = v_alpha; - vst4_u8(dst, v_dst); - } - else - { - uint8x8x3_t v_dst; - v_dst.val[0] = v_dst0; - v_dst.val[1] = v_dst1; - v_dst.val[2] = v_dst2; - vst3_u8(dst, v_dst); - } - } - #elif CV_SSE2 - if (dcn == 3 && haveSIMD) - { - for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) - { - __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); - __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); - __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); - __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale); - - __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), - _mm_cvtps_epi32(v_src1)); - __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2), - _mm_cvtps_epi32(v_src3)); - - _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); - } - - int jr = j % 3; - if (jr) - dst -= jr, j -= jr; - } - else if (dcn == 4 && haveSIMD) - { - for ( ; j <= (dn * 3 - 12); j += 12, dst += 16) - { - __m128 v_buf0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); - __m128 v_buf1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); - __m128 v_buf2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); - - __m128 v_ba0 = _mm_unpackhi_ps(v_buf0, v_alpha); - __m128 v_ba1 = _mm_unpacklo_ps(v_buf2, v_alpha); - - __m128i v_src0 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf0, v_ba0, 0x44)); - __m128i v_src1 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba0, v_buf1, 0x4e)), 0x78); - __m128i v_src2 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf1, v_ba1, 0x4e)); - __m128i v_src3 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba1, v_buf2, 0xee)), 0x78); - - __m128i v_dst0 = _mm_packs_epi32(v_src0, v_src1); - __m128i v_dst1 = _mm_packs_epi32(v_src2, v_src3); - - _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); - } +#include "color.hpp" - int jr = j % 3; - if (jr) - dst -= jr, j -= jr; - } - #endif +#include "color_hsv.simd.hpp" +#include "color_hsv.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content - for( ; j < dn*3; j += 3, dst += dcn ) - { - dst[0] = saturate_cast(buf[j]*255.f); - dst[1] = saturate_cast(buf[j+1]*255.f); - dst[2] = saturate_cast(buf[j+2]*255.f); - if( dcn == 4 ) - dst[3] = alpha; - } - } - } - - int dstcn; - HLS2RGB_f cvt; - #if CV_NEON - float32x4_t v_scale, v_scale_inv; - uint8x8_t v_alpha; - #elif CV_SSE2 - __m128 v_scale; - __m128 v_alpha; - __m128i v_zero; - bool haveSIMD; - #endif -}; +namespace cv { // // IPP functions @@ -1302,29 +123,15 @@ void cvtBGRtoHSV(const uchar * src_data, size_t src_step, } #endif - int hrange = depth == CV_32F ? 360 : isFullRange ? 256 : 180; - int blueIdx = swapBlue ? 2 : 0; - if(isHSV) - { - if(depth == CV_8U) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_b(scn, blueIdx, hrange)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_f(scn, blueIdx, static_cast(hrange))); - } - else - { - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_b(scn, blueIdx, hrange)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_f(scn, blueIdx, static_cast(hrange))); - } + CV_CPU_DISPATCH(cvtBGRtoHSV, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV), + CV_CPU_DISPATCH_MODES_ALL); } // 8u, 32f void cvtHSVtoBGR(const uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) { CV_INSTRUMENT_REGION(); @@ -1393,22 +200,8 @@ void cvtHSVtoBGR(const uchar * src_data, size_t src_step, } #endif - int hrange = depth == CV_32F ? 360 : isFullRange ? 255 : 180; - int blueIdx = swapBlue ? 2 : 0; - if(isHSV) - { - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_b(dcn, blueIdx, hrange)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_f(dcn, blueIdx, static_cast(hrange))); - } - else - { - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_b(dcn, blueIdx, hrange)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_f(dcn, blueIdx, static_cast(hrange))); - } + CV_CPU_DISPATCH(cvtHSVtoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isFullRange, isHSV), + CV_CPU_DISPATCH_MODES_ALL); } } // namespace hal diff --git a/modules/imgproc/src/color_hsv.simd.hpp b/modules/imgproc/src/color_hsv.simd.hpp index f0a4c87558..30ae7064bc 100644 --- a/modules/imgproc/src/color_hsv.simd.hpp +++ b/modules/imgproc/src/color_hsv.simd.hpp @@ -3,11 +3,31 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" -#include "color.hpp" +#include "opencv2/core/hal/intrin.hpp" -namespace cv -{ +namespace cv { +namespace hal { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations + +void cvtBGRtoHSV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV); +void cvtHSVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if defined(CV_CPU_BASELINE_MODE) +// included in color.hpp +#else +#include "color.simd_helpers.hpp" +#endif +namespace { ////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// @@ -1192,46 +1212,7 @@ struct HLS2RGB_b #endif }; -// -// IPP functions -// - -#if NEED_IPP - -#if !IPP_DISABLE_RGB_HSV -static ippiGeneralFunc ippiRGB2HSVTab[] = -{ - (ippiGeneralFunc)ippiRGBToHSV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHSV_16u_C3R, 0, - 0, 0, 0, 0 -}; -#endif - -static ippiGeneralFunc ippiHSV2RGBTab[] = -{ - (ippiGeneralFunc)ippiHSVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHSVToRGB_16u_C3R, 0, - 0, 0, 0, 0 -}; - -static ippiGeneralFunc ippiRGB2HLSTab[] = -{ - (ippiGeneralFunc)ippiRGBToHLS_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHLS_16u_C3R, 0, - 0, (ippiGeneralFunc)ippiRGBToHLS_32f_C3R, 0, 0 -}; - -static ippiGeneralFunc ippiHLS2RGBTab[] = -{ - (ippiGeneralFunc)ippiHLSToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHLSToRGB_16u_C3R, 0, - 0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0 -}; - -#endif - -// -// HAL functions -// - -namespace hal -{ +} // namespace anon // 8u, 32f void cvtBGRtoHSV(const uchar * src_data, size_t src_step, @@ -1241,67 +1222,6 @@ void cvtBGRtoHSV(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoHSV, cv_hal_cvtBGRtoHSV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV); - -#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 - CV_IPP_CHECK() - { - if(depth == CV_8U && isFullRange) - { - if (isHSV) - { -#if !IPP_DISABLE_RGB_HSV // breaks OCL accuracy tests - if(scn == 3 && !swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(scn == 4 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(scn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 0, 1, 2, depth)) ) - return; - } -#endif - } - else - { - if(scn == 3 && !swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(scn == 4 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(scn == 3 && swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiRGB2HLSTab[depth])) ) - return; - } - else if(scn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 0, 1, 2, depth)) ) - return; - } - } - } - } -#endif - int hrange = depth == CV_32F ? 360 : isFullRange ? 256 : 180; int blueIdx = swapBlue ? 2 : 0; if(isHSV) @@ -1322,77 +1242,12 @@ void cvtBGRtoHSV(const uchar * src_data, size_t src_step, // 8u, 32f void cvtHSVtoBGR(const uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtHSVtoBGR, cv_hal_cvtHSVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isFullRange, isHSV); - -#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 - CV_IPP_CHECK() - { - if (depth == CV_8U && isFullRange) - { - if (isHSV) - { - if(dcn == 3 && !swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, - IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(dcn == 4 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(dcn == 3 && swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiHSV2RGBTab[depth])) ) - return; - } - else if(dcn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) ) - return; - } - } - else - { - if(dcn == 3 && !swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, - IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(dcn == 4 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(dcn == 3 && swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiHLS2RGBTab[depth])) ) - return; - } - else if(dcn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) ) - return; - } - } - } - } -#endif - int hrange = depth == CV_32F ? 360 : isFullRange ? 255 : 180; int blueIdx = swapBlue ? 2 : 0; if(isHSV) @@ -1411,155 +1266,6 @@ void cvtHSVtoBGR(const uchar * src_data, size_t src_step, } } -} // namespace hal - -// -// OCL calls -// - -#ifdef HAVE_OPENCL - -bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ) -{ - OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255); - - if(!h.createKernel("HSV2RGB", ocl::imgproc::color_hsv_oclsrc, - format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ) -{ - OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255); - - if(!h.createKernel("HLS2RGB", ocl::imgproc::color_hsv_oclsrc, - format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full ) -{ - OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - float hscale = (_src.depth() == CV_32F ? 360.f : (!full ? 180.f : 256.f))/360.f; - - if(!h.createKernel("RGB2HLS", ocl::imgproc::color_hsv_oclsrc, - format("-D hscale=%ff -D bidx=%d -D dcn=3", hscale, bidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full ) -{ - OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 256); - - cv::String options = (_src.depth() == CV_8U ? - format("-D hrange=%d -D bidx=%d -D dcn=3", hrange, bidx) : - format("-D hscale=%ff -D bidx=%d -D dcn=3", hrange*(1.f/360.f), bidx)); - - if(!h.createKernel("RGB2HSV", ocl::imgproc::color_hsv_oclsrc, options)) - { - return false; - } - - if(_src.depth() == CV_8U) - { - static UMat sdiv_data; - static UMat hdiv_data180; - static UMat hdiv_data256; - static int sdiv_table[256]; - static int hdiv_table180[256]; - static int hdiv_table256[256]; - static volatile bool initialized180 = false, initialized256 = false; - volatile bool & initialized = hrange == 180 ? initialized180 : initialized256; - - if (!initialized) - { - int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12; - UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256; - - sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0; - - int v = 255 << hsv_shift; - if (!initialized180 && !initialized256) - { - for(int i = 1; i < 256; i++ ) - sdiv_table[i] = saturate_cast(v/(1.*i)); - Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data); - } - - v = hrange << hsv_shift; - for (int i = 1; i < 256; i++ ) - hdiv_table[i] = saturate_cast(v/(6.*i)); - - Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data); - initialized = true; - } - - h.setArg(ocl::KernelArg::PtrReadOnly(sdiv_data)); - h.setArg(hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) : - ocl::KernelArg::PtrReadOnly(hdiv_data180)); - } - - return h.run(); -} - #endif - -// -// HAL calls -// - -void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ) -{ - CvtHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - hal::cvtBGRtoHSV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, h.scn, swapb, fullRange, false); -} - -void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ) -{ - CvtHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - hal::cvtBGRtoHSV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, h.scn, swapb, fullRange, true); -} - -void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtHSVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, dcn, swapb, fullRange, false); -} - -void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtHSVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, dcn, swapb, fullRange, true); -} - - -} // namespace cv +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace diff --git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp index 0fff89358c..cb5c0fdf53 100644 --- a/modules/imgproc/src/color_lab.cpp +++ b/modules/imgproc/src/color_lab.cpp @@ -9,6 +9,10 @@ \**********************************************************************************/ #include "precomp.hpp" +#include "opencl_kernels_imgproc.hpp" +#include "opencv2/core/hal/intrin.hpp" +#include "opencv2/core/softfloat.hpp" + #include "color.hpp" using cv::softfloat; diff --git a/modules/imgproc/src/color_rgb.dispatch.cpp b/modules/imgproc/src/color_rgb.dispatch.cpp index 9245f26d05..ed2961f0fb 100644 --- a/modules/imgproc/src/color_rgb.dispatch.cpp +++ b/modules/imgproc/src/color_rgb.dispatch.cpp @@ -3,1047 +3,16 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" -#include "color.hpp" - -#define IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 1 - -namespace cv -{ - -////////////////// Various 3/4-channel to 3/4-channel RGB transformations ///////////////// - -template struct v_type; - -template<> -struct v_type{ - typedef v_uint8 t; -}; - -template<> -struct v_type{ - typedef v_uint16 t; -}; - -template<> -struct v_type{ - typedef v_float32 t; -}; - -template struct v_set; - -template<> -struct v_set -{ - static inline v_type::t set(uchar x) - { - return vx_setall_u8(x); - } -}; - -template<> -struct v_set -{ - static inline v_type::t set(ushort x) - { - return vx_setall_u16(x); - } -}; - -template<> -struct v_set -{ - static inline v_type::t set(float x) - { - return vx_setall_f32(x); - } -}; - -template -struct RGB2RGB -{ - typedef _Tp channel_type; - typedef typename v_type<_Tp>::t vt; - - RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : - srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) - { - CV_Assert(srccn == 3 || srccn == 4); - CV_Assert(dstcn == 3 || dstcn == 4); - } - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int scn = srccn, dcn = dstcn, bi = blueIdx; - int i = 0; - _Tp alphav = ColorChannel<_Tp>::max(); - -#if CV_SIMD - const int vsize = vt::nlanes; - - for(; i <= n-vsize; - i += vsize, src += vsize*scn, dst += vsize*dcn) - { - vt a, b, c, d; - if(scn == 4) - { - v_load_deinterleave(src, a, b, c, d); - } - else - { - v_load_deinterleave(src, a, b, c); - d = v_set<_Tp>::set(alphav); - } - if(bi == 2) - swap(a, c); - - if(dcn == 4) - { - v_store_interleave(dst, a, b, c, d); - } - else - { - v_store_interleave(dst, a, b, c); - } - } - vx_cleanup(); -#endif - for ( ; i < n; i++, src += scn, dst += dcn ) - { - _Tp t0 = src[0], t1 = src[1], t2 = src[2]; - dst[bi ] = t0; - dst[1] = t1; - dst[bi^2] = t2; - if(dcn == 4) - { - _Tp d = scn == 4 ? src[3] : alphav; - dst[3] = d; - } - } - } - - int srccn, dstcn, blueIdx; -}; - - -/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB ////////// - -struct RGB5x52RGB -{ - typedef uchar channel_type; - - RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits) - : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits) - { } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx, gb = greenBits; - int i = 0; - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 vz = vx_setzero_u8(), vn0 = vx_setall_u8(255); - for(; i <= n-vsize; - i += vsize, src += vsize*sizeof(ushort), dst += vsize*dcn) - { - v_uint16 t0 = v_reinterpret_as_u16(vx_load(src)); - v_uint16 t1 = v_reinterpret_as_u16(vx_load(src + - sizeof(ushort)*v_uint16::nlanes)); - - //TODO: shorten registers use when v_interleave is available - v_uint8 r, g, b, a; - v_uint16 b0 = (t0 << 11) >> 8; - v_uint16 b1 = (t1 << 11) >> 8; - b = v_pack(b0, b1); - - v_uint16 g0, g1, r0, r1, a0, a1; - - if( gb == 6 ) - { - g0 = ((t0 >> 5) << 10) >> 8; - g1 = ((t1 >> 5) << 10) >> 8; - - r0 = (t0 >> 11) << 3; - r1 = (t1 >> 11) << 3; - - a = vn0; - } - else - { - g0 = ((t0 >> 5) << 11) >> 8; - g1 = ((t1 >> 5) << 11) >> 8; - - r0 = ((t0 >> 10) << 11) >> 8; - r1 = ((t1 >> 10) << 11) >> 8; - - a0 = t0 >> 15; - a1 = t1 >> 15; - a = v_pack(a0, a1); - a = a != vz; - } - g = v_pack(g0, g1); - r = v_pack(r0, r1); - - if(bidx == 2) - swap(b, r); - - if(dcn == 4) - { - v_store_interleave(dst, b, g, r, a); - } - else - { - v_store_interleave(dst, b, g, r); - } - } - vx_cleanup(); -#endif - - for( ; i < n; i++, src += sizeof(ushort), dst += dcn ) - { - unsigned t = ((const ushort*)src)[0]; - uchar b, g, r, a; - - b = (uchar)(t << 3); - - if( gb == 6 ) - { - g = (uchar)((t >> 3) & ~3); - r = (uchar)((t >> 8) & ~7); - a = 255; - } - else - { - g = (uchar)((t >> 2) & ~7); - r = (uchar)((t >> 7) & ~7); - a = (uchar)(((t & 0x8000) >> 15) * 255); - } - - dst[bidx] = b; - dst[1] = g; - dst[bidx ^ 2] = r; - if( dcn == 4 ) - dst[3] = a; - } - } - - int dstcn, blueIdx, greenBits; -}; - - -struct RGB2RGB5x5 -{ - typedef uchar channel_type; - - RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits) - : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits) - { } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int scn = srccn, bidx = blueIdx, gb = greenBits; - int i = 0; - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint16 vn3 = vx_setall_u16((ushort)(~3)); - v_uint16 vn7 = vx_setall_u16((ushort)(~7)); - v_uint16 vz = vx_setzero_u16(); - v_uint8 v7 = vx_setall_u8((uchar)(~7)); - for(; i <= n-vsize; - i += vsize, src += vsize*scn, dst += vsize*sizeof(ushort)) - { - v_uint8 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - a = vx_setzero_u8(); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - if(bidx == 2) - swap(b, r); - - r = r & v7; - - //TODO: shorten registers use when v_deinterleave is available - v_uint16 r0, r1, g0, g1, b0, b1, a0, a1; - v_expand(r, r0, r1); - v_expand(g, g0, g1); - v_expand(b, b0, b1); - v_expand(a, a0, a1); - - v_uint16 d0, d1; - - b0 = b0 >> 3; - b1 = b1 >> 3; - a0 = (a0 != vz) << 15; - a1 = (a1 != vz) << 15; - - if(gb == 6) - { - d0 = b0 | ((g0 & vn3) << 3) | (r0 << 8); - d1 = b1 | ((g1 & vn3) << 3) | (r1 << 8); - } - else - { - d0 = b0 | ((g0 & vn7) << 2) | (r0 << 7) | a0; - d1 = b1 | ((g1 & vn7) << 2) | (r1 << 7) | a1; - } - - v_store((ushort*)dst, d0); - v_store(((ushort*)dst) + vsize/2, d1); - } - vx_cleanup(); -#endif - for ( ; i < n; i++, src += scn, dst += sizeof(ushort) ) - { - uchar r = src[bidx^2]; - uchar g = src[1]; - uchar b = src[bidx]; - uchar a = scn == 4 ? src[3] : 0; - - ushort d; - if (gb == 6) - { - d = (ushort)((b >> 3)|((g & ~3) << 3)|((r & ~7) << 8)); - } - else - { - d = (ushort)((b >> 3)|((g & ~7) << 2)|((r & ~7) << 7)|(a ? 0x8000 : 0)); - } - ((ushort*)dst)[0] = d; - } - } - - int srccn, blueIdx, greenBits; -}; - - -///////////////////////////////// Color to/from Grayscale //////////////////////////////// - -template -struct Gray2RGB -{ - typedef _Tp channel_type; - typedef typename v_type<_Tp>::t vt; - - Gray2RGB(int _dstcn) : dstcn(_dstcn) {} - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int dcn = dstcn; - int i = 0; - _Tp alpha = ColorChannel<_Tp>::max(); - -#if CV_SIMD - const int vsize = vt::nlanes; - vt valpha = v_set<_Tp>::set(alpha); - for(; i <= n-vsize; - i += vsize, src += vsize, dst += vsize*dcn) - { - vt g = vx_load(src); - - if(dcn == 3) - { - v_store_interleave(dst, g, g, g); - } - else - { - v_store_interleave(dst, g, g, g, valpha); - } - } - vx_cleanup(); -#endif - for ( ; i < n; i++, src++, dst += dcn ) - { - dst[0] = dst[1] = dst[2] = src[0]; - if(dcn == 4) - dst[3] = alpha; - } - } - - int dstcn; -}; - - -struct Gray2RGB5x5 -{ - typedef uchar channel_type; - - Gray2RGB5x5(int _greenBits) : greenBits(_greenBits) - { } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int gb = greenBits; - int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; - v_uint16 v3 = vx_setall_u16((ushort)(~3)); - for(; i <= n-vsize; - i += vsize, src += vsize, dst += vsize*sizeof(ushort)) - { - v_uint8 t8 = vx_load_low(src); - v_uint16 t = v_expand_low(t8); - - v_uint16 t3 = t >> 3; - - v_uint16 d = t3; - if(gb == 6) - { - d |= ((t & v3) << 3) | (t3 << 11); - } - else - { - d |= (t3 << 5) | (t3 << 10); - } - - v_store((ushort*)dst, d); - } - vx_cleanup(); -#endif - - for( ; i < n; i++, src++, dst += sizeof(ushort)) - { - int t = src[0]; - int t3 = t >> 3; - ushort d; - if( gb == 6 ) - { - d = (ushort)(t3 |((t & ~3) << 3)|(t3 << 11)); - } - else - { - d = (ushort)(t3 |(t3 << 5)|(t3 << 10)); - } - ((ushort*)dst)[0] = d; - } - } - int greenBits; -}; - - -struct RGB5x52Gray -{ - typedef uchar channel_type; +#include "opencl_kernels_imgproc.hpp" - // can be changed to 15-shift coeffs - static const int BY = B2Y; - static const int GY = G2Y; - static const int RY = R2Y; - static const int shift = yuv_shift; - - RGB5x52Gray(int _greenBits) : greenBits(_greenBits) - { - CV_Assert(BY + GY + RY == (1 << shift)); - } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int gb = greenBits; - int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; - - v_int16 bg2y; - v_int16 r12y; - v_int16 dummy; - v_zip(vx_setall_s16(BY), vx_setall_s16(GY), bg2y, dummy); - v_zip(vx_setall_s16(RY), vx_setall_s16( 1), r12y, dummy); - v_int16 delta = vx_setall_s16(1 << (shift-1)); - - for(; i <= n-vsize; - i += vsize, src += vsize*sizeof(ushort), dst += vsize) - { - v_uint16 t = vx_load((ushort*)src); - - v_uint16 r, g, b; - b = (t << 11) >> 8; - - if(gb == 5) - { - g = ((t >> 5) << 11) >> 8; - r = ((t >> 10) << 11) >> 8; - } - else - { - g = ((t >> 5) << 10) >> 8; - r = (t >> 11) << 3; - } - - v_uint8 d; - v_uint16 dx; - - v_int16 sr = v_reinterpret_as_s16(r); - v_int16 sg = v_reinterpret_as_s16(g); - v_int16 sb = v_reinterpret_as_s16(b); - - v_int16 bg0, bg1; - v_int16 rd0, rd1; - v_zip(sb, sg, bg0, bg1); - v_zip(sr, delta, rd0, rd1); - - v_uint32 d0, d1; - d0 = v_reinterpret_as_u32(v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)); - d1 = v_reinterpret_as_u32(v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)); - - d0 = d0 >> shift; - d1 = d1 >> shift; - - dx = v_pack(d0, d1); - // high part isn't used - d = v_pack(dx, dx); - - v_store_low(dst, d); - } - vx_cleanup(); -#endif - for( ; i < n; i++, src += sizeof(ushort), dst++) - { - int t = ((ushort*)src)[0]; - uchar r, g, b; - b = (t << 3) & 0xf8; - if( gb == 6 ) - { - g = (t >> 3) & 0xfc; - r = (t >> 8) & 0xf8; - } - else - { - g = (t >> 2) & 0xf8; - r = (t >> 7) & 0xf8; - } - dst[0] = (uchar)CV_DESCALE(b*BY + g*GY + r*RY, shift); - } - } - int greenBits; -}; - - -template struct RGB2Gray -{ - typedef _Tp channel_type; - - RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) - { - static const float coeffs0[] = { R2YF, G2YF, B2YF }; - memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) ); - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int scn = srccn; - float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - for(int i = 0; i < n; i++, src += scn) - dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr); - } - int srccn; - float coeffs[3]; -}; - - -template <> -struct RGB2Gray -{ - typedef float channel_type; - - RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) - { - static const float coeffs0[] = { R2YF, G2YF, B2YF }; - for(int i = 0; i < 3; i++) - { - coeffs[i] = _coeffs ? _coeffs[i] : coeffs0[i]; - } - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const float * src, float * dst, int n) const - { - int scn = srccn, i = 0; - float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - -#if CV_SIMD - const int vsize = v_float32::nlanes; - v_float32 rv = vx_setall_f32(cr), gv = vx_setall_f32(cg), bv = vx_setall_f32(cb); - for(; i <= n-vsize; - i += vsize, src += vsize*scn, dst += vsize) - { - v_float32 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - - v_float32 d = v_fma(r, rv, v_fma(g, gv, b*bv)); - - v_store(dst, d); - } - vx_cleanup(); -#endif - - for ( ; i < n; i++, src += scn, dst++) - dst[0] = src[0]*cb + src[1]*cg + src[2]*cr; - } - - int srccn; - float coeffs[3]; -}; - -template<> -struct RGB2Gray -{ - typedef uchar channel_type; - - // can be changed to 15-shift coeffs - static const int BY = B2Y; - static const int GY = G2Y; - static const int RY = R2Y; - static const int shift = yuv_shift; - - RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn) - { - const int coeffs0[] = { RY, GY, BY }; - for(int i = 0; i < 3; i++) - coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]); - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - - CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift)); - } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int scn = srccn; - short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - int i = 0; - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_int16 bg2y; - v_int16 r12y; - v_int16 dummy; - v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy); - v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy); - v_int16 delta = vx_setall_s16(1 << (shift-1)); - - for( ; i <= n-vsize; - i += vsize, src += scn*vsize, dst += vsize) - { - v_uint8 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - - //TODO: shorten registers use when v_deinterleave is available - - v_uint16 r0, r1, g0, g1, b0, b1; - v_expand(r, r0, r1); - v_expand(g, g0, g1); - v_expand(b, b0, b1); - - v_int16 bg00, bg01, bg10, bg11; - v_int16 rd00, rd01, rd10, rd11; - v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01); - v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11); - v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01); - v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11); - - v_uint32 y00, y01, y10, y11; - y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; - y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; - y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; - y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; - - v_uint16 y0, y1; - y0 = v_pack(y00, y01); - y1 = v_pack(y10, y11); - - v_uint8 y = v_pack(y0, y1); - v_store(dst, y); - } - vx_cleanup(); -#endif - - for( ; i < n; i++, src += scn, dst++) - { - int b = src[0], g = src[1], r = src[2]; - uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift); - dst[0] = y; - } - } - - int srccn; - short coeffs[3]; -}; - - -template<> -struct RGB2Gray -{ - typedef ushort channel_type; - - // can be changed to 15-shift coeffs - static const int BY = B2Y; - static const int GY = G2Y; - static const int RY = R2Y; - static const int shift = yuv_shift; - static const int fix_shift = (int)(sizeof(short)*8 - shift); - - RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn) - { - const int coeffs0[] = { RY, GY, BY }; - for(int i = 0; i < 3; i++) - coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]); - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - - CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift)); - } - - void operator()(const ushort* src, ushort* dst, int n) const - { - int scn = srccn; - short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - int i = 0; - -#if CV_SIMD - const int vsize = v_uint16::nlanes; - - v_int16 b2y = vx_setall_s16(cb); - v_int16 g2y = vx_setall_s16(cg); - v_int16 r2y = vx_setall_s16(cr); - v_int16 one = vx_setall_s16(1); - v_int16 z = vx_setzero_s16(); - - v_int16 bg2y, r12y; - v_int16 dummy; - v_zip(b2y, g2y, bg2y, dummy); - v_zip(r2y, one, r12y, dummy); - - v_int16 delta = vx_setall_s16(1 << (shift-1)); - - for( ; i <= n-vsize; - i += vsize, src += scn*vsize, dst += vsize) - { - v_uint16 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - - v_int16 sb = v_reinterpret_as_s16(b); - v_int16 sr = v_reinterpret_as_s16(r); - v_int16 sg = v_reinterpret_as_s16(g); - - v_int16 bg0, bg1; - v_int16 rd0, rd1; - v_zip(sb, sg, bg0, bg1); - v_zip(sr, delta, rd0, rd1); - - // fixing 16bit signed multiplication - v_int16 mr, mg, mb; - mr = (sr < z) & r2y; - mg = (sg < z) & g2y; - mb = (sb < z) & b2y; - v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; - - v_int32 sy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; - v_int32 sy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; - - v_int16 y = v_add_wrap(v_pack(sy0, sy1), fixmul); - - v_store((short*)dst, y); - } - vx_cleanup(); -#endif - for( ; i < n; i++, src += scn, dst++) - { - int b = src[0], g = src[1], r = src[2]; - ushort d = (ushort)CV_DESCALE((unsigned)(b*cb + g*cg + r*cr), shift); - dst[0] = d; - } - } - - int srccn; - short coeffs[3]; -}; - - -/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) ////////////// - -template -struct RGBA2mRGBA -{ - typedef _Tp channel_type; - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - _Tp max_val = ColorChannel<_Tp>::max(); - _Tp half_val = ColorChannel<_Tp>::half(); - for( int i = 0; i < n; i++ ) - { - _Tp v0 = *src++; - _Tp v1 = *src++; - _Tp v2 = *src++; - _Tp v3 = *src++; - - *dst++ = (v0 * v3 + half_val) / max_val; - *dst++ = (v1 * v3 + half_val) / max_val; - *dst++ = (v2 * v3 + half_val) / max_val; - *dst++ = v3; - } - } -}; - - -template<> -struct RGBA2mRGBA -{ - typedef uchar channel_type; - - void operator()(const uchar* src, uchar* dst, int n) const - { - const uchar max_val = 255; - const uchar half_val = 128; - - int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); - v_uint16 vh = vx_setall_u16(half_val+1); - - // processing 4 registers per loop cycle is about 10% faster - // than processing 1 register - for( ; i <= n-vsize; - i += vsize, src += 4*vsize, dst += 4*vsize) - { - v_uint8 v[4]; - for(int j = 0; j < 4; j++) - v[j] = vx_load(src + j*vsize); - - // r0,g0,b0,a0,r1,g1,b1,a1 => 00,00,00,a0,00,00,00,a1 => - // => 00,00,a0,a0,00,00,a1,a1 - // => a0,a0,a0,a0,a1,a1,a1,a1 - - v_uint16 a16[4]; - for(int j = 0; j < 4; j++) - a16[j] = v_reinterpret_as_u16(v[j] & amask); - - v_uint32 a32[4]; - for(int j = 0; j < 4; j++) - a32[j] = v_reinterpret_as_u32(a16[j] | (a16[j] >> 8)); - - v_uint8 a[4]; - for(int j = 0; j < 4; j++) - a[j] = v_reinterpret_as_u8(a32[j] | (a32[j] >> 16)); - - v_uint16 m[8]; - for(int j = 0; j < 4; j++) - v_mul_expand(v[j], a[j], m[j], m[j+4]); - - for(int j = 0; j < 8; j++) - m[j] += vh; - - // div 255: (v+1+(v>>8))>8 - // +1 is in vh, has no effect on (v>>8) - for(int j = 0; j < 8; j++) - m[j] = (m[j] + (m[j] >> 8)) >> 8; - - v_uint8 d[4]; - for(int j = 0; j < 4; j++) - d[j] = v_pack(m[j], m[j+4]); - - for(int j = 0; j < 4; j++) - d[j] = v_select(amask, a[j], d[j]); - - for(int j = 0; j < 4; j++) - v_store(dst + j*vsize, d[j]); - } - - vx_cleanup(); -#endif - for(; i < n; i++, src += 4, dst += 4 ) - { - uchar v0 = src[0]; - uchar v1 = src[1]; - uchar v2 = src[2]; - uchar v3 = src[3]; - - dst[0] = (v0 * v3 + half_val) / max_val; - dst[1] = (v1 * v3 + half_val) / max_val; - dst[2] = (v2 * v3 + half_val) / max_val; - dst[3] = v3; - } - } -}; - - -template -struct mRGBA2RGBA -{ - typedef _Tp channel_type; - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - _Tp max_val = ColorChannel<_Tp>::max(); - for( int i = 0; i < n; i++ ) - { - _Tp v0 = *src++; - _Tp v1 = *src++; - _Tp v2 = *src++; - _Tp v3 = *src++; - _Tp v3_half = v3 / 2; - - *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v0 * max_val + v3_half) / v3); - *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v1 * max_val + v3_half) / v3); - *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v2 * max_val + v3_half) / v3); - *dst++ = v3; - } - } -}; - - -template<> -struct mRGBA2RGBA -{ - typedef uchar channel_type; - - void operator()(const uchar* src, uchar* dst, int n) const - { - uchar max_val = ColorChannel::max(); - int i = 0; +#include "color.hpp" -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); - v_uint8 vmax = vx_setall_u8(max_val); +#include "color_rgb.simd.hpp" +#include "color_rgb.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content - for( ; i <= n-vsize/4; - i += vsize/4, src += vsize, dst += vsize) - { - v_uint8 s = vx_load(src + 0*vsize); - - // r0,g0,b0,a0,r1,g1,b1,a1 => 00,00,00,a0,00,00,00,a1 => - // => 00,00,a0,a0,00,00,a1,a1 - // => a0,a0,a0,a0,a1,a1,a1,a1 - v_uint8 a; - v_uint16 a16; - v_uint32 a32; - a16 = v_reinterpret_as_u16(s & amask); - a32 = v_reinterpret_as_u32(a16 | (a16 >> 8)); - a = v_reinterpret_as_u8(a32 | (a32 >> 16)); - - // s *= max_val - v_uint16 s0, s1; - v_mul_expand(s, vmax, s0, s1); - - // s += a/2 - v_uint16 ae0, ae1; - v_expand(a, ae0, ae1); - s0 += ae0 >> 1; s1 += ae1 >> 1; - - // s, a -> u32 -> float - v_uint32 u00, u01, u10, u11; - v_int32 s00, s01, s10, s11; - v_expand(s0, u00, u01); - v_expand(s1, u10, u11); - s00 = v_reinterpret_as_s32(u00); - s01 = v_reinterpret_as_s32(u01); - s10 = v_reinterpret_as_s32(u10); - s11 = v_reinterpret_as_s32(u11); - - v_uint32 ua00, ua01, ua10, ua11; - v_int32 a00, a01, a10, a11; - v_expand(ae0, ua00, ua01); - v_expand(ae1, ua10, ua11); - a00 = v_reinterpret_as_s32(ua00); - a01 = v_reinterpret_as_s32(ua01); - a10 = v_reinterpret_as_s32(ua10); - a11 = v_reinterpret_as_s32(ua11); - - v_float32 fs00, fs01, fs10, fs11; - fs00 = v_cvt_f32(s00); - fs01 = v_cvt_f32(s01); - fs10 = v_cvt_f32(s10); - fs11 = v_cvt_f32(s11); - - v_float32 fa00, fa01, fa10, fa11; - fa00 = v_cvt_f32(a00); - fa01 = v_cvt_f32(a01); - fa10 = v_cvt_f32(a10); - fa11 = v_cvt_f32(a11); - - // float d = (float)s/(float)a - v_float32 fd00, fd01, fd10, fd11; - fd00 = fs00/fa00; - fd01 = fs01/fa01; - fd10 = fs10/fa10; - fd11 = fs11/fa11; - - // d -> u32 -> u8 - v_uint32 ud00, ud01, ud10, ud11; - ud00 = v_reinterpret_as_u32(v_trunc(fd00)); - ud01 = v_reinterpret_as_u32(v_trunc(fd01)); - ud10 = v_reinterpret_as_u32(v_trunc(fd10)); - ud11 = v_reinterpret_as_u32(v_trunc(fd11)); - v_uint16 ud0, ud1; - ud0 = v_pack(ud00, ud01); - ud1 = v_pack(ud10, ud11); - v_uint8 d; - d = v_pack(ud0, ud1); - - // if a == 0 then d = 0 - v_uint8 am; - am = a != vx_setzero_u8(); - d = d & am; - - // put alpha values - d = v_select(amask, a, d); - - v_store(dst, d); - } +#define IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 1 - vx_cleanup(); -#endif - for(; i < n; i++, src += 4, dst += 4 ) - { - uchar v0 = src[0]; - uchar v1 = src[1]; - uchar v2 = src[2]; - uchar v3 = src[3]; - - uchar v3_half = v3 / 2; - - dst[0] = (v3==0)? 0 : (v0 * max_val + v3_half) / v3; - dst[1] = (v3==0)? 0 : (v1 * max_val + v3_half) / v3; - dst[2] = (v3==0)? 0 : (v2 * max_val + v3_half) / v3; - dst[3] = v3; - - dst[0] = (v3==0)? 0 : saturate_cast((v0 * max_val + v3_half) / v3); - dst[1] = (v3==0)? 0 : saturate_cast((v1 * max_val + v3_half) / v3); - dst[2] = (v3==0)? 0 : saturate_cast((v2 * max_val + v3_half) / v3); - dst[3] = v3; - } - } -}; +namespace cv { // // IPP functions @@ -1051,25 +20,25 @@ struct mRGBA2RGBA #if NEED_IPP -static ippiColor2GrayFunc ippiColor2GrayC3Tab[] = +static const ippiColor2GrayFunc ippiColor2GrayC3Tab[] = { (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0, 0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0 }; -static ippiColor2GrayFunc ippiColor2GrayC4Tab[] = +static const ippiColor2GrayFunc ippiColor2GrayC4Tab[] = { (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0, 0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0 }; -static ippiGeneralFunc ippiRGB2GrayC3Tab[] = +static const ippiGeneralFunc ippiRGB2GrayC3Tab[] = { (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0, 0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0 }; -static ippiGeneralFunc ippiRGB2GrayC4Tab[] = +static const ippiGeneralFunc ippiRGB2GrayC4Tab[] = { (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0, 0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0 @@ -1208,8 +177,7 @@ static ippiReorderFunc ippiSwapChannelsC4RTab[] = // HAL functions // -namespace hal -{ +namespace hal { // 8u, 16u, 32f void cvtBGRtoBGR(const uchar * src_data, size_t src_step, @@ -1265,13 +233,8 @@ void cvtBGRtoBGR(const uchar * src_data, size_t src_step, #endif #endif - int blueIdx = swapBlue ? 2 : 0; - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB(scn, dcn, blueIdx)); - else if( depth == CV_16U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB(scn, dcn, blueIdx)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB(scn, dcn, blueIdx)); + CV_CPU_DISPATCH(cvtBGRtoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue), + CV_CPU_DISPATCH_MODES_ALL); } // only 8u @@ -1284,7 +247,8 @@ void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, CALL_HAL(cvtBGRtoBGR5x5, cv_hal_cvtBGRtoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB5x5(scn, swapBlue ? 2 : 0, greenBits)); + CV_CPU_DISPATCH(cvtBGRtoBGR5x5, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits), + CV_CPU_DISPATCH_MODES_ALL); } // only 8u @@ -1297,7 +261,8 @@ void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, CALL_HAL(cvtBGR5x5toBGR, cv_hal_cvtBGR5x5toBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52RGB(dcn, swapBlue ? 2 : 0, greenBits)); + CV_CPU_DISPATCH(cvtBGR5x5toBGR, (src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits), + CV_CPU_DISPATCH_MODES_ALL); } // 8u, 16u, 32f @@ -1340,13 +305,8 @@ void cvtBGRtoGray(const uchar * src_data, size_t src_step, } #endif - int blueIdx = swapBlue ? 2 : 0; - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray(scn, blueIdx, 0)); - else if( depth == CV_16U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray(scn, blueIdx, 0)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray(scn, blueIdx, 0)); + CV_CPU_DISPATCH(cvtBGRtoGray, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue), + CV_CPU_DISPATCH_MODES_ALL); } // 8u, 16u, 32f @@ -1390,12 +350,8 @@ void cvtGraytoBGR(const uchar * src_data, size_t src_step, } #endif - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB(dcn)); - else if( depth == CV_16U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB(dcn)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB(dcn)); + CV_CPU_DISPATCH(cvtGraytoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, dcn), + CV_CPU_DISPATCH_MODES_ALL); } // only 8u @@ -1407,7 +363,9 @@ void cvtBGR5x5toGray(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); CALL_HAL(cvtBGR5x5toGray, cv_hal_cvtBGR5x5toGray, src_data, src_step, dst_data, dst_step, width, height, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52Gray(greenBits)); + + CV_CPU_DISPATCH(cvtBGR5x5toGray, (src_data, src_step, dst_data, dst_step, width, height, greenBits), + CV_CPU_DISPATCH_MODES_ALL); } // only 8u @@ -1419,7 +377,9 @@ void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); CALL_HAL(cvtGraytoBGR5x5, cv_hal_cvtGraytoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB5x5(greenBits)); + + CV_CPU_DISPATCH(cvtGraytoBGR5x5, (src_data, src_step, dst_data, dst_step, width, height, greenBits), + CV_CPU_DISPATCH_MODES_ALL); } void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step, @@ -1439,7 +399,8 @@ void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step, } #endif - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGBA2mRGBA()); + CV_CPU_DISPATCH(cvtRGBAtoMultipliedRGBA, (src_data, src_step, dst_data, dst_step, width, height), + CV_CPU_DISPATCH_MODES_ALL); } void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step, @@ -1449,7 +410,9 @@ void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); CALL_HAL(cvtMultipliedRGBAtoRGBA, cv_hal_cvtMultipliedRGBAtoRGBA, src_data, src_step, dst_data, dst_step, width, height); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, mRGBA2RGBA()); + + CV_CPU_DISPATCH(cvtMultipliedRGBAtoRGBA, (src_data, src_step, dst_data, dst_step, width, height), + CV_CPU_DISPATCH_MODES_ALL); } } // namespace hal diff --git a/modules/imgproc/src/color_rgb.simd.hpp b/modules/imgproc/src/color_rgb.simd.hpp index 9245f26d05..76dc4e5e1e 100644 --- a/modules/imgproc/src/color_rgb.simd.hpp +++ b/modules/imgproc/src/color_rgb.simd.hpp @@ -3,13 +3,58 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" -#include "color.hpp" +#include "opencv2/core/hal/intrin.hpp" -#define IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 1 +namespace cv { +namespace hal { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations -namespace cv -{ +void cvtBGRtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, int dcn, bool swapBlue); +void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int greenBits); +void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int dcn, bool swapBlue, int greenBits); +void cvtBGRtoGray(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, bool swapBlue); +void cvtGraytoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn); +void cvtBGR5x5toGray(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int greenBits); +void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int greenBits); +void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height); +void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if defined(CV_CPU_BASELINE_MODE) +// included in color.hpp +#else +#include "color.simd_helpers.hpp" +#endif + +namespace { ////////////////// Various 3/4-channel to 3/4-channel RGB transformations ///////////////// template struct v_type; @@ -1044,172 +1089,7 @@ struct mRGBA2RGBA } } }; - -// -// IPP functions -// - -#if NEED_IPP - -static ippiColor2GrayFunc ippiColor2GrayC3Tab[] = -{ - (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0, - 0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0 -}; - -static ippiColor2GrayFunc ippiColor2GrayC4Tab[] = -{ - (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0, - 0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0 -}; - -static ippiGeneralFunc ippiRGB2GrayC3Tab[] = -{ - (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0, - 0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0 -}; - -static ippiGeneralFunc ippiRGB2GrayC4Tab[] = -{ - (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0, - 0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0 -}; - - -#if !IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 -static IppStatus ippiGrayToRGB_C1C3R(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, IppiSize roiSize) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize); -} -#endif -static IppStatus ippiGrayToRGB_C1C3R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, IppiSize roiSize) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize); -} -static IppStatus ippiGrayToRGB_C1C3R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize); -} - -static IppStatus ippiGrayToRGB_C1C4R(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, IppiSize roiSize, Ipp8u aval) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval); -} -static IppStatus ippiGrayToRGB_C1C4R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, IppiSize roiSize, Ipp16u aval) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval); -} -static IppStatus ippiGrayToRGB_C1C4R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize, Ipp32f aval) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval); -} - -struct IPPColor2GrayFunctor -{ - IPPColor2GrayFunctor(ippiColor2GrayFunc _func) : - ippiColorToGray(_func) - { - coeffs[0] = B2YF; - coeffs[1] = G2YF; - coeffs[2] = R2YF; - } - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - return ippiColorToGray ? CV_INSTRUMENT_FUN_IPP(ippiColorToGray, src, srcStep, dst, dstStep, ippiSize(cols, rows), coeffs) >= 0 : false; - } -private: - ippiColor2GrayFunc ippiColorToGray; - Ipp32f coeffs[3]; -}; - -template -struct IPPGray2BGRFunctor -{ - IPPGray2BGRFunctor(){} - - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - return ippiGrayToRGB_C1C3R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows)) >= 0; - } -}; - -template -struct IPPGray2BGRAFunctor -{ - IPPGray2BGRAFunctor() - { - alpha = ColorChannel::max(); - } - - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - return ippiGrayToRGB_C1C4R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows), alpha) >= 0; - } - - T alpha; -}; - -static IppStatus CV_STDCALL ippiSwapChannels_8u_C3C4Rf(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, - IppiSize roiSize, const int *dstOrder) -{ - return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_8u_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP8u); -} - -static IppStatus CV_STDCALL ippiSwapChannels_16u_C3C4Rf(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, - IppiSize roiSize, const int *dstOrder) -{ - return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_16u_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP16u); -} - -static IppStatus CV_STDCALL ippiSwapChannels_32f_C3C4Rf(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, - IppiSize roiSize, const int *dstOrder) -{ - return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_32f_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP32f); -} - -// shared -ippiReorderFunc ippiSwapChannelsC3C4RTab[] = -{ - (ippiReorderFunc)ippiSwapChannels_8u_C3C4Rf, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3C4Rf, 0, - 0, (ippiReorderFunc)ippiSwapChannels_32f_C3C4Rf, 0, 0 -}; - -static ippiGeneralFunc ippiCopyAC4C3RTab[] = -{ - (ippiGeneralFunc)ippiCopy_8u_AC4C3R, 0, (ippiGeneralFunc)ippiCopy_16u_AC4C3R, 0, - 0, (ippiGeneralFunc)ippiCopy_32f_AC4C3R, 0, 0 -}; - -// shared -ippiReorderFunc ippiSwapChannelsC4C3RTab[] = -{ - (ippiReorderFunc)ippiSwapChannels_8u_C4C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4C3R, 0, - 0, (ippiReorderFunc)ippiSwapChannels_32f_C4C3R, 0, 0 -}; - -// shared -ippiReorderFunc ippiSwapChannelsC3RTab[] = -{ - (ippiReorderFunc)ippiSwapChannels_8u_C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3R, 0, - 0, (ippiReorderFunc)ippiSwapChannels_32f_C3R, 0, 0 -}; - -#if IPP_VERSION_X100 >= 810 -static ippiReorderFunc ippiSwapChannelsC4RTab[] = -{ - (ippiReorderFunc)ippiSwapChannels_8u_C4R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4R, 0, - 0, (ippiReorderFunc)ippiSwapChannels_32f_C4R, 0, 0 -}; -#endif - -#endif - -// -// HAL functions -// - -namespace hal -{ +} // namespace anon // 8u, 16u, 32f void cvtBGRtoBGR(const uchar * src_data, size_t src_step, @@ -1219,52 +1099,6 @@ void cvtBGRtoBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoBGR, cv_hal_cvtBGRtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue); - -#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 - CV_IPP_CHECK() - { - if(scn == 3 && dcn == 4 && !swapBlue) - { - if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) ) - return; - } - else if(scn == 4 && dcn == 3 && !swapBlue) - { - if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) ) - return; - } - else if(scn == 3 && dcn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) ) - return; - } - else if(scn == 4 && dcn == 3 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) ) - return; - } - else if(scn == 3 && dcn == 3 && swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height, - IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) ) - return; - } -#if IPP_VERSION_X100 >= 810 - else if(scn == 4 && dcn == 4 && swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height, - IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) ) - return; - } - } -#endif -#endif - int blueIdx = swapBlue ? 2 : 0; if( depth == CV_8U ) CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB(scn, dcn, blueIdx)); @@ -1282,8 +1116,6 @@ void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoBGR5x5, cv_hal_cvtBGRtoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB5x5(scn, swapBlue ? 2 : 0, greenBits)); } @@ -1295,8 +1127,6 @@ void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGR5x5toBGR, cv_hal_cvtBGR5x5toBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52RGB(dcn, swapBlue ? 2 : 0, greenBits)); } @@ -1308,38 +1138,6 @@ void cvtBGRtoGray(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoGray, cv_hal_cvtBGRtoGray, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue); - -#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 - CV_IPP_CHECK() - { - if(depth == CV_32F && scn == 3 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPColor2GrayFunctor(ippiColor2GrayC3Tab[depth])) ) - return; - } - else if(depth == CV_32F && scn == 3 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiRGB2GrayC3Tab[depth])) ) - return; - } - else if(depth == CV_32F && scn == 4 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPColor2GrayFunctor(ippiColor2GrayC4Tab[depth])) ) - return; - } - else if(depth == CV_32F && scn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiRGB2GrayC4Tab[depth])) ) - return; - } - } -#endif - int blueIdx = swapBlue ? 2 : 0; if( depth == CV_8U ) CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray(scn, blueIdx, 0)); @@ -1357,39 +1155,6 @@ void cvtGraytoBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtGraytoBGR, cv_hal_cvtGraytoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn); - -#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 - CV_IPP_CHECK() - { - bool ippres = false; - if(dcn == 3) - { - if( depth == CV_8U ) - { -#if !IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor()); -#endif - } - else if( depth == CV_16U ) - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor()); - else - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor()); - } - else if(dcn == 4) - { - if( depth == CV_8U ) - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor()); - else if( depth == CV_16U ) - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor()); - else - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor()); - } - if(ippres) - return; - } -#endif - if( depth == CV_8U ) CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB(dcn)); else if( depth == CV_16U ) @@ -1406,7 +1171,6 @@ void cvtBGR5x5toGray(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGR5x5toGray, cv_hal_cvtBGR5x5toGray, src_data, src_step, dst_data, dst_step, width, height, greenBits); CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52Gray(greenBits)); } @@ -1418,7 +1182,6 @@ void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtGraytoBGR5x5, cv_hal_cvtGraytoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, greenBits); CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB5x5(greenBits)); } @@ -1428,17 +1191,6 @@ void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtRGBAtoMultipliedRGBA, cv_hal_cvtRGBAtoMultipliedRGBA, src_data, src_step, dst_data, dst_step, width, height); - -#ifdef HAVE_IPP - CV_IPP_CHECK() - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R))) - return; - } -#endif - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGBA2mRGBA()); } @@ -1448,209 +1200,9 @@ void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtMultipliedRGBAtoRGBA, cv_hal_cvtMultipliedRGBAtoRGBA, src_data, src_step, dst_data, dst_step, width, height); CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, mRGBA2RGBA()); } -} // namespace hal - -// -// OCL calls -// - -#ifdef HAVE_OPENCL - -bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse ) -{ - OclHelper< Set<3, 4>, Set<3, 4>, Set > h(_src, _dst, dcn); - - if(!h.createKernel("RGB", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=%d -D bidx=0 -D %s", dcn, reverse ? "REVERSE" : "ORDER"))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits ) -{ - OclHelper< Set<3, 4>, Set<2>, Set > h(_src, _dst, 2); - - if(!h.createKernel("RGB2RGB5x5", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, gbits))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits) -{ - OclHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); - - if(!h.createKernel("RGB5x52RGB", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, gbits))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits) -{ - OclHelper< Set<2>, Set<1>, Set > h(_src, _dst, 1); - - if(!h.createKernel("BGR5x52Gray", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=1 -D bidx=0 -D greenbits=%d", gbits))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits) -{ - OclHelper< Set<1>, Set<2>, Set > h(_src, _dst, 2); - - if(!h.createKernel("Gray2BGR5x5", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=2 -D bidx=0 -D greenbits=%d", gbits))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx) -{ - OclHelper< Set<3, 4>, Set<1>, Set > h(_src, _dst, 1); - - int stripeSize = 1; - if(!h.createKernel("RGB2Gray", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", bidx, stripeSize))) - { - return false; - } - - h.globalSize[0] = (h.src.cols + stripeSize - 1)/stripeSize; - return h.run(); -} - -bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn) -{ - OclHelper< Set<1>, Set<3, 4>, Set > h(_src, _dst, dcn); - if(!h.createKernel("Gray2RGB", ocl::imgproc::color_rgb_oclsrc, - format("-D bidx=0 -D dcn=%d", dcn))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst) -{ - OclHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); - - if(!h.createKernel("RGBA2mRGBA", ocl::imgproc::color_rgb_oclsrc, - "-D dcn=4 -D bidx=3")) - { - return false; - } - - return h.run(); -} - -bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst) -{ - OclHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); - - if(!h.createKernel("mRGBA2RGBA", ocl::imgproc::color_rgb_oclsrc, - "-D dcn=4 -D bidx=3")) - { - return false; - } - - return h.run(); -} - #endif - -// -// HAL calls -// - -void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb) -{ - CvtHelper< Set<3, 4>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtBGRtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, h.scn, dcn, swapb); -} - -void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits) -{ - CvtHelper< Set<3, 4>, Set<2>, Set > h(_src, _dst, 2); - - hal::cvtBGRtoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.scn, swapb, gbits); -} - -void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtBGR5x5toBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - dcn, swapb, gbits); -} - -void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb) -{ - CvtHelper< Set<3, 4>, Set<1>, Set > h(_src, _dst, 1); - - hal::cvtBGRtoGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, h.scn, swapb); -} - -void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<1>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtGraytoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, h.depth, dcn); -} - -void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits) -{ - CvtHelper< Set<2>, Set<1>, Set > h(_src, _dst, 1); - - hal::cvtBGR5x5toGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits); -} - -void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits) -{ - CvtHelper< Set<1>, Set<2>, Set > h(_src, _dst, 2); - - hal::cvtGraytoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits); -} - -void cvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst) -{ - CvtHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); - - hal::cvtRGBAtoMultipliedRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows); -} - -void cvtColormRGBA2RGBA( InputArray _src, OutputArray _dst) -{ - CvtHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); - - hal::cvtMultipliedRGBAtoRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows); -} - -} // namespace cv +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace diff --git a/modules/imgproc/src/color_yuv.dispatch.cpp b/modules/imgproc/src/color_yuv.dispatch.cpp index 7d731378e2..6cb508f980 100644 --- a/modules/imgproc/src/color_yuv.dispatch.cpp +++ b/modules/imgproc/src/color_yuv.dispatch.cpp @@ -3,1747 +3,19 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" -#include "color.hpp" - -namespace cv -{ - -//constants for conversion from/to RGB and YUV, YCrCb according to BT.601 - -//to YCbCr -static const float YCBF = 0.564f; // == 1/2/(1-B2YF) -static const float YCRF = 0.713f; // == 1/2/(1-R2YF) -static const int YCBI = 9241; // == YCBF*16384 -static const int YCRI = 11682; // == YCRF*16384 -//to YUV -static const float B2UF = 0.492f; -static const float R2VF = 0.877f; -static const int B2UI = 8061; // == B2UF*16384 -static const int R2VI = 14369; // == R2VF*16384 -//from YUV -static const float U2BF = 2.032f; -static const float U2GF = -0.395f; -static const float V2GF = -0.581f; -static const float V2RF = 1.140f; -static const int U2BI = 33292; -static const int U2GI = -6472; -static const int V2GI = -9519; -static const int V2RI = 18678; -//from YCrCb -static const float CB2BF = 1.773f; -static const float CB2GF = -0.344f; -static const float CR2GF = -0.714f; -static const float CR2RF = 1.403f; -static const int CB2BI = 29049; -static const int CB2GI = -5636; -static const int CR2GI = -11698; -static const int CR2RI = 22987; - -///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// - -template struct RGB2YCrCb_f -{ - typedef _Tp channel_type; - - RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) : - srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF }; - static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF }; - for(int i = 0; i < 5; i++) - { - coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; - } - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int scn = srccn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const _Tp delta = ColorChannel<_Tp>::half(); - float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - n *= 3; - for(int i = 0; i < n; i += 3, src += scn) - { - _Tp Y = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2); - _Tp Cr = saturate_cast<_Tp>((src[bidx^2] - Y)*C3 + delta); - _Tp Cb = saturate_cast<_Tp>((src[bidx] - Y)*C4 + delta); - dst[i] = Y; dst[i+1+yuvOrder] = Cr; dst[i+2-yuvOrder] = Cb; - } - } - int srccn, blueIdx; - bool isCrCb; - float coeffs[5]; -}; - -template <> -struct RGB2YCrCb_f -{ - typedef float channel_type; - - RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) : - srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF }; - static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF }; - for(int i = 0; i < 5; i++) - { - coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; - } - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const float * src, float * dst, int n) const - { - int scn = srccn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const float delta = ColorChannel::half(); - float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - - int i = 0; -#if CV_SIMD - v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); - v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4); - v_float32 vdelta = vx_setall_f32(delta); - const int vsize = v_float32::nlanes; - for( ; i <= n-vsize; - i += vsize, src += vsize*scn, dst += vsize*3) - { - v_float32 b, g, r, dummy; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, dummy); - } - - v_float32 y, cr, cb; - y = v_fma(b, vc0, v_fma(g, vc1, r*vc2)); - - if(bidx) - std::swap(r, b); - - cr = v_fma(r - y, vc3, vdelta); - cb = v_fma(b - y, vc4, vdelta); - - if(yuvOrder) - { - v_store_interleave(dst, y, cb, cr); - } - else - { - v_store_interleave(dst, y, cr, cb); - } - } - vx_cleanup(); -#endif - for ( ; i < n; i ++, src += scn, dst += 3) - { - float Y = src[0]*C0 + src[1]*C1 + src[2]*C2; - float Cr = (src[bidx^2] - Y)*C3 + delta; - float Cb = (src[bidx] - Y)*C4 + delta; - dst[0 ] = Y; - dst[1+yuvOrder] = Cr; - dst[2-yuvOrder] = Cb; - } - } - - int srccn, blueIdx; - bool isCrCb; - float coeffs[5]; -}; - - -template struct RGB2YCrCb_i -{ - typedef _Tp channel_type; - static const int shift = yuv_shift; - - RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) - : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; - static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; - - for(int i = 0; i < 5; i++) - { - coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; - } - if(blueIdx==0) std::swap(coeffs[0], coeffs[2]); - } - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int scn = srccn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - int delta = ColorChannel<_Tp>::half()*(1 << shift); - n *= 3; - for(int i = 0; i < n; i += 3, src += scn) - { - int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); - int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, shift); - int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, shift); - dst[i] = saturate_cast<_Tp>(Y); - dst[i+1+yuvOrder] = saturate_cast<_Tp>(Cr); - dst[i+2-yuvOrder] = saturate_cast<_Tp>(Cb); - } - } - int srccn, blueIdx; - bool isCrCb; - int coeffs[5]; -}; - - -template<> -struct RGB2YCrCb_i -{ - typedef ushort channel_type; - static const int shift = yuv_shift; - static const int fix_shift = (int)(sizeof(short)*8 - shift); - - RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) - : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; - static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; - - for(int i = 0; i < 5; i++) - { - coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; - } - if(blueIdx==0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const ushort* src, ushort* dst, int n) const - { - int scn = srccn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - int sdelta = ColorChannel::half()*(1 << shift); - int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; - const int descale = 1 << (shift-1); - - v_int16 b2y = vx_setall_s16((short)C0); - v_int16 g2y = vx_setall_s16((short)C1); - v_int16 r2y = vx_setall_s16((short)C2); - v_int16 one = vx_setall_s16(1); - v_int16 z = vx_setzero_s16(); - - v_int16 bg2y, r12y; - v_int16 dummy; - v_zip(b2y, g2y, bg2y, dummy); - v_zip(r2y, one, r12y, dummy); - - v_int16 vdescale = vx_setall_s16(1 << (shift-1)); - v_int32 vc3 = vx_setall_s32(C3); - v_int32 vc4 = vx_setall_s32(C4); - v_int32 vdd = vx_setall_s32(sdelta + descale); - - for(; i <= n-vsize; - i += vsize, src += vsize*scn, dst += vsize*3) - { - v_uint16 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - - v_uint16 y, cr, cb; - - v_int16 sb = v_reinterpret_as_s16(b); - v_int16 sr = v_reinterpret_as_s16(r); - v_int16 sg = v_reinterpret_as_s16(g); - - v_int16 bg0, bg1; - v_int16 rd0, rd1; - v_zip(sb, sg, bg0, bg1); - v_zip(sr, vdescale, rd0, rd1); - - // fixing 16bit signed multiplication - v_int16 mr, mg, mb; - mr = (sr < z) & r2y; - mg = (sg < z) & g2y; - mb = (sb < z) & b2y; - v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; - - v_int32 ssy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; - v_int32 ssy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; - - y = v_reinterpret_as_u16(v_add_wrap(v_pack(ssy0, ssy1), fixmul)); - - if(bidx) - swap(r, b); - - // (r-Y) and (b-Y) don't fit into int16 or uint16 range - v_uint32 r0, r1, b0, b1; - v_expand(r, r0, r1); - v_expand(b, b0, b1); - - v_uint32 uy0, uy1; - v_expand(y, uy0, uy1); - - v_int32 sr0 = v_reinterpret_as_s32(r0); - v_int32 sr1 = v_reinterpret_as_s32(r1); - v_int32 sb0 = v_reinterpret_as_s32(b0); - v_int32 sb1 = v_reinterpret_as_s32(b1); - v_int32 sy0 = v_reinterpret_as_s32(uy0); - v_int32 sy1 = v_reinterpret_as_s32(uy1); - - sr0 = sr0 - sy0; sr1 = sr1 - sy1; - sb0 = sb0 - sy0; sb1 = sb1 - sy1; - - v_int32 scr0, scr1, scb0, scb1; - - scr0 = (sr0*vc3 + vdd) >> shift; - scr1 = (sr1*vc3 + vdd) >> shift; - scb0 = (sb0*vc4 + vdd) >> shift; - scb1 = (sb1*vc4 + vdd) >> shift; - - // saturate and pack - cr = v_pack_u(scr0, scr1); - cb = v_pack_u(scb0, scb1); - - if(yuvOrder) - { - v_store_interleave(dst, y, cb, cr); - } - else - { - v_store_interleave(dst, y, cr, cb); - } - } - vx_cleanup(); -#endif - for( ; i < n; i++, src += scn, dst += 3) - { - int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); - int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + sdelta, shift); - int Cb = CV_DESCALE((src[bidx] - Y)*C4 + sdelta, shift); - dst[0] = saturate_cast(Y); - dst[1+yuvOrder] = saturate_cast(Cr); - dst[2-yuvOrder] = saturate_cast(Cb); - } - } - int srccn, blueIdx; - bool isCrCb; - int coeffs[5]; -}; - - -template <> -struct RGB2YCrCb_i -{ - typedef uchar channel_type; - static const int shift = yuv_shift; - - RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) - : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; - static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; - for(int i = 0; i < 5; i++) - { - coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; - } - if (blueIdx==0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const uchar * src, uchar * dst, int n) const - { - int scn = srccn, bidx = blueIdx, i = 0; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - int delta = ColorChannel::half()*(1 << shift); - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - const int descaleShift = 1 << (shift-1); - v_int16 bg2y; - v_int16 r12y; - v_int16 dummy; - v_zip(vx_setall_s16((short)C0), vx_setall_s16((short)C1), bg2y, dummy); - v_zip(vx_setall_s16((short)C2), vx_setall_s16( 1), r12y, dummy); - - // delta + descaleShift == descaleShift*(half*2+1) - v_int16 c3h, c4h; - const short h21 = (short)(ColorChannel::half()*2+1); - v_zip(vx_setall_s16((short)C3), vx_setall_s16(h21), c3h, dummy); - v_zip(vx_setall_s16((short)C4), vx_setall_s16(h21), c4h, dummy); - - v_int16 vdescale = vx_setall_s16(descaleShift); - - for( ; i <= n-vsize; - i += vsize, src += scn*vsize, dst += 3*vsize) - { - v_uint8 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - - v_uint8 y; - - v_uint16 r0, r1, g0, g1, b0, b1; - v_expand(r, r0, r1); - v_expand(g, g0, g1); - v_expand(b, b0, b1); - - v_int16 sr0, sr1, sg0, sg1, sb0, sb1; - sr0 = v_reinterpret_as_s16(r0); sr1 = v_reinterpret_as_s16(r1); - sg0 = v_reinterpret_as_s16(g0); sg1 = v_reinterpret_as_s16(g1); - sb0 = v_reinterpret_as_s16(b0); sb1 = v_reinterpret_as_s16(b1); - - v_uint32 y00, y01, y10, y11; - { - v_int16 bg00, bg01, bg10, bg11; - v_int16 rd00, rd01, rd10, rd11; - v_zip(sb0, sg0, bg00, bg01); - v_zip(sb1, sg1, bg10, bg11); - v_zip(sr0, vdescale, rd00, rd01); - v_zip(sr1, vdescale, rd10, rd11); - - y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; - y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; - y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; - y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; - } - - v_uint16 y0, y1; - y0 = v_pack(y00, y01); - y1 = v_pack(y10, y11); - - y = v_pack(y0, y1); - - v_int16 sy0, sy1; - sy0 = v_reinterpret_as_s16(y0); - sy1 = v_reinterpret_as_s16(y1); - - // (r-Y) and (b-Y) don't fit into 8 bit, use 16 bits instead - sr0 = v_sub_wrap(sr0, sy0); - sr1 = v_sub_wrap(sr1, sy1); - sb0 = v_sub_wrap(sb0, sy0); - sb1 = v_sub_wrap(sb1, sy1); - - if(bidx) - { - swap(sr0, sb0); swap(sr1, sb1); - } - - v_int32 cr00, cr01, cr10, cr11; - v_int32 cb00, cb01, cb10, cb11; - - // delta + descaleShift == descaleShift*(half*2+1) - { - v_int16 rd00, rd01, rd10, rd11; - v_int16 bd00, bd01, bd10, bd11; - - v_zip(sr0, vdescale, rd00, rd01); - v_zip(sr1, vdescale, rd10, rd11); - - v_zip(sb0, vdescale, bd00, bd01); - v_zip(sb1, vdescale, bd10, bd11); - - cr00 = v_dotprod(rd00, c3h); - cr01 = v_dotprod(rd01, c3h); - cr10 = v_dotprod(rd10, c3h); - cr11 = v_dotprod(rd11, c3h); - - cb00 = v_dotprod(bd00, c4h); - cb01 = v_dotprod(bd01, c4h); - cb10 = v_dotprod(bd10, c4h); - cb11 = v_dotprod(bd11, c4h); - } - - v_uint8 cr, cb; - - cr00 = cr00 >> shift; - cr01 = cr01 >> shift; - cr10 = cr10 >> shift; - cr11 = cr11 >> shift; - - cb00 = cb00 >> shift; - cb01 = cb01 >> shift; - cb10 = cb10 >> shift; - cb11 = cb11 >> shift; - - v_int16 cr0, cr1, cb0, cb1; - cr0 = v_pack(cr00, cr01); cr1 = v_pack(cr10, cr11); - cb0 = v_pack(cb00, cb01); cb1 = v_pack(cb10, cb11); - - cr = v_pack_u(cr0, cr1); - cb = v_pack_u(cb0, cb1); - - if(yuvOrder) - { - v_store_interleave(dst, y, cb, cr); - } - else - { - v_store_interleave(dst, y, cr, cb); - } - } - vx_cleanup(); -#endif - - for ( ; i < n; i++, src += scn, dst += 3) - { - int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); - int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, shift); - int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, shift); - dst[0] = saturate_cast(Y); - dst[1+yuvOrder] = saturate_cast(Cr); - dst[2-yuvOrder] = saturate_cast(Cb); - } - } - - int srccn, blueIdx, coeffs[5]; - bool isCrCb; -}; - - -template struct YCrCb2RGB_f -{ - typedef _Tp channel_type; - - YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF}; - static const float coeffs_yuv[] = { V2RF, V2GF, U2GF, U2BF}; - for(int i = 0; i < 4; i++) - { - coeffs[i] = isCrCb ? coeffs_cbr[i] : coeffs_yuv[i]; - } - } - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max(); - float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - n *= 3; - for(int i = 0; i < n; i += 3, dst += dcn) - { - _Tp Y = src[i]; - _Tp Cr = src[i+1+yuvOrder]; - _Tp Cb = src[i+2-yuvOrder]; - - _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3); - _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1); - _Tp r = saturate_cast<_Tp>(Y + (Cr - delta)*C0); - - dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; - if( dcn == 4 ) - dst[3] = alpha; - } - } - int dstcn, blueIdx; - bool isCrCb; - float coeffs[4]; -}; - - -template<> -struct YCrCb2RGB_f -{ - typedef float channel_type; - - YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF}; - static const float coeffs_yuv[] = { V2RF, V2GF, U2GF, U2BF}; - for(int i = 0; i < 4; i++) - { - coeffs[i] = isCrCb ? coeffs_cbr[i] : coeffs_yuv[i]; - } - } - - void operator()(const float* src, float* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const float delta = ColorChannel::half(), alpha = ColorChannel::max(); - float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - - int i = 0; -#if CV_SIMD - v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1); - v_float32 vc2 = vx_setall_f32(C2), vc3 = vx_setall_f32(C3); - v_float32 vdelta = vx_setall_f32(delta); - v_float32 valpha = vx_setall_f32(alpha); - const int vsize = v_float32::nlanes; - for( ; i <= n-vsize; - i += vsize, src += vsize*3, dst += vsize*dcn) - { - v_float32 y, cr, cb; - if(yuvOrder) - v_load_deinterleave(src, y, cb, cr); - else - v_load_deinterleave(src, y, cr, cb); - - v_float32 b, g, r; - - cb -= vdelta; cr -= vdelta; - b = v_fma(cb, vc3, y); - g = v_fma(cr, vc1, v_fma(cb, vc2, y)); - r = v_fma(cr, vc0, y); - - if(bidx) - swap(r, b); - - if(dcn == 3) - v_store_interleave(dst, b, g, r); - else - v_store_interleave(dst, b, g, r, valpha); - } - vx_cleanup(); -#endif - for(; i < n; i++, src += 3, dst += dcn) - { - float Y = src[0]; - float Cr = src[1+yuvOrder]; - float Cb = src[2-yuvOrder]; - - float b = Y + (Cb - delta)*C3; - float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1; - float r = Y + (Cr - delta)*C0; - - dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; - if( dcn == 4 ) - dst[3] = alpha; - } - } - int dstcn, blueIdx; - bool isCrCb; - float coeffs[4]; -}; - - -template struct YCrCb2RGB_i -{ - typedef _Tp channel_type; - static const int shift = yuv_shift; - - YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; - static const int coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; - for(int i = 0; i < 4; i++) - { - coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; - } - } - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max(); - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - n *= 3; - for(int i = 0; i < n; i += 3, dst += dcn) - { - _Tp Y = src[i]; - _Tp Cr = src[i+1+yuvOrder]; - _Tp Cb = src[i+2-yuvOrder]; - - int b = Y + CV_DESCALE((Cb - delta)*C3, shift); - int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); - int r = Y + CV_DESCALE((Cr - delta)*C0, shift); - - dst[bidx] = saturate_cast<_Tp>(b); - dst[1] = saturate_cast<_Tp>(g); - dst[bidx^2] = saturate_cast<_Tp>(r); - if( dcn == 4 ) - dst[3] = alpha; - } - } - int dstcn, blueIdx; - bool isCrCb; - int coeffs[4]; -}; - - -template <> -struct YCrCb2RGB_i -{ - typedef uchar channel_type; - static const int shift = yuv_shift; - - YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; - static const int coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; - for(int i = 0; i < 4; i++) - { - coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; - } - } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx, i = 0; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 valpha = vx_setall_u8(alpha); - v_uint8 vdelta = vx_setall_u8(delta); - const int descaleShift = 1 << (shift - 1); - v_int32 vdescale = vx_setall_s32(descaleShift); - - v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2); - // if YUV then C3 > 2^15, need to subtract it - // to fit in short by short multiplication - v_int16 vc3 = vx_setall_s16(yuvOrder ? (short)(C3-(1 << 15)) : (short)C3); - - for( ; i <= n-vsize; - i += vsize, src += 3*vsize, dst += dcn*vsize) - { - v_uint8 y, cr, cb; - if(yuvOrder) - { - v_load_deinterleave(src, y, cb, cr); - } - else - { - v_load_deinterleave(src, y, cr, cb); - } - - cr = v_sub_wrap(cr, vdelta); - cb = v_sub_wrap(cb, vdelta); - - v_int8 scr = v_reinterpret_as_s8(cr); - v_int8 scb = v_reinterpret_as_s8(cb); - - v_int16 scr0, scr1, scb0, scb1; - v_expand(scr, scr0, scr1); - v_expand(scb, scb0, scb1); - - v_int32 b00, b01, b10, b11; - v_int32 g00, g01, g10, g11; - v_int32 r00, r01, r10, r11; - - v_mul_expand(scb0, vc3, b00, b01); - v_mul_expand(scb1, vc3, b10, b11); - if(yuvOrder) - { - // if YUV then C3 > 2^15 - // so we fix the multiplication - v_int32 cb00, cb01, cb10, cb11; - v_expand(scb0, cb00, cb01); - v_expand(scb1, cb10, cb11); - b00 += cb00 << 15; b01 += cb01 << 15; - b10 += cb10 << 15; b11 += cb11 << 15; - } - - v_int32 t00, t01, t10, t11; - v_mul_expand(scb0, vc2, t00, t01); - v_mul_expand(scb1, vc2, t10, t11); - v_mul_expand(scr0, vc1, g00, g01); - v_mul_expand(scr1, vc1, g10, g11); - g00 += t00; g01 += t01; - g10 += t10; g11 += t11; - v_mul_expand(scr0, vc0, r00, r01); - v_mul_expand(scr1, vc0, r10, r11); - - b00 = (b00 + vdescale) >> shift; b01 = (b01 + vdescale) >> shift; - b10 = (b10 + vdescale) >> shift; b11 = (b11 + vdescale) >> shift; - g00 = (g00 + vdescale) >> shift; g01 = (g01 + vdescale) >> shift; - g10 = (g10 + vdescale) >> shift; g11 = (g11 + vdescale) >> shift; - r00 = (r00 + vdescale) >> shift; r01 = (r01 + vdescale) >> shift; - r10 = (r10 + vdescale) >> shift; r11 = (r11 + vdescale) >> shift; - - v_int16 b0, b1, g0, g1, r0, r1; - b0 = v_pack(b00, b01); b1 = v_pack(b10, b11); - g0 = v_pack(g00, g01); g1 = v_pack(g10, g11); - r0 = v_pack(r00, r01); r1 = v_pack(r10, r11); - - v_uint16 y0, y1; - v_expand(y, y0, y1); - v_int16 sy0, sy1; - sy0 = v_reinterpret_as_s16(y0); - sy1 = v_reinterpret_as_s16(y1); - - b0 = v_add_wrap(b0, sy0); b1 = v_add_wrap(b1, sy1); - g0 = v_add_wrap(g0, sy0); g1 = v_add_wrap(g1, sy1); - r0 = v_add_wrap(r0, sy0); r1 = v_add_wrap(r1, sy1); - - v_uint8 b, g, r; - b = v_pack_u(b0, b1); - g = v_pack_u(g0, g1); - r = v_pack_u(r0, r1); - - if(bidx) - swap(r, b); - - if(dcn == 3) - { - v_store_interleave(dst, b, g, r); - } - else - { - v_store_interleave(dst, b, g, r, valpha); - } - } - vx_cleanup(); -#endif - - for ( ; i < n; i++, src += 3, dst += dcn) - { - uchar Y = src[0]; - uchar Cr = src[1+yuvOrder]; - uchar Cb = src[2-yuvOrder]; - - int b = Y + CV_DESCALE((Cb - delta)*C3, shift); - int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); - int r = Y + CV_DESCALE((Cr - delta)*C0, shift); - - dst[bidx] = saturate_cast(b); - dst[1] = saturate_cast(g); - dst[bidx^2] = saturate_cast(r); - if( dcn == 4 ) - dst[3] = alpha; - } - } - int dstcn, blueIdx; - bool isCrCb; - int coeffs[4]; -}; +#include "opencl_kernels_imgproc.hpp" +#include "color.hpp" -template <> -struct YCrCb2RGB_i -{ - typedef ushort channel_type; - static const int shift = yuv_shift; - - YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; - static const int coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; - for(int i = 0; i < 4; i++) - { - coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; - } - } - - void operator()(const ushort* src, ushort* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx, i = 0; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const ushort delta = ColorChannel::half(), alpha = ColorChannel::max(); - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - -#if CV_SIMD - const int vsize = v_uint16::nlanes; - const int descaleShift = 1 << (shift-1); - v_uint16 valpha = vx_setall_u16(alpha); - v_uint16 vdelta = vx_setall_u16(delta); - v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2); - // if YUV then C3 > 2^15, need to subtract it - // to fit in short by short multiplication - v_int16 vc3 = vx_setall_s16(yuvOrder ? (short)(C3-(1 << 15)) : (short)C3); - v_int32 vdescale = vx_setall_s32(descaleShift); - for(; i <= n-vsize; - i += vsize, src += vsize*3, dst += vsize*dcn) - { - v_uint16 y, cr, cb; - if(yuvOrder) - { - v_load_deinterleave(src, y, cb, cr); - } - else - { - v_load_deinterleave(src, y, cr, cb); - } - - v_uint32 uy0, uy1; - v_expand(y, uy0, uy1); - v_int32 y0 = v_reinterpret_as_s32(uy0); - v_int32 y1 = v_reinterpret_as_s32(uy1); - - cr = v_sub_wrap(cr, vdelta); - cb = v_sub_wrap(cb, vdelta); - - v_int32 b0, b1, g0, g1, r0, r1; - - v_int16 scb = v_reinterpret_as_s16(cb); - v_int16 scr = v_reinterpret_as_s16(cr); - v_mul_expand(scb, vc3, b0, b1); - if(yuvOrder) - { - // if YUV then C3 > 2^15 - // so we fix the multiplication - v_int32 cb0, cb1; - v_expand(scb, cb0, cb1); - b0 += cb0 << 15; - b1 += cb1 << 15; - } - v_int32 t0, t1; - v_mul_expand(scb, vc2, t0, t1); - v_mul_expand(scr, vc1, g0, g1); - g0 += t0; g1 += t1; - v_mul_expand(scr, vc0, r0, r1); - - // shifted term doesn't fit into 16 bits, addition is to be done in 32 bits - b0 = ((b0 + vdescale) >> shift) + y0; - b1 = ((b1 + vdescale) >> shift) + y1; - g0 = ((g0 + vdescale) >> shift) + y0; - g1 = ((g1 + vdescale) >> shift) + y1; - r0 = ((r0 + vdescale) >> shift) + y0; - r1 = ((r1 + vdescale) >> shift) + y1; - - // saturate and pack - v_uint16 b, g, r; - b = v_pack_u(b0, b1); - g = v_pack_u(g0, g1); - r = v_pack_u(r0, r1); - - if(bidx) - swap(r, b); - - if(dcn == 3) - { - v_store_interleave(dst, b, g, r); - } - else - { - v_store_interleave(dst, b, g, r, valpha); - } - } - vx_cleanup(); -#endif - - for ( ; i < n; i++, src += 3, dst += dcn) - { - ushort Y = src[0]; - ushort Cr = src[1+yuvOrder]; - ushort Cb = src[2-yuvOrder]; - - int b = Y + CV_DESCALE((Cb - delta)*C3, shift); - int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); - int r = Y + CV_DESCALE((Cr - delta)*C0, shift); - - dst[bidx] = saturate_cast(b); - dst[1] = saturate_cast(g); - dst[bidx^2] = saturate_cast(r); - if( dcn == 4 ) - dst[3] = alpha; - } - } - int dstcn, blueIdx; - bool isCrCb; - int coeffs[4]; -}; - - -///////////////////////////////////// YUV420 -> RGB ///////////////////////////////////// - -static const int ITUR_BT_601_CY = 1220542; -static const int ITUR_BT_601_CUB = 2116026; -static const int ITUR_BT_601_CUG = -409993; -static const int ITUR_BT_601_CVG = -852492; -static const int ITUR_BT_601_CVR = 1673527; -static const int ITUR_BT_601_SHIFT = 20; - -// Coefficients for RGB to YUV420p conversion -static const int ITUR_BT_601_CRY = 269484; -static const int ITUR_BT_601_CGY = 528482; -static const int ITUR_BT_601_CBY = 102760; -static const int ITUR_BT_601_CRU = -155188; -static const int ITUR_BT_601_CGU = -305135; -static const int ITUR_BT_601_CBU = 460324; -static const int ITUR_BT_601_CGV = -385875; -static const int ITUR_BT_601_CBV = -74448; - -//R = 1.164(Y - 16) + 1.596(V - 128) -//G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) -//B = 1.164(Y - 16) + 2.018(U - 128) - -//R = (1220542(Y - 16) + 1673527(V - 128) + (1 << 19)) >> 20 -//G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20 -//B = (1220542(Y - 16) + 2116026(U - 128) + (1 << 19)) >> 20 - -static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) -{ - int uu, vv; - uu = int(u) - 128; - vv = int(v) - 128; - - ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * vv; - guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * vv + ITUR_BT_601_CUG * uu; - buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu; -} - -static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, - v_int32 (&ruv)[4], - v_int32 (&guv)[4], - v_int32 (&buv)[4]) -{ - v_uint8 v128 = vx_setall_u8(128); - v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128)); - v_int8 sv = v_reinterpret_as_s8(v_sub_wrap(v, v128)); - - v_int16 uu0, uu1, vv0, vv1; - v_expand(su, uu0, uu1); - v_expand(sv, vv0, vv1); - v_int32 uu[4], vv[4]; - v_expand(uu0, uu[0], uu[1]); v_expand(uu1, uu[2], uu[3]); - v_expand(vv0, vv[0], vv[1]); v_expand(vv1, vv[2], vv[3]); - - v_int32 vshift = vx_setall_s32(1 << (ITUR_BT_601_SHIFT - 1)); - v_int32 vr = vx_setall_s32(ITUR_BT_601_CVR); - v_int32 vg = vx_setall_s32(ITUR_BT_601_CVG); - v_int32 ug = vx_setall_s32(ITUR_BT_601_CUG); - v_int32 ub = vx_setall_s32(ITUR_BT_601_CUB); - - for (int k = 0; k < 4; k++) - { - ruv[k] = vshift + vr * vv[k]; - guv[k] = vshift + vg * vv[k] + ug * uu[k]; - buv[k] = vshift + ub * uu[k]; - } -} - -static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, const int buv, - uchar& r, uchar& g, uchar& b, uchar& a) -{ - int yy = int(vy); - int y = std::max(0, yy - 16) * ITUR_BT_601_CY; - r = saturate_cast((y + ruv) >> ITUR_BT_601_SHIFT); - g = saturate_cast((y + guv) >> ITUR_BT_601_SHIFT); - b = saturate_cast((y + buv) >> ITUR_BT_601_SHIFT); - a = uchar(0xff); -} - -static inline void yRGBuvToRGBA(const v_uint8& vy, - const v_int32 (&ruv)[4], - const v_int32 (&guv)[4], - const v_int32 (&buv)[4], - v_uint8& rr, v_uint8& gg, v_uint8& bb) -{ - v_uint8 v16 = vx_setall_u8(16); - v_uint8 posY = vy - v16; - v_uint16 yy0, yy1; - v_expand(posY, yy0, yy1); - v_int32 yy[4]; - v_int32 yy00, yy01, yy10, yy11; - v_expand(v_reinterpret_as_s16(yy0), yy[0], yy[1]); - v_expand(v_reinterpret_as_s16(yy1), yy[2], yy[3]); - - v_int32 vcy = vx_setall_s32(ITUR_BT_601_CY); - - v_int32 y[4], r[4], g[4], b[4]; - for(int k = 0; k < 4; k++) - { - y[k] = yy[k]*vcy; - r[k] = (y[k] + ruv[k]) >> ITUR_BT_601_SHIFT; - g[k] = (y[k] + guv[k]) >> ITUR_BT_601_SHIFT; - b[k] = (y[k] + buv[k]) >> ITUR_BT_601_SHIFT; - } - - v_int16 r0, r1, g0, g1, b0, b1; - r0 = v_pack(r[0], r[1]); - r1 = v_pack(r[2], r[3]); - g0 = v_pack(g[0], g[1]); - g1 = v_pack(g[2], g[3]); - b0 = v_pack(b[0], b[1]); - b1 = v_pack(b[2], b[3]); - - rr = v_pack_u(r0, r1); - gg = v_pack_u(g0, g1); - bb = v_pack_u(b0, b1); -} - -template -static inline void cvtYuv42xxp2RGB8(const uchar u, const uchar v, - const uchar vy01, const uchar vy11, const uchar vy02, const uchar vy12, - uchar* row1, uchar* row2) -{ - int ruv, guv, buv; - uvToRGBuv(u, v, ruv, guv, buv); - - uchar r00, g00, b00, a00; - uchar r01, g01, b01, a01; - - yRGBuvToRGBA(vy01, ruv, guv, buv, r00, g00, b00, a00); - yRGBuvToRGBA(vy11, ruv, guv, buv, r01, g01, b01, a01); - - row1[2-bIdx] = r00; - row1[1] = g00; - row1[bIdx] = b00; - if(dcn == 4) - row1[3] = a00; - - row1[dcn+2-bIdx] = r01; - row1[dcn+1] = g01; - row1[dcn+0+bIdx] = b01; - if(dcn == 4) - row1[7] = a01; - - if(is420) - { - uchar r10, g10, b10, a10; - uchar r11, g11, b11, a11; - - yRGBuvToRGBA(vy02, ruv, guv, buv, r10, g10, b10, a10); - yRGBuvToRGBA(vy12, ruv, guv, buv, r11, g11, b11, a11); - - row2[2-bIdx] = r10; - row2[1] = g10; - row2[bIdx] = b10; - if(dcn == 4) - row2[3] = a10; - - row2[dcn+2-bIdx] = r11; - row2[dcn+1] = g11; - row2[dcn+0+bIdx] = b11; - if(dcn == 4) - row2[7] = a11; - } -} - -// bIdx is 0 or 2, uIdx is 0 or 1, dcn is 3 or 4 -template -struct YUV420sp2RGB8Invoker : ParallelLoopBody -{ - uchar * dst_data; - size_t dst_step; - int width; - const uchar* my1, *muv; - size_t stride; - - YUV420sp2RGB8Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _uv) - : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), muv(_uv), stride(_stride) {} - - void operator()(const Range& range) const CV_OVERRIDE - { - const int rangeBegin = range.start * 2; - const int rangeEnd = range.end * 2; - - const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2; - - for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride) - { - uchar* row1 = dst_data + dst_step * j; - uchar* row2 = dst_data + dst_step * (j + 1); - const uchar* y2 = y1 + stride; - - int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 a = vx_setall_u8(uchar(0xff)); - for( ; i <= width - 2*vsize; - i += 2*vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) - { - v_uint8 u, v; - v_load_deinterleave(uv + i, u, v); - - if(uIdx) - { - swap(u, v); - } - - v_uint8 vy[4]; - v_load_deinterleave(y1 + i, vy[0], vy[1]); - v_load_deinterleave(y2 + i, vy[2], vy[3]); - - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); - - v_uint8 r[4], g[4], b[4]; - - for(int k = 0; k < 4; k++) - { - yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); - } - - if(bIdx) - { - for(int k = 0; k < 4; k++) - swap(r[k], b[k]); - } - - // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] - v_uint8 r0_0, r0_1, r1_0, r1_1; - v_zip(r[0], r[1], r0_0, r0_1); - v_zip(r[2], r[3], r1_0, r1_1); - v_uint8 g0_0, g0_1, g1_0, g1_1; - v_zip(g[0], g[1], g0_0, g0_1); - v_zip(g[2], g[3], g1_0, g1_1); - v_uint8 b0_0, b0_1, b1_0, b1_1; - v_zip(b[0], b[1], b0_0, b0_1); - v_zip(b[2], b[3], b1_0, b1_1); - - if(dcn == 4) - { - v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a); - v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a); - - v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a); - v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a); - } - else //dcn == 3 - { - v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0); - v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1); - - v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0); - v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1); - } - } - vx_cleanup(); -#endif - for ( ; i < width; i += 2, row1 += dcn*2, row2 += dcn*2) - { - uchar u = uv[i + 0 + uIdx]; - uchar v = uv[i + 1 - uIdx]; - - uchar vy01 = y1[i]; - uchar vy11 = y1[i + 1]; - uchar vy02 = y2[i]; - uchar vy12 = y2[i + 1]; - - cvtYuv42xxp2RGB8(u, v, vy01, vy11, vy02, vy12, row1, row2); - } - } - } -}; - -template -struct YUV420p2RGB8Invoker : ParallelLoopBody -{ - uchar * dst_data; - size_t dst_step; - int width; - const uchar* my1, *mu, *mv; - size_t stride; - int ustepIdx, vstepIdx; - - YUV420p2RGB8Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx) - : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), mu(_u), mv(_v), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {} - - void operator()(const Range& range) const CV_OVERRIDE - { - const int rangeBegin = range.start * 2; - const int rangeEnd = range.end * 2; - - int uvsteps[2] = {width/2, static_cast(stride) - width/2}; - int usIdx = ustepIdx, vsIdx = vstepIdx; - - const uchar* y1 = my1 + rangeBegin * stride; - const uchar* u1 = mu + (range.start / 2) * stride; - const uchar* v1 = mv + (range.start / 2) * stride; - - if(range.start % 2 == 1) - { - u1 += uvsteps[(usIdx++) & 1]; - v1 += uvsteps[(vsIdx++) & 1]; - } - - for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1]) - { - uchar* row1 = dst_data + dst_step * j; - uchar* row2 = dst_data + dst_step * (j + 1); - const uchar* y2 = y1 + stride; - int i = 0; - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 a = vx_setall_u8(uchar(0xff)); - for( ; i <= width/2 - vsize; - i += vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) - { - v_uint8 u, v; - u = vx_load(u1 + i); - v = vx_load(v1 + i); - - v_uint8 vy[4]; - v_load_deinterleave(y1 + 2*i, vy[0], vy[1]); - v_load_deinterleave(y2 + 2*i, vy[2], vy[3]); - - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); - - v_uint8 r[4], g[4], b[4]; - - for(int k = 0; k < 4; k++) - { - yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); - } - - if(bIdx) - { - for(int k = 0; k < 4; k++) - swap(r[k], b[k]); - } - - // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] - v_uint8 r0_0, r0_1, r1_0, r1_1; - v_zip(r[0], r[1], r0_0, r0_1); - v_zip(r[2], r[3], r1_0, r1_1); - v_uint8 g0_0, g0_1, g1_0, g1_1; - v_zip(g[0], g[1], g0_0, g0_1); - v_zip(g[2], g[3], g1_0, g1_1); - v_uint8 b0_0, b0_1, b1_0, b1_1; - v_zip(b[0], b[1], b0_0, b0_1); - v_zip(b[2], b[3], b1_0, b1_1); - - if(dcn == 4) - { - v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a); - v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a); - - v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a); - v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a); - } - else //dcn == 3 - { - v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0); - v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1); - - v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0); - v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1); - } - } - vx_cleanup(); -#endif - for (; i < width / 2; i += 1, row1 += dcn*2, row2 += dcn*2) - { - uchar u = u1[i]; - uchar v = v1[i]; - - uchar vy01 = y1[2 * i]; - uchar vy11 = y1[2 * i + 1]; - uchar vy02 = y2[2 * i]; - uchar vy12 = y2[2 * i + 1]; - - cvtYuv42xxp2RGB8(u, v, vy01, vy11, vy02, vy12, row1, row2); - } - } - } -}; - - -#define MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION (320*240) - -template -inline void cvtYUV420sp2RGB(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _uv) -{ - YUV420sp2RGB8Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _uv); - if (dst_width * dst_height >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION) - parallel_for_(Range(0, dst_height/2), converter); - else - converter(Range(0, dst_height/2)); -} - -template -inline void cvtYUV420p2RGB(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx) -{ - YUV420p2RGB8Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _u, _v, ustepIdx, vstepIdx); - if (dst_width * dst_height >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION) - parallel_for_(Range(0, dst_height/2), converter); - else - converter(Range(0, dst_height/2)); -} - -///////////////////////////////////// RGB -> YUV420p ///////////////////////////////////// - -static inline uchar rgbToY42x(uchar r, uchar g, uchar b) -{ - const int shifted16 = (16 << ITUR_BT_601_SHIFT); - const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); - int yy = ITUR_BT_601_CRY * r + ITUR_BT_601_CGY * g + ITUR_BT_601_CBY * b + halfShift + shifted16; - - return saturate_cast(yy >> ITUR_BT_601_SHIFT); -} - -static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b) -{ - const int shifted16 = (16 << ITUR_BT_601_SHIFT); - const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); - v_uint16 r0, r1, g0, g1, b0, b1; - v_expand(r, r0, r1); - v_expand(g, g0, g1); - v_expand(b, b0, b1); - - v_uint32 rq[4], gq[4], bq[4]; - v_expand(r0, rq[0], rq[1]); v_expand(r1, rq[2], rq[3]); - v_expand(g0, gq[0], gq[1]); v_expand(g1, gq[2], gq[3]); - v_expand(b0, bq[0], bq[1]); v_expand(b1, bq[2], bq[3]); - - v_uint32 ry = vx_setall_u32(ITUR_BT_601_CRY), gy = vx_setall_u32(ITUR_BT_601_CGY); - v_uint32 by = vx_setall_u32(ITUR_BT_601_CBY), shift = vx_setall_u32(halfShift + shifted16); - - v_uint32 y[4]; - for(int k = 0; k < 4; k++) - { - y[k] = (rq[k]*ry + gq[k]*gy + bq[k]*by + shift) >> ITUR_BT_601_SHIFT; - } - - v_uint16 y0, y1; - y0 = v_pack(y[0], y[1]); - y1 = v_pack(y[2], y[3]); - - return v_pack(y0, y1); -} - -static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v) -{ - const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); - const int shifted128 = (128 << ITUR_BT_601_SHIFT); - int uu = ITUR_BT_601_CRU * r + ITUR_BT_601_CGU * g + ITUR_BT_601_CBU * b + halfShift + shifted128; - int vv = ITUR_BT_601_CBU * r + ITUR_BT_601_CGV * g + ITUR_BT_601_CBV * b + halfShift + shifted128; - - u = saturate_cast(uu >> ITUR_BT_601_SHIFT); - v = saturate_cast(vv >> ITUR_BT_601_SHIFT); -} - -static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1, - const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v) -{ - // [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..] - v_int16 vlowByte = vx_setall_s16(0x00ff); - v_int16 rd0, rd1, gd0, gd1, bd0, bd1; - rd0 = v_reinterpret_as_s16(r0) & vlowByte; - rd1 = v_reinterpret_as_s16(r1) & vlowByte; - gd0 = v_reinterpret_as_s16(g0) & vlowByte; - gd1 = v_reinterpret_as_s16(g1) & vlowByte; - bd0 = v_reinterpret_as_s16(b0) & vlowByte; - bd1 = v_reinterpret_as_s16(b1) & vlowByte; - - v_int32 rq[4], gq[4], bq[4]; - v_expand(rd0, rq[0], rq[1]); - v_expand(rd1, rq[2], rq[3]); - v_expand(gd0, gq[0], gq[1]); - v_expand(gd1, gq[2], gq[3]); - v_expand(bd0, bq[0], bq[1]); - v_expand(bd1, bq[2], bq[3]); - - const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); - const int shifted128 = (128 << ITUR_BT_601_SHIFT); - v_int32 shift = vx_setall_s32(halfShift + shifted128); - v_int32 ru, gu, bu, gv, bv; - ru = vx_setall_s32(ITUR_BT_601_CRU); - gu = vx_setall_s32(ITUR_BT_601_CGU); - gv = vx_setall_s32(ITUR_BT_601_CGV); - bu = vx_setall_s32(ITUR_BT_601_CBU); - bv = vx_setall_s32(ITUR_BT_601_CBV); - - v_int32 uq[4], vq[4]; - for(int k = 0; k < 4; k++) - { - uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT; - vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT; - } - - v_int16 u0, u1, v0, v1; - u0 = v_pack(uq[0], uq[1]); - u1 = v_pack(uq[2], uq[3]); - v0 = v_pack(vq[0], vq[1]); - v1 = v_pack(vq[2], vq[3]); - - u = v_pack_u(u0, u1); - v = v_pack_u(v0, v1); -} - - -struct RGB8toYUV420pInvoker: public ParallelLoopBody -{ - RGB8toYUV420pInvoker(const uchar * _srcData, size_t _srcStep, - uchar * _yData, uchar * _uvData, size_t _dstStep, - int _srcWidth, int _srcHeight, int _scn, bool _swapBlue, bool _swapUV, bool _interleave) - : srcData(_srcData), srcStep(_srcStep), - yData(_yData), uvData(_uvData), dstStep(_dstStep), - srcWidth(_srcWidth), srcHeight(_srcHeight), - srcCn(_scn), swapBlue(_swapBlue), swapUV(_swapUV), interleave(_interleave) { } - - void operator()(const Range& rowRange) const CV_OVERRIDE - { - const int w = srcWidth; - const int h = srcHeight; - const int scn = srcCn; - const uchar* srcRow = (uchar*)0; - uchar* yRow = (uchar*)0, *uRow = (uchar*)0, *vRow = (uchar*)0, *uvRow = (uchar*)0; - for( int sRow = rowRange.start*2; sRow < rowRange.end*2; sRow++) - { - srcRow = srcData + srcStep*sRow; - yRow = yData + dstStep * sRow; - bool evenRow = (sRow % 2) == 0; - if(evenRow) - { - if (interleave) - { - uvRow = uvData + dstStep*(sRow/2); - } - else - { - uRow = uvData + dstStep * (sRow/4) + ((sRow/2) % 2) * (w/2); - vRow = uvData + dstStep * ((sRow + h)/4) + (((sRow + h)/2) % 2) * (w/2); - } - } - int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; - - for( ; i <= w/2 - vsize; - i += vsize) - { - // processing (2*vsize) pixels at once - v_uint8 b0, b1, g0, g1, r0, r1, a0, a1; - if(scn == 4) - { - v_load_deinterleave(srcRow + 2*4*i + 0*vsize, b0, g0, r0, a0); - v_load_deinterleave(srcRow + 2*4*i + 4*vsize, b1, g1, r1, a1); - } - else // scn == 3 - { - v_load_deinterleave(srcRow + 2*3*i + 0*vsize, b0, g0, r0); - v_load_deinterleave(srcRow + 2*3*i + 3*vsize, b1, g1, r1); - } - - if(swapBlue) - { - swap(b0, r0); swap(b1, r1); - } - - v_uint8 y0, y1; - - y0 = rgbToY42x(r0, g0, b0); - y1 = rgbToY42x(r1, g1, b1); - - v_store(yRow + 2*i + 0*vsize, y0); - v_store(yRow + 2*i + 1*vsize, y1); - - if(evenRow) - { - v_uint8 u, v; - rgbToUV42x(r0, r1, g0, g1, b0, b1, u, v); - - if(swapUV) - { - swap(u, v); - } - - if(interleave) - { - v_store_interleave(uvRow + 2*i, u, v); - } - else - { - v_store(uRow + i, u); - v_store(vRow + i, v); - } - } - } - vx_cleanup(); -#endif - // processing two pixels at once - for( ; i < w/2; i++) - { - uchar b0, g0, r0; - uchar b1, g1, r1; - b0 = srcRow[(2*i+0)*scn + 0]; - g0 = srcRow[(2*i+0)*scn + 1]; - r0 = srcRow[(2*i+0)*scn + 2]; - b1 = srcRow[(2*i+1)*scn + 0]; - g1 = srcRow[(2*i+1)*scn + 1]; - r1 = srcRow[(2*i+1)*scn + 2]; - - if(swapBlue) - { - swap(b0, r0); swap(b1, r1); - } - - uchar y0 = rgbToY42x(r0, g0, b0); - uchar y1 = rgbToY42x(r1, g1, b1); - - yRow[2*i+0] = y0; - yRow[2*i+1] = y1; - - if(evenRow) - { - uchar uu, vv; - rgbToUV42x(r0, g0, b0, uu, vv); - if(swapUV) - { - swap(uu, vv); - } - - if(interleave) - { - uvRow[2*i+0] = uu; - uvRow[2*i+1] = vv; - } - else - { - uRow[i] = uu; - vRow[i] = vv; - } - } - } - } - } - - const uchar * srcData; - size_t srcStep; - uchar *yData, *uvData; - size_t dstStep; - int srcWidth; - int srcHeight; - const int srcCn; - bool swapBlue; - bool swapUV; - bool interleave; -}; - - -///////////////////////////////////// YUV422 -> RGB ///////////////////////////////////// - -// bIdx is 0 or 2; [uIdx, yIdx] is [0, 0], [0, 1], [1, 0]; dcn is 3 or 4 -template -struct YUV422toRGB8Invoker : ParallelLoopBody -{ - uchar * dst_data; - size_t dst_step; - const uchar * src_data; - size_t src_step; - int width; - - YUV422toRGB8Invoker(uchar * _dst_data, size_t _dst_step, - const uchar * _src_data, size_t _src_step, - int _width) - : dst_data(_dst_data), dst_step(_dst_step), src_data(_src_data), src_step(_src_step), width(_width) {} - - void operator()(const Range& range) const CV_OVERRIDE - { - int rangeBegin = range.start; - int rangeEnd = range.end; - - // [yIdx, uIdx] | [uidx, vidx]: - // 0, 0 | 1, 3 - // 0, 1 | 3, 1 - // 1, 0 | 0, 2 - const int uidx = 1 - yIdx + uIdx * 2; - const int vidx = (2 + uidx) % 4; - const uchar* yuv_src = src_data + rangeBegin * src_step; - - for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += src_step) - { - uchar* row = dst_data + dst_step * j; - int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 a = vx_setall_u8(uchar(0xff)); - for(; i <= 2*width - 4*vsize; - i += 4*vsize, row += vsize*dcn*2) - { - v_uint8 u, v, vy[2]; - if(yIdx == 1) // UYVY - { - v_load_deinterleave(yuv_src + i, u, vy[0], v, vy[1]); - } - else // YUYV or YVYU - { - v_load_deinterleave(yuv_src + i, vy[0], u, vy[1], v); - if(uIdx == 1) // YVYU - { - swap(u, v); - } - } - - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); - - v_uint8 r[2], g[2], b[2]; - - yRGBuvToRGBA(vy[0], ruv, guv, buv, r[0], g[0], b[0]); - yRGBuvToRGBA(vy[1], ruv, guv, buv, r[1], g[1], b[1]); - - if(bIdx) - { - swap(r[0], b[0]); - swap(r[1], b[1]); - } - - // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] - v_uint8 r0_0, r0_1; - v_zip(r[0], r[1], r0_0, r0_1); - v_uint8 g0_0, g0_1; - v_zip(g[0], g[1], g0_0, g0_1); - v_uint8 b0_0, b0_1; - v_zip(b[0], b[1], b0_0, b0_1); - - if(dcn == 4) - { - v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0, a); - v_store_interleave(row + 4*vsize, b0_1, g0_1, r0_1, a); - } - else //dcn == 3 - { - v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0); - v_store_interleave(row + 3*vsize, b0_1, g0_1, r0_1); - } - } - vx_cleanup(); -#endif - for (; i < 2 * width; i += 4, row += dcn*2) - { - uchar u = yuv_src[i + uidx]; - uchar v = yuv_src[i + vidx]; - - uchar vy0 = yuv_src[i + yIdx]; - uchar vy1 = yuv_src[i + yIdx + 2]; - - cvtYuv42xxp2RGB8(u, v, vy0, vy1, 0, 0, row, (uchar*)(0)); - } - } - } -}; - -#define MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION (320*240) +#include "color_yuv.simd.hpp" +#include "color_yuv.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content -template -inline void cvtYUV422toRGB(uchar * dst_data, size_t dst_step, const uchar * src_data, size_t src_step, - int width, int height) -{ - YUV422toRGB8Invoker converter(dst_data, dst_step, src_data, src_step, width); - if (width * height >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION) - parallel_for_(Range(0, height), converter); - else - converter(Range(0, height)); -} +namespace cv { // // HAL functions // - -namespace hal -{ +namespace hal { // 8u, 16u, 32f void cvtBGRtoYUV(const uchar * src_data, size_t src_step, @@ -1790,13 +62,8 @@ void cvtBGRtoYUV(const uchar * src_data, size_t src_step, #endif #endif - int blueIdx = swapBlue ? 2 : 0; - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i(scn, blueIdx, isCbCr)); - else if( depth == CV_16U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i(scn, blueIdx, isCbCr)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_f(scn, blueIdx, isCbCr)); + CV_CPU_DISPATCH(cvtBGRtoYUV, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr), + CV_CPU_DISPATCH_MODES_ALL); } void cvtYUVtoBGR(const uchar * src_data, size_t src_step, @@ -1844,13 +111,8 @@ void cvtYUVtoBGR(const uchar * src_data, size_t src_step, #endif #endif - int blueIdx = swapBlue ? 2 : 0; - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i(dcn, blueIdx, isCbCr)); - else if( depth == CV_16U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i(dcn, blueIdx, isCbCr)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_f(dcn, blueIdx, isCbCr)); + CV_CPU_DISPATCH(cvtYUVtoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isCbCr), + CV_CPU_DISPATCH_MODES_ALL); } void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, @@ -1861,17 +123,10 @@ void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); - const uchar* uv = src_data + src_step * static_cast(dst_height); - cvtTwoPlaneYUVtoBGR(src_data, uv, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); -} -typedef void (*cvt_2plane_yuv_ptr_t)(uchar * /* dst_data*/, - size_t /* dst_step */, - int /* dst_width */, - int /* dst_height */, - size_t /* _stride */, - const uchar* /* _y1 */, - const uchar* /* _uv */); + CV_CPU_DISPATCH(cvtTwoPlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx), + CV_CPU_DISPATCH_MODES_ALL); +} void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step, uchar * dst_data, size_t dst_step, @@ -1880,66 +135,21 @@ void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src { CV_INSTRUMENT_REGION(); - // TODO: add hal replacement method - - int blueIdx = swapBlue ? 2 : 0; - - cvt_2plane_yuv_ptr_t cvtPtr; - switch(dcn*100 + blueIdx * 10 + uIdx) - { - case 300: cvtPtr = cvtYUV420sp2RGB<0, 0, 3>; break; - case 301: cvtPtr = cvtYUV420sp2RGB<0, 1, 3>; break; - case 320: cvtPtr = cvtYUV420sp2RGB<2, 0, 3>; break; - case 321: cvtPtr = cvtYUV420sp2RGB<2, 1, 3>; break; - case 400: cvtPtr = cvtYUV420sp2RGB<0, 0, 4>; break; - case 401: cvtPtr = cvtYUV420sp2RGB<0, 1, 4>; break; - case 420: cvtPtr = cvtYUV420sp2RGB<2, 0, 4>; break; - case 421: cvtPtr = cvtYUV420sp2RGB<2, 1, 4>; break; - default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; - }; - - cvtPtr(dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); + CV_CPU_DISPATCH(cvtTwoPlaneYUVtoBGR, (y_data, uv_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx), + CV_CPU_DISPATCH_MODES_ALL); } -typedef void (*cvt_3plane_yuv_ptr_t)(uchar * /* dst_data */, - size_t /* dst_step */, - int /* dst_width */, - int /* dst_height */, - size_t /* _stride */, - const uchar* /* _y1 */, - const uchar* /* _u */, - const uchar* /* _v */, - int /* ustepIdx */, - int /* vstepIdx */); - void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int dst_width, int dst_height, - int dcn, bool swapBlue, int uIdx) + uchar * dst_data, size_t dst_step, + int dst_width, int dst_height, + int dcn, bool swapBlue, int uIdx) { CV_INSTRUMENT_REGION(); CALL_HAL(cvtThreePlaneYUVtoBGR, cv_hal_cvtThreePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); - const uchar* u = src_data + src_step * static_cast(dst_height); - const uchar* v = src_data + src_step * static_cast(dst_height + dst_height/4) + (dst_width/2) * ((dst_height % 4)/2); - - int ustepIdx = 0; - int vstepIdx = dst_height % 4 == 2 ? 1 : 0; - if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); } - int blueIdx = swapBlue ? 2 : 0; - - cvt_3plane_yuv_ptr_t cvtPtr; - switch(dcn*10 + blueIdx) - { - case 30: cvtPtr = cvtYUV420p2RGB<0, 3>; break; - case 32: cvtPtr = cvtYUV420p2RGB<2, 3>; break; - case 40: cvtPtr = cvtYUV420p2RGB<0, 4>; break; - case 42: cvtPtr = cvtYUV420p2RGB<2, 4>; break; - default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; - }; - - cvtPtr(dst_data, dst_step, dst_width, dst_height, src_step, src_data, u, v, ustepIdx, vstepIdx); + CV_CPU_DISPATCH(cvtThreePlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx), + CV_CPU_DISPATCH_MODES_ALL); } void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, @@ -1950,15 +160,9 @@ void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx); - uchar * uv_data = dst_data + dst_step * height; - RGB8toYUV420pInvoker cvt(src_data, src_step, dst_data, uv_data, dst_step, width, height, - scn, swapBlue, uIdx == 2, false); - - if( width * height >= 320*240 ) - parallel_for_(Range(0, height/2), cvt); - else - cvt(Range(0, height/2)); + CV_CPU_DISPATCH(cvtBGRtoThreePlaneYUV, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx), + CV_CPU_DISPATCH_MODES_ALL); } void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, @@ -1970,22 +174,10 @@ void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, // TODO: add hal replacement method - RGB8toYUV420pInvoker cvt(src_data, src_step, y_data, uv_data, dst_step, width, height, - scn, swapBlue, uIdx == 2, true); - - if( width * height >= 320*240 ) - parallel_for_(Range(0, height/2), cvt); - else - cvt(Range(0, height/2)); + CV_CPU_DISPATCH(cvtBGRtoTwoPlaneYUV, (src_data, src_step, y_data, uv_data, dst_step, width, height, scn, swapBlue, uIdx), + CV_CPU_DISPATCH_MODES_ALL); } -typedef void (*cvt_1plane_yuv_ptr_t)(uchar * /* dst_data */, - size_t /* dst_step */, - const uchar * /* src_data */, - size_t /* src_step */, - int /* width */, - int /* height */); - void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, @@ -1995,26 +187,8 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn); - cvt_1plane_yuv_ptr_t cvtPtr; - int blueIdx = swapBlue ? 2 : 0; - switch(dcn*1000 + blueIdx*100 + uIdx*10 + ycn) - { - case 3000: cvtPtr = cvtYUV422toRGB<0,0,0,3>; break; - case 3001: cvtPtr = cvtYUV422toRGB<0,0,1,3>; break; - case 3010: cvtPtr = cvtYUV422toRGB<0,1,0,3>; break; - case 3200: cvtPtr = cvtYUV422toRGB<2,0,0,3>; break; - case 3201: cvtPtr = cvtYUV422toRGB<2,0,1,3>; break; - case 3210: cvtPtr = cvtYUV422toRGB<2,1,0,3>; break; - case 4000: cvtPtr = cvtYUV422toRGB<0,0,0,4>; break; - case 4001: cvtPtr = cvtYUV422toRGB<0,0,1,4>; break; - case 4010: cvtPtr = cvtYUV422toRGB<0,1,0,4>; break; - case 4200: cvtPtr = cvtYUV422toRGB<2,0,0,4>; break; - case 4201: cvtPtr = cvtYUV422toRGB<2,0,1,4>; break; - case 4210: cvtPtr = cvtYUV422toRGB<2,1,0,4>; break; - default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; - }; - - cvtPtr(dst_data, dst_step, src_data, src_step, width, height); + CV_CPU_DISPATCH(cvtOnePlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn), + CV_CPU_DISPATCH_MODES_ALL); } } // namespace hal diff --git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp index 7d731378e2..8bbd78b244 100644 --- a/modules/imgproc/src/color_yuv.simd.hpp +++ b/modules/imgproc/src/color_yuv.simd.hpp @@ -3,11 +3,54 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" -#include "color.hpp" +#include "opencv2/core/hal/intrin.hpp" -namespace cv -{ +namespace cv { +namespace hal { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +void cvtBGRtoYUV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, bool swapBlue, bool isCbCr); +void cvtYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn, bool swapBlue, bool isCbCr); +void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int dst_width, int dst_height, + int dcn, bool swapBlue, int uIdx); +void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int dst_width, int dst_height, + int dcn, bool swapBlue, int uIdx); +void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int dst_width, int dst_height, + int dcn, bool swapBlue, int uIdx); +void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int uIdx); +void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, + uchar * y_data, uchar * uv_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int uIdx); +void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int dcn, bool swapBlue, int uIdx, int ycn); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if defined(CV_CPU_BASELINE_MODE) +// included in color.hpp +#else +#include "color.simd_helpers.hpp" +#endif +namespace { //constants for conversion from/to RGB and YUV, YCrCb according to BT.601 //to YCbCr @@ -1738,12 +1781,8 @@ inline void cvtYUV422toRGB(uchar * dst_data, size_t dst_step, const uchar * src_ converter(Range(0, height)); } -// -// HAL functions -// +} // namespace anon -namespace hal -{ // 8u, 16u, 32f void cvtBGRtoYUV(const uchar * src_data, size_t src_step, @@ -1753,43 +1792,6 @@ void cvtBGRtoYUV(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoYUV, cv_hal_cvtBGRtoYUV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr); - -#if defined(HAVE_IPP) -#if !IPP_DISABLE_RGB_YUV - CV_IPP_CHECK() - { - if (scn == 3 && depth == CV_8U && swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor((ippiGeneralFunc)ippiRGBToYUV_8u_C3R))) - return; - } - else if (scn == 3 && depth == CV_8U && !swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], - (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth))) - return; - } - else if (scn == 4 && depth == CV_8U && swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], - (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 0, 1, 2, depth))) - return; - } - else if (scn == 4 && depth == CV_8U && !swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], - (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth))) - return; - } - } -#endif -#endif - int blueIdx = swapBlue ? 2 : 0; if( depth == CV_8U ) CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i(scn, blueIdx, isCbCr)); @@ -1806,44 +1808,6 @@ void cvtYUVtoBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtYUVtoBGR, cv_hal_cvtYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isCbCr); - - -#if defined(HAVE_IPP) -#if !IPP_DISABLE_YUV_RGB - CV_IPP_CHECK() - { - if (dcn == 3 && depth == CV_8U && swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R))) - return; - } - else if (dcn == 3 && depth == CV_8U && !swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, - ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth))) - return; - } - else if (dcn == 4 && depth == CV_8U && swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, - ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth))) - return; - } - else if (dcn == 4 && depth == CV_8U && !swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, - ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth))) - return; - } - } -#endif -#endif - int blueIdx = swapBlue ? 2 : 0; if( depth == CV_8U ) CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i(dcn, blueIdx, isCbCr)); @@ -1860,7 +1824,6 @@ void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); const uchar* uv = src_data + src_step * static_cast(dst_height); cvtTwoPlaneYUVtoBGR(src_data, uv, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); } @@ -1880,8 +1843,6 @@ void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src { CV_INSTRUMENT_REGION(); - // TODO: add hal replacement method - int blueIdx = swapBlue ? 2 : 0; cvt_2plane_yuv_ptr_t cvtPtr; @@ -1919,7 +1880,6 @@ void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtThreePlaneYUVtoBGR, cv_hal_cvtThreePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); const uchar* u = src_data + src_step * static_cast(dst_height); const uchar* v = src_data + src_step * static_cast(dst_height + dst_height/4) + (dst_width/2) * ((dst_height % 4)/2); @@ -1949,7 +1909,6 @@ void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx); uchar * uv_data = dst_data + dst_step * height; RGB8toYUV420pInvoker cvt(src_data, src_step, dst_data, uv_data, dst_step, width, height, @@ -1968,8 +1927,6 @@ void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - // TODO: add hal replacement method - RGB8toYUV420pInvoker cvt(src_data, src_step, y_data, uv_data, dst_step, width, height, scn, swapBlue, uIdx == 2, true); @@ -1993,8 +1950,6 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn); - cvt_1plane_yuv_ptr_t cvtPtr; int blueIdx = swapBlue ? 2 : 0; switch(dcn*1000 + blueIdx*100 + uIdx*10 + ycn) @@ -2017,227 +1972,6 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, cvtPtr(dst_data, dst_step, src_data, src_step, width, height); } -} // namespace hal - -// -// OCL calls -// - -#ifdef HAVE_OPENCL - -bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx ) -{ - OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - if(!h.createKernel("YUV2RGB", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=%d -D bidx=%d", dcn, bidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx ) -{ - OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - if(!h.createKernel("RGB2YUV", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=3 -D bidx=%d", bidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx) -{ - OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - if(!h.createKernel("YCrCb2RGB", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=%d -D bidx=%d", dcn, bidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2YCrCb( InputArray _src, OutputArray _dst, int bidx) -{ - OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - if(!h.createKernel("RGB2YCrCb", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=3 -D bidx=%d", bidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx ) -{ - OclHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); - - bool optimized = _src.offset() % 4 == 0 && _src.step() % 4 == 0; - if(!h.createKernel("YUV2RGB_422", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d%s", dcn, bidx, uidx, yidx, - optimized ? " -D USE_OPTIMIZED_LOAD" : ""))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst ) -{ - OclHelper< Set<1>, Set<1>, Set, FROM_YUV> h(_src, _dst, 1); - - h.src.rowRange(0, _dst.rows()).copyTo(_dst); - return true; -} - -bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ) -{ - OclHelper< Set<1>, Set<3, 4>, Set, FROM_YUV > h(_src, _dst, dcn); - - if(!h.createKernel("YUV2RGB_NVx", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ) -{ - OclHelper< Set<1>, Set<3, 4>, Set, FROM_YUV > h(_src, _dst, dcn); - - if(!h.createKernel("YUV2RGB_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx, - _src.isContinuous() ? " -D SRC_CONT" : ""))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx ) -{ - OclHelper< Set<3, 4>, Set<1>, Set, TO_YUV > h(_src, _dst, 1); - - if(!h.createKernel("RGB2YUV_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=1 -D bidx=%d -D uidx=%d", bidx, uidx))) - { - return false; - } - - return h.run(); -} - #endif - -// -// HAL calls -// - -void cvtColorBGR2YUV(InputArray _src, OutputArray _dst, bool swapb, bool crcb) -{ - CvtHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - hal::cvtBGRtoYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, h.scn, swapb, crcb); -} - -void cvtColorYUV2BGR(InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, dcn, swapb, crcb); -} - -void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn) -{ - CvtHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtOnePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - dcn, swapb, uidx, ycn); -} - -void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi ) -{ - CV_Assert( _src.channels() == 2 && _src.depth() == CV_8U ); - - extractChannel(_src, _dst, coi); -} - -void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx) -{ - CvtHelper< Set<3, 4>, Set<1>, Set, TO_YUV > h(_src, _dst, 1); - - hal::cvtBGRtoThreePlaneYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.scn, swapb, uidx); -} - -void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst ) -{ - CvtHelper< Set<1>, Set<1>, Set, FROM_YUV > h(_src, _dst, 1); - -#ifdef HAVE_IPP -#if IPP_VERSION_X100 >= 201700 - if (CV_INSTRUMENT_FUN_IPP(ippiCopy_8u_C1R_L, h.src.data, (IppSizeL)h.src.step, h.dst.data, (IppSizeL)h.dst.step, - ippiSizeL(h.dstSz.width, h.dstSz.height)) >= 0) - return; -#endif -#endif - h.src(Range(0, h.dstSz.height), Range::all()).copyTo(h.dst); -} - -void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<1>, Set<3, 4>, Set, FROM_YUV> h(_src, _dst, dcn); - - hal::cvtThreePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows, - dcn, swapb, uidx); -} - -// http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples -// http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples - -void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx ) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<1>, Set<3, 4>, Set, FROM_YUV> h(_src, _dst, dcn); - - hal::cvtTwoPlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows, - dcn, swapb, uidx); -} - -void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx ) -{ - int stype = _ysrc.type(); - int depth = CV_MAT_DEPTH(stype); - Size ysz = _ysrc.size(), uvs = _uvsrc.size(); - CV_Assert( dcn == 3 || dcn == 4 ); - CV_Assert( depth == CV_8U ); - CV_Assert( ysz.width == uvs.width * 2 && ysz.height == uvs.height * 2 ); - - Mat ysrc = _ysrc.getMat(), uvsrc = _uvsrc.getMat(); - - _dst.create( ysz, CV_MAKETYPE(depth, dcn)); - Mat dst = _dst.getMat(); - - hal::cvtTwoPlaneYUVtoBGR(ysrc.data, uvsrc.data, ysrc.step, - dst.data, dst.step, dst.cols, dst.rows, - dcn, swapb, uidx); -} - -} // namespace cv +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace