diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
index 08909f8b28..7f6d6b0fb9 100644
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -124,6 +124,10 @@
 
 #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
 struct VZeroUpperGuard {
+#ifdef __GNUC__
+    __attribute__((always_inline))
+#endif
+    inline VZeroUpperGuard() { _mm256_zeroupper(); }  // issue vzeroupper as soon as the guard object is constructed
 #ifdef __GNUC__
     __attribute__((always_inline))
 #endif
diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp
index 1ea8c28643..c3f5b87267 100644
--- a/modules/core/include/opencv2/core/private.hpp
+++ b/modules/core/include/opencv2/core/private.hpp
@@ -796,9 +796,9 @@ CV_EXPORTS InstrNode*   getCurrentNode();
 #endif
 
 #ifdef __CV_AVX_GUARD
-#define CV_INSTRUMENT_REGION(); __CV_AVX_GUARD CV_INSTRUMENT_REGION_();
+#define CV_INSTRUMENT_REGION() __CV_AVX_GUARD CV_INSTRUMENT_REGION_();  // AVX guard first, then the instrumentation region; the expansion already ends in ';'
 #else
-#define CV_INSTRUMENT_REGION(); CV_INSTRUMENT_REGION_();
+#define CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION_();  // no __CV_AVX_GUARD defined: instrumentation region only
 #endif
 
 namespace cv {
diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt
index 1caadbbbad..6232aa5fab 100644
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
@@ -1,3 +1,6 @@
 set(the_description "Image Processing")
 ocv_add_dispatched_file(accum SSE4_1 AVX AVX2)
+ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2)
+ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2)
+ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2)
 ocv_define_module(imgproc opencv_core WRAP java python js)
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 38d35c014d..8f268e07e0 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -3,6 +3,7 @@
 // of this distribution and at http://opencv.org/license.html
 
 #include "precomp.hpp"
+#include "opencl_kernels_imgproc.hpp"
 #include "color.hpp"
 
 namespace cv
diff --git a/modules/imgproc/src/color.hpp b/modules/imgproc/src/color.hpp
index 70e7844277..8c1f19fa8a 100644
--- a/modules/imgproc/src/color.hpp
+++ b/modules/imgproc/src/color.hpp
@@ -3,59 +3,17 @@
 // of this distribution and at http://opencv.org/license.html
 
 #include "opencv2/imgproc.hpp"
-#include "opencv2/core/utility.hpp"
-#include <limits>
-#include "opencl_kernels_imgproc.hpp"
 #include "hal_replacement.hpp"
-#include "opencv2/core/hal/intrin.hpp"
-#include "opencv2/core/softfloat.hpp"
 
-#define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
-
-namespace cv
-{
-
-//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
-const float B2YF = 0.114f;
-const float G2YF = 0.587f;
-const float R2YF = 0.299f;
-
-enum
-{
-    yuv_shift = 14,
-    xyz_shift = 12,
-    R2Y = 4899, // == R2YF*16384
-    G2Y = 9617, // == G2YF*16384
-    B2Y = 1868, // == B2YF*16384
-    BLOCK_SIZE = 256
-};
-
-template<typename _Tp> struct ColorChannel
-{
-    typedef float worktype_f;
-    static _Tp max() { return std::numeric_limits<_Tp>::max(); }
-    static _Tp half() { return (_Tp)(max()/2 + 1); }
-};
-
-template<> struct ColorChannel<float>
-{
-    typedef float worktype_f;
-    static float max() { return 1.f; }
-    static float half() { return 0.5f; }
-};
-
-/*template<> struct ColorChannel<double>
-{
-    typedef double worktype_f;
-    static double max() { return 1.; }
-    static double half() { return 0.5; }
-};*/
+namespace cv {
 
 //
 // Helper functions
 //
 
-namespace {
+namespace impl {
+
+#include "color.simd_helpers.hpp"
 
 inline bool isHSV(int code)
 {
@@ -209,40 +167,9 @@ inline int uIndex(int code)
 }
 
-} // namespace::
+} // namespace impl
+using namespace impl;
 
-template<int i0, int i1 = -1, int i2 = -1>
-struct Set
-{
-    static bool contains(int i)
-    {
-        return (i == i0 || i == i1 || i == i2);
-    }
-};
-
-template<int i0, int i1>
-struct Set<i0, i1, -1>
-{
-    static bool contains(int i)
-    {
-        return (i == i0 || i == i1);
-    }
-};
-
-template<int i0>
-struct Set<i0, -1, -1>
-{
-    static bool contains(int i)
-    {
-        return (i == i0);
-    }
-};
-
-enum SizePolicy
-{
-    TO_YUV, FROM_YUV, NONE
-};
-
-template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE >
+/*template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE >
 struct CvtHelper
 {
     CvtHelper(InputArray _src, OutputArray _dst, int dcn)
@@ -282,7 +209,7 @@ struct CvtHelper
     Mat src, dst;
     int depth, scn;
     Size dstSz;
-};
+};*/
 
 #ifdef HAVE_OPENCL
 
@@ -380,49 +307,7 @@ struct OclHelper
 
 #endif
 
-///////////////////////////// Top-level template function ////////////////////////////////
-
-template <typename Cvt>
-class CvtColorLoop_Invoker : public ParallelLoopBody
-{
-    typedef typename Cvt::channel_type _Tp;
-public:
-
-    CvtColorLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt) :
-        ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_),
-        width(width_), cvt(_cvt)
-    {
-    }
-
-    virtual void operator()(const Range& range) const CV_OVERRIDE
-    {
-        CV_TRACE_FUNCTION();
-
-        const uchar* yS = src_data + static_cast<size_t>(range.start) * src_step;
-        uchar* yD = dst_data + static_cast<size_t>(range.start) * dst_step;
 
-        for( int i = range.start; i < range.end; ++i, yS += src_step, yD += dst_step )
-            cvt(reinterpret_cast<const _Tp*>(yS), reinterpret_cast<_Tp*>(yD), width);
-    }
-
-private:
-    const uchar * src_data;
-    const size_t src_step;
-    uchar * dst_data;
-    const size_t dst_step;
-    const int width;
-    const Cvt& cvt;
-
-    const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);
-};
-
-template <typename Cvt>
-void CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
-{
-    parallel_for_(Range(0, height),
-                  CvtColorLoop_Invoker<Cvt>(src_data, src_step, dst_data, dst_step, width, cvt),
-                  (width * height) / static_cast<double>(1<<16));
-}
 
 #if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700)
 #  define NEED_IPP 1
diff --git a/modules/imgproc/src/color.simd_helpers.hpp b/modules/imgproc/src/color.simd_helpers.hpp
index 70e7844277..343491f2c6 100644
--- a/modules/imgproc/src/color.simd_helpers.hpp
+++ b/modules/imgproc/src/color.simd_helpers.hpp
@@ -2,23 +2,14 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html
 
-#include "opencv2/imgproc.hpp"
-#include "opencv2/core/utility.hpp"
-#include <limits>
-#include "opencl_kernels_imgproc.hpp"
-#include "hal_replacement.hpp"
-#include "opencv2/core/hal/intrin.hpp"
-#include "opencv2/core/softfloat.hpp"
-
 #define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
 
-namespace cv
-{
+namespace {
 
 //constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
-const float B2YF = 0.114f;
-const float G2YF = 0.587f;
-const float R2YF = 0.299f;
+static const float B2YF = 0.114f;
+static const float G2YF = 0.587f;
+static const float R2YF = 0.299f;
 
 enum
 {
@@ -33,15 +24,15 @@ enum
 template<typename _Tp> struct ColorChannel
 {
     typedef float worktype_f;
-    static _Tp max() { return std::numeric_limits<_Tp>::max(); }
-    static _Tp half() { return (_Tp)(max()/2 + 1); }
+    static inline _Tp max() { return std::numeric_limits<_Tp>::max(); }  // largest value representable by _Tp
+    static inline _Tp half() { return (_Tp)(max()/2 + 1); }  // max()/2 + 1 cast back to _Tp, e.g. 128 when max() is 255
 };
 
 template<> struct ColorChannel<float>
 {
     typedef float worktype_f;
-    static float max() { return 1.f; }
-    static float half() { return 0.5f; }
+    static inline float max() { return 1.f; }  // float specialization: maximum is 1.0 rather than numeric_limits<float>::max()
+    static inline float half() { return 0.5f; }  // half of max()
 };
 
 /*template<> struct ColorChannel<double>
@@ -51,169 +42,11 @@ template<> struct ColorChannel<float>
     static double half() { return 0.5; }
 };*/
 
-//
-// Helper functions
-//
-
-namespace {
-
-inline bool isHSV(int code)
-{
-    switch(code)
-    {
-    case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
-    case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
-        return true;
-    default:
-        return false;
-    }
-}
-
-inline bool isLab(int code)
-{
-    switch (code)
-    {
-    case COLOR_Lab2BGR: case COLOR_Lab2RGB: case COLOR_Lab2LBGR: case COLOR_Lab2LRGB:
-    case COLOR_BGR2Lab: case COLOR_RGB2Lab: case COLOR_LBGR2Lab: case COLOR_LRGB2Lab:
-        return true;
-    default:
-        return false;
-    }
-}
-
-inline bool is_sRGB(int code)
-{
-    switch (code)
-    {
-    case COLOR_BGR2Lab: case COLOR_RGB2Lab: case COLOR_BGR2Luv: case COLOR_RGB2Luv:
-    case COLOR_Lab2BGR: case COLOR_Lab2RGB: case COLOR_Luv2BGR: case COLOR_Luv2RGB:
-        return true;
-    default:
-        return false;
-    }
-}
-
-inline bool swapBlue(int code)
-{
-    switch (code)
-    {
-    case COLOR_BGR2BGRA: case COLOR_BGRA2BGR:
-    case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555:
-    case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA:
-    case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
-    case COLOR_BGR2YCrCb: case COLOR_BGR2YUV:
-    case COLOR_YCrCb2BGR: case COLOR_YUV2BGR:
-    case COLOR_BGR2XYZ: case COLOR_XYZ2BGR:
-    case COLOR_BGR2HSV: case COLOR_BGR2HLS: case COLOR_BGR2HSV_FULL: case COLOR_BGR2HLS_FULL:
-    case COLOR_YUV2BGR_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2BGRA_IYUV:
-    case COLOR_YUV2BGR_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2BGRA_NV12:
-    case COLOR_Lab2BGR: case COLOR_Luv2BGR: case COLOR_Lab2LBGR: case COLOR_Luv2LBGR:
-    case COLOR_BGR2Lab: case COLOR_BGR2Luv: case COLOR_LBGR2Lab: case COLOR_LBGR2Luv:
-    case COLOR_HSV2BGR: case COLOR_HLS2BGR: case COLOR_HSV2BGR_FULL: case COLOR_HLS2BGR_FULL:
-    case COLOR_YUV2BGR_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2BGR_YUY2:
-    case COLOR_YUV2BGRA_YUY2:  case COLOR_YUV2BGR_YVYU: case COLOR_YUV2BGRA_YVYU:
-    case COLOR_BGR2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: case COLOR_BGR2YUV_YV12: case COLOR_BGRA2YUV_YV12:
-        return false;
-    default:
-        return true;
-    }
-}
-
-inline bool isFullRangeHSV(int code)
-{
-    switch (code)
-    {
-    case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL:
-    case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL:
-        return true;
-    default:
-        return false;
-    }
-}
-
-inline int dstChannels(int code)
-{
-    switch( code )
-    {
-        case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2RGBA:
-        case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
-        case COLOR_GRAY2BGRA:
-        case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12:
-        case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
-        case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
-        case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2:
-
-            return 4;
-
-        case COLOR_BGRA2BGR: case COLOR_RGBA2BGR: case COLOR_RGB2BGR:
-        case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
-        case COLOR_GRAY2BGR:
-        case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12:
-        case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV:
-        case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
-        case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2:
-
-            return 3;
-
-        default:
-            return 0;
-    }
-}
-
-inline int greenBits(int code)
-{
-    switch( code )
-    {
-        case COLOR_BGR2BGR565: case COLOR_RGB2BGR565: case COLOR_BGRA2BGR565: case COLOR_RGBA2BGR565:
-        case COLOR_BGR5652BGR: case COLOR_BGR5652RGB: case COLOR_BGR5652BGRA: case COLOR_BGR5652RGBA:
-        case COLOR_BGR5652GRAY: case COLOR_GRAY2BGR565:
-
-            return 6;
-
-        case COLOR_BGR2BGR555: case COLOR_RGB2BGR555: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR555:
-        case COLOR_BGR5552BGR: case COLOR_BGR5552RGB: case COLOR_BGR5552BGRA: case COLOR_BGR5552RGBA:
-        case COLOR_BGR5552GRAY: case COLOR_GRAY2BGR555:
-
-            return 5;
-
-        default:
-            return 0;
-    }
-}
-
-inline int uIndex(int code)
-{
-    switch( code )
-    {
-        case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
-
-            return 2;
-
-        case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
-        case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
-        case COLOR_YUV2BGR_NV21:  case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21:
-        case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
-
-            return 1;
-
-        case COLOR_YUV2BGR_NV12:  case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12:
-        case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
-        case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
-        case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2:
-
-            return 0;
-
-        default:
-            return -1;
-    }
-}
-
-} // namespace::
 
 template<int i0, int i1 = -1, int i2 = -1>
 struct Set
 {
-    static bool contains(int i)
+    static inline bool contains(int i)
     {
         return (i == i0 || i == i1 || i == i2);
     }
@@ -222,7 +55,7 @@ struct Set
 template<int i0, int i1>
 struct Set<i0, i1, -1>
 {
-    static bool contains(int i)
+    static inline bool contains(int i)
     {
         return (i == i0 || i == i1);
     }
@@ -231,7 +64,7 @@ struct Set<i0, i1, -1>
 template<int i0>
 struct Set<i0, -1, -1>
 {
-    static bool contains(int i)
+    static inline bool contains(int i)
     {
         return (i == i0);
     }
@@ -284,101 +117,6 @@ struct CvtHelper
     Size dstSz;
 };
 
-#ifdef HAVE_OPENCL
-
-template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE >
-struct OclHelper
-{
-    OclHelper( InputArray _src, OutputArray _dst, int dcn) :
-        nArgs(0)
-    {
-        src = _src.getUMat();
-        Size sz = src.size(), dstSz;
-        int scn = src.channels();
-        int depth = src.depth();
-
-        CV_Assert( VScn::contains(scn) && VDcn::contains(dcn) && VDepth::contains(depth) );
-        switch (sizePolicy)
-        {
-        case TO_YUV:
-            CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
-            dstSz = Size(sz.width, sz.height / 2 * 3);
-            break;
-        case FROM_YUV:
-            CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 );
-            dstSz = Size(sz.width, sz.height * 2 / 3);
-            break;
-        case NONE:
-        default:
-            dstSz = sz;
-            break;
-        }
-
-        _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
-        dst = _dst.getUMat();
-    }
-
-    bool createKernel(cv::String name, ocl::ProgramSource& source, cv::String options)
-    {
-        ocl::Device dev = ocl::Device::getDefault();
-        int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
-        int pxPerWIx = 1;
-
-        cv::String baseOptions = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
-                                        src.depth(), src.channels(), pxPerWIy);
-
-        switch (sizePolicy)
-        {
-        case TO_YUV:
-            if (dev.isIntel() &&
-                    src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 &&
-                    dst.step % 4 == 0 && dst.offset % 4 == 0)
-            {
-                pxPerWIx = 2;
-            }
-            globalSize[0] = (size_t)dst.cols/(2*pxPerWIx);
-            globalSize[1] = ((size_t)dst.rows/3 + pxPerWIy - 1) / pxPerWIy;
-            baseOptions += format("-D PIX_PER_WI_X=%d ", pxPerWIx);
-            break;
-        case FROM_YUV:
-            globalSize[0] = (size_t)dst.cols/2;
-            globalSize[1] = ((size_t)dst.rows/2 + pxPerWIy - 1) / pxPerWIy;
-            break;
-        case NONE:
-        default:
-            globalSize[0] = (size_t)src.cols;
-            globalSize[1] = ((size_t)src.rows + pxPerWIy - 1) / pxPerWIy;
-            break;
-        }
-
-        k.create(name.c_str(), source, baseOptions + options);
-
-        if(k.empty())
-            return false;
-
-        nArgs = k.set(0, ocl::KernelArg::ReadOnlyNoSize(src));
-        nArgs = k.set(nArgs, ocl::KernelArg::WriteOnly(dst));
-        return true;
-    }
-
-    bool run()
-    {
-        return k.run(2, globalSize, NULL, false);
-    }
-
-    template<typename T>
-    void setArg(const T& arg)
-    {
-        nArgs = k.set(nArgs, arg);
-    }
-
-    UMat src, dst;
-    ocl::Kernel k;
-    size_t globalSize[2];
-    int nArgs;
-};
-
-#endif
 
 ///////////////////////////// Top-level template function ////////////////////////////////
 
@@ -413,261 +151,17 @@ private:
     const int width;
     const Cvt& cvt;
 
-    const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);
+    CvtColorLoop_Invoker(const CvtColorLoop_Invoker&);  // = delete;
+    const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);  // = delete;
 };
 
-template <typename Cvt>
+template <typename Cvt> static inline  // applies cvt to each of the height rows (width pixels per row) through parallel_for_
 void CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
 {
+    CV_AVX_GUARD  // NOTE(review): macro is defined outside this patch; presumably instantiates the AVX zero-upper guard - confirm
     parallel_for_(Range(0, height),
                   CvtColorLoop_Invoker<Cvt>(src_data, src_step, dst_data, dst_step, width, cvt),
-                  (width * height) / static_cast<double>(1<<16));
+                  (static_cast<double>(width) * height) / static_cast<double>(1<<16));  // multiply in double: int width*height overflows (UB) on very large images
 }
 
-#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700)
-#  define NEED_IPP 1
-#else
-#  define NEED_IPP 0
-#endif
-
-#if NEED_IPP
-
-#define MAX_IPP8u   255
-#define MAX_IPP16u  65535
-#define MAX_IPP32f  1.0
-
-typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *);
-typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize);
-typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *);
-
-template <typename Cvt>
-class CvtColorIPPLoop_Invoker :
-        public ParallelLoopBody
-{
-public:
-
-    CvtColorIPPLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt, bool *_ok) :
-        ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), width(width_), cvt(_cvt), ok(_ok)
-    {
-        *ok = true;
-    }
-
-    virtual void operator()(const Range& range) const CV_OVERRIDE
-    {
-        const void *yS = src_data + src_step * range.start;
-        void *yD = dst_data + dst_step * range.start;
-        if( !cvt(yS, static_cast<int>(src_step), yD, static_cast<int>(dst_step), width, range.end - range.start) )
-            *ok = false;
-        else
-        {
-            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
-        }
-    }
-
-private:
-    const uchar * src_data;
-    const size_t src_step;
-    uchar * dst_data;
-    const size_t dst_step;
-    const int width;
-    const Cvt& cvt;
-    bool *ok;
-
-    const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&);
-};
-
-
-template <typename Cvt>
-bool CvtColorIPPLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
-{
-    bool ok;
-    parallel_for_(Range(0, height), CvtColorIPPLoop_Invoker<Cvt>(src_data, src_step, dst_data, dst_step, width, cvt, &ok), (width * height)/(double)(1<<16) );
-    return ok;
-}
-
-
-template <typename Cvt>
-bool CvtColorIPPLoopCopy(const uchar * src_data, size_t src_step, int src_type, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
-{
-    Mat temp;
-    Mat src(Size(width, height), src_type, const_cast<uchar*>(src_data), src_step);
-    Mat source = src;
-    if( src_data == dst_data )
-    {
-        src.copyTo(temp);
-        source = temp;
-    }
-    bool ok;
-    parallel_for_(Range(0, source.rows),
-                  CvtColorIPPLoop_Invoker<Cvt>(source.data, source.step, dst_data, dst_step,
-                                               source.cols, cvt, &ok),
-                  source.total()/(double)(1<<16) );
-    return ok;
-}
-
-
-struct IPPGeneralFunctor
-{
-    IPPGeneralFunctor(ippiGeneralFunc _func) : ippiColorConvertGeneral(_func){}
-    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
-    {
-        return ippiColorConvertGeneral ? CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0 : false;
-    }
-private:
-    ippiGeneralFunc ippiColorConvertGeneral;
-};
-
-
-struct IPPReorderFunctor
-{
-    IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : ippiColorConvertReorder(_func)
-    {
-        order[0] = _order0;
-        order[1] = _order1;
-        order[2] = _order2;
-        order[3] = 3;
-    }
-    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
-    {
-        return ippiColorConvertReorder ? CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0 : false;
-    }
-private:
-    ippiReorderFunc ippiColorConvertReorder;
-    int order[4];
-};
-
-
-struct IPPReorderGeneralFunctor
-{
-    IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) :
-        ippiColorConvertReorder(_func1), ippiColorConvertGeneral(_func2), depth(_depth)
-    {
-        order[0] = _order0;
-        order[1] = _order1;
-        order[2] = _order2;
-        order[3] = 3;
-    }
-    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
-    {
-        if (ippiColorConvertReorder == 0 || ippiColorConvertGeneral == 0)
-            return false;
-
-        Mat temp;
-        temp.create(rows, cols, CV_MAKETYPE(depth, 3));
-        if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) < 0)
-            return false;
-        return CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0;
-    }
-private:
-    ippiReorderFunc ippiColorConvertReorder;
-    ippiGeneralFunc ippiColorConvertGeneral;
-    int order[4];
-    int depth;
-};
-
-
-struct IPPGeneralReorderFunctor
-{
-    IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) :
-        ippiColorConvertGeneral(_func1), ippiColorConvertReorder(_func2), depth(_depth)
-    {
-        order[0] = _order0;
-        order[1] = _order1;
-        order[2] = _order2;
-        order[3] = 3;
-    }
-    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
-    {
-        if (ippiColorConvertGeneral == 0 || ippiColorConvertReorder == 0)
-            return false;
-
-        Mat temp;
-        temp.create(rows, cols, CV_MAKETYPE(depth, 3));
-        if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
-            return false;
-        return CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
-    }
-private:
-    ippiGeneralFunc ippiColorConvertGeneral;
-    ippiReorderFunc ippiColorConvertReorder;
-    int order[4];
-    int depth;
-};
-
-extern ippiReorderFunc ippiSwapChannelsC3C4RTab[8];
-extern ippiReorderFunc ippiSwapChannelsC4C3RTab[8];
-extern ippiReorderFunc ippiSwapChannelsC3RTab[8];
-
-#endif
-
-#ifdef HAVE_OPENCL
-
-bool oclCvtColorBGR2Luv( InputArray _src, OutputArray _dst, int bidx, bool srgb );
-bool oclCvtColorBGR2Lab( InputArray _src, OutputArray _dst, int bidx, bool srgb );
-bool oclCvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb);
-bool oclCvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb);
-bool oclCvtColorBGR2XYZ( InputArray _src, OutputArray _dst, int bidx );
-bool oclCvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx );
-
-bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full );
-bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full );
-bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full );
-bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full );
-
-bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse );
-bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits );
-bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits );
-bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits );
-bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits );
-bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx );
-bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn );
-bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst );
-bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst );
-
-bool oclCvtColorBGR2YCrCb( InputArray _src, OutputArray _dst, int bidx);
-bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx);
-bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx );
-bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx );
-
-bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx );
-bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx );
-bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx );
-bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx );
-bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst );
-
-#endif
-
-void cvtColorBGR2Lab( InputArray _src, OutputArray _dst, bool swapb, bool srgb);
-void cvtColorBGR2Luv( InputArray _src, OutputArray _dst, bool swapb, bool srgb);
-void cvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb );
-void cvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb );
-void cvtColorBGR2XYZ( InputArray _src, OutputArray _dst, bool swapb );
-void cvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb );
-
-void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, bool crcb);
-void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb);
-
-void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn);
-void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
-void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx );
-void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
-void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx);
-void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst );
-void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi );
-
-void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange );
-void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange );
-void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange);
-void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange);
-
-void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb);
-void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits);
-void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits);
-void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb);
-void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn);
-void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits);
-void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits);
-void cvtColorRGBA2mRGBA(InputArray _src, OutputArray _dst);
-void cvtColormRGBA2RGBA(InputArray _src, OutputArray _dst);
-
-} //namespace cv
+} // anonymous namespace
diff --git a/modules/imgproc/src/color_hsv.dispatch.cpp b/modules/imgproc/src/color_hsv.dispatch.cpp
index f0a4c87558..f1678f5deb 100644
--- a/modules/imgproc/src/color_hsv.dispatch.cpp
+++ b/modules/imgproc/src/color_hsv.dispatch.cpp
@@ -3,1194 +3,15 @@
 // of this distribution and at http://opencv.org/license.html
 
 #include "precomp.hpp"
-#include "color.hpp"
-
-namespace cv
-{
-
-////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
-
-
-struct RGB2HSV_b
-{
-    typedef uchar channel_type;
-
-    RGB2HSV_b(int _srccn, int _blueIdx, int _hrange)
-    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
-    {
-        CV_Assert( hrange == 180 || hrange == 256 );
-    }
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        int i, bidx = blueIdx, scn = srccn;
-        const int hsv_shift = 12;
-
-        static int sdiv_table[256];
-        static int hdiv_table180[256];
-        static int hdiv_table256[256];
-        static volatile bool initialized = false;
-
-        int hr = hrange;
-        const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256;
-        n *= 3;
-
-        if( !initialized )
-        {
-            sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
-            for( i = 1; i < 256; i++ )
-            {
-                sdiv_table[i] = saturate_cast<int>((255 << hsv_shift)/(1.*i));
-                hdiv_table180[i] = saturate_cast<int>((180 << hsv_shift)/(6.*i));
-                hdiv_table256[i] = saturate_cast<int>((256 << hsv_shift)/(6.*i));
-            }
-            initialized = true;
-        }
-
-        for( i = 0; i < n; i += 3, src += scn )
-        {
-            int b = src[bidx], g = src[1], r = src[bidx^2];
-            int h, s, v = b;
-            int vmin = b;
-            int vr, vg;
-
-            CV_CALC_MAX_8U( v, g );
-            CV_CALC_MAX_8U( v, r );
-            CV_CALC_MIN_8U( vmin, g );
-            CV_CALC_MIN_8U( vmin, r );
-
-            uchar diff = saturate_cast<uchar>(v - vmin);
-            vr = v == r ? -1 : 0;
-            vg = v == g ? -1 : 0;
-
-            s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
-            h = (vr & (g - b)) +
-                (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
-            h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
-            h += h < 0 ? hr : 0;
-
-            dst[i] = saturate_cast<uchar>(h);
-            dst[i+1] = (uchar)s;
-            dst[i+2] = (uchar)v;
-        }
-    }
-
-    int srccn, blueIdx, hrange;
-};
-
-
-struct RGB2HSV_f
-{
-    typedef float channel_type;
-
-    RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
-    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {
-        #if CV_SIMD128
-        hasSIMD = hasSIMD128();
-        #endif
-    }
-
-    #if CV_SIMD128
-    inline void process(v_float32x4& v_r, v_float32x4& v_g,
-                        v_float32x4& v_b, float hscale) const
-    {
-        v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b);
-        v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b);
-
-        v_float32x4 v_eps = v_setall_f32(FLT_EPSILON);
-        v_float32x4 v_diff = v_max_rgb - v_min_rgb;
-        v_float32x4 v_s = v_diff / (v_abs(v_max_rgb) + v_eps);
-
-        v_float32x4 v_r_eq_max = v_r == v_max_rgb;
-        v_float32x4 v_g_eq_max = v_g == v_max_rgb;
-        v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b,
-                          v_select(v_g_eq_max, v_b - v_r, v_r - v_g));
-        v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f),
-                            v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f)));
-        v_float32x4 v_rev_diff = v_setall_f32(60.0f) / (v_diff + v_eps);
-        v_r = v_muladd(v_h, v_rev_diff, v_res) * v_setall_f32(hscale);
-
-        v_g = v_s;
-        v_b = v_max_rgb;
-    }
-    #endif
-
-    void operator()(const float* src, float* dst, int n) const
-    {
-        int i = 0, bidx = blueIdx, scn = srccn;
-        float hscale = hrange*(1.f/360.f);
-        n *= 3;
-
-        #if CV_SIMD128
-        if (hasSIMD)
-        {
-            if (scn == 3) {
-                if (bidx) {
-                    for ( ; i <= n - 12; i += 12, src += scn * 4)
-                    {
-                        v_float32x4 v_r;
-                        v_float32x4 v_g;
-                        v_float32x4 v_b;
-                        v_load_deinterleave(src, v_r, v_g, v_b);
-                        process(v_r, v_g, v_b, hscale);
-                        v_store_interleave(dst + i, v_r, v_g, v_b);
-                    }
-                } else {
-                    for ( ; i <= n - 12; i += 12, src += scn * 4)
-                    {
-                        v_float32x4 v_r;
-                        v_float32x4 v_g;
-                        v_float32x4 v_b;
-                        v_load_deinterleave(src, v_r, v_g, v_b);
-                        process(v_b, v_g, v_r, hscale);
-                        v_store_interleave(dst + i, v_b, v_g, v_r);
-                    }
-                }
-            } else { // scn == 4
-                if (bidx) {
-                    for ( ; i <= n - 12; i += 12, src += scn * 4)
-                    {
-                        v_float32x4 v_r;
-                        v_float32x4 v_g;
-                        v_float32x4 v_b;
-                        v_float32x4 v_a;
-                        v_load_deinterleave(src, v_r, v_g, v_b, v_a);
-                        process(v_r, v_g, v_b, hscale);
-                        v_store_interleave(dst + i, v_r, v_g, v_b);
-                    }
-                } else {
-                    for ( ; i <= n - 12; i += 12, src += scn * 4)
-                    {
-                        v_float32x4 v_r;
-                        v_float32x4 v_g;
-                        v_float32x4 v_b;
-                        v_float32x4 v_a;
-                        v_load_deinterleave(src, v_r, v_g, v_b, v_a);
-                        process(v_b, v_g, v_r, hscale);
-                        v_store_interleave(dst + i, v_b, v_g, v_r);
-                    }
-                }
-            }
-        }
-        #endif
-
-        for( ; i < n; i += 3, src += scn )
-        {
-            float b = src[bidx], g = src[1], r = src[bidx^2];
-            float h, s, v;
-
-            float vmin, diff;
-
-            v = vmin = r;
-            if( v < g ) v = g;
-            if( v < b ) v = b;
-            if( vmin > g ) vmin = g;
-            if( vmin > b ) vmin = b;
-
-            diff = v - vmin;
-            s = diff/(float)(fabs(v) + FLT_EPSILON);
-            diff = (float)(60./(diff + FLT_EPSILON));
-            if( v == r )
-                h = (g - b)*diff;
-            else if( v == g )
-                h = (b - r)*diff + 120.f;
-            else
-                h = (r - g)*diff + 240.f;
-
-            if( h < 0 ) h += 360.f;
-
-            dst[i] = h*hscale;
-            dst[i+1] = s;
-            dst[i+2] = v;
-        }
-    }
-
-    int srccn, blueIdx;
-    float hrange;
-    #if CV_SIMD128
-    bool hasSIMD;
-    #endif
-};
-
-
-#if CV_SIMD128
-inline void HSV2RGB_simd(v_float32x4& v_h, v_float32x4& v_s, v_float32x4& v_v, float hscale)
-{
-    v_h = v_h * v_setall_f32(hscale);
-    v_float32x4 v_pre_sector = v_cvt_f32(v_trunc(v_h));
-    v_h = v_h - v_pre_sector;
-    v_float32x4 v_tab0 = v_v;
-    v_float32x4 v_one = v_setall_f32(1.0f);
-    v_float32x4 v_tab1 = v_v * (v_one - v_s);
-    v_float32x4 v_tab2 = v_v * (v_one - (v_s * v_h));
-    v_float32x4 v_tab3 = v_v * (v_one - (v_s * (v_one - v_h)));
-
-    v_float32x4 v_one_sixth = v_setall_f32(1.0f / 6.0f);
-    v_float32x4 v_sector = v_pre_sector * v_one_sixth;
-    v_sector = v_cvt_f32(v_trunc(v_sector));
-    v_float32x4 v_six = v_setall_f32(6.0f);
-    v_sector = v_pre_sector - (v_sector * v_six);
-
-    v_float32x4 v_two = v_setall_f32(2.0f);
-    v_h = v_tab1 & (v_sector < v_two);
-    v_h = v_h | (v_tab3 & (v_sector == v_two));
-    v_float32x4 v_three = v_setall_f32(3.0f);
-    v_h = v_h | (v_tab0 & (v_sector == v_three));
-    v_float32x4 v_four = v_setall_f32(4.0f);
-    v_h = v_h | (v_tab0 & (v_sector == v_four));
-    v_h = v_h | (v_tab2 & (v_sector > v_four));
-
-    v_s = v_tab3 & (v_sector < v_one);
-    v_s = v_s | (v_tab0 & (v_sector == v_one));
-    v_s = v_s | (v_tab0 & (v_sector == v_two));
-    v_s = v_s | (v_tab2 & (v_sector == v_three));
-    v_s = v_s | (v_tab1 & (v_sector > v_three));
-
-    v_v = v_tab0 & (v_sector < v_one);
-    v_v = v_v | (v_tab2 & (v_sector == v_one));
-    v_v = v_v | (v_tab1 & (v_sector == v_two));
-    v_v = v_v | (v_tab1 & (v_sector == v_three));
-    v_v = v_v | (v_tab3 & (v_sector == v_four));
-    v_v = v_v | (v_tab0 & (v_sector > v_four));
-}
-#endif
-
-
-inline void HSV2RGB_native(const float* src, float* dst, const float hscale, const int bidx)
-{
-    float h = src[0], s = src[1], v = src[2];
-    float b, g, r;
-
-    if( s == 0 )
-        b = g = r = v;
-    else
-    {
-        static const int sector_data[][3]=
-            {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
-        float tab[4];
-        int sector;
-        h *= hscale;
-        if( h < 0 )
-            do h += 6; while( h < 0 );
-        else if( h >= 6 )
-            do h -= 6; while( h >= 6 );
-        sector = cvFloor(h);
-        h -= sector;
-        if( (unsigned)sector >= 6u )
-        {
-            sector = 0;
-            h = 0.f;
-        }
-
-        tab[0] = v;
-        tab[1] = v*(1.f - s);
-        tab[2] = v*(1.f - s*h);
-        tab[3] = v*(1.f - s*(1.f - h));
-
-        b = tab[sector_data[sector][0]];
-        g = tab[sector_data[sector][1]];
-        r = tab[sector_data[sector][2]];
-    }
-
-    dst[bidx] = b;
-    dst[1] = g;
-    dst[bidx^2] = r;
-}
-
 
-struct HSV2RGB_f
-{
-    typedef float channel_type;
-
-    HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)
-    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {
-        #if CV_SIMD128
-        hasSIMD = hasSIMD128();
-        #endif
-    }
-
-    void operator()(const float* src, float* dst, int n) const
-    {
-        int i = 0, bidx = blueIdx, dcn = dstcn;
-        n *= 3;
-
-        if (dcn == 3)
-        {
-            #if CV_SIMD128
-            if (hasSIMD)
-            {
-                for (; i <= n - 12; i += 12, dst += dcn * 4)
-                {
-                    v_float32x4 v_src[3];
-                    v_load_deinterleave(src + i, v_src[0], v_src[1], v_src[2]);
-                    HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale);
-                    v_store_interleave(dst, v_src[bidx], v_src[1], v_src[bidx^2]);
-                }
-            }
-            #endif
-            for( ; i < n; i += 3, dst += dcn )
-            {
-                HSV2RGB_native(src + i, dst, hscale, bidx);
-            }
-        } else { // dcn == 4
-            float alpha = ColorChannel<float>::max();
-            #if CV_SIMD128
-            if (hasSIMD)
-            {
-                for (; i <= n - 12; i += 12, dst += dcn * 4)
-                {
-                    v_float32x4 v_src[3];
-                    v_load_deinterleave(src + i, v_src[0], v_src[1], v_src[2]);
-                    HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale);
-                    v_float32x4 v_a = v_setall_f32(alpha);
-                    v_store_interleave(dst, v_src[bidx], v_src[1], v_src[bidx^2], v_a);
-                }
-            }
-            #endif
-            for( ; i < n; i += 3, dst += dcn )
-            {
-                HSV2RGB_native(src + i, dst, hscale, bidx);
-                dst[3] = alpha;
-            }
-        }
-    }
-
-    int dstcn, blueIdx;
-    float hscale;
-    #if CV_SIMD128
-    bool hasSIMD;
-    #endif
-};
-
-
-struct HSV2RGB_b
-{
-    typedef uchar channel_type;
-
-    HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange)
-    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.0f / _hrange)
-    {
-        #if CV_SIMD128
-        hasSIMD = hasSIMD128();
-        #endif
-    }
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        int j = 0, dcn = dstcn;
-        uchar alpha = ColorChannel<uchar>::max();
-
-        #if CV_SIMD128
-        if (hasSIMD)
-        {
-            for (j = 0; j <= (n - 16) * 3; j += 48, dst += dcn * 16)
-            {
-                v_uint8x16 h_b, s_b, v_b;
-                v_uint16x8 h_w[2], s_w[2], v_w[2];
-                v_uint32x4 h_u[4], s_u[4], v_u[4];
-                v_load_deinterleave(src + j, h_b, s_b, v_b);
-                v_expand(h_b, h_w[0], h_w[1]);
-                v_expand(s_b, s_w[0], s_w[1]);
-                v_expand(v_b, v_w[0], v_w[1]);
-                v_expand(h_w[0], h_u[0], h_u[1]);
-                v_expand(h_w[1], h_u[2], h_u[3]);
-                v_expand(s_w[0], s_u[0], s_u[1]);
-                v_expand(s_w[1], s_u[2], s_u[3]);
-                v_expand(v_w[0], v_u[0], v_u[1]);
-                v_expand(v_w[1], v_u[2], v_u[3]);
-
-                v_int32x4 b_i[4], g_i[4], r_i[4];
-                v_float32x4 v_coeff0 = v_setall_f32(1.0f / 255.0f);
-                v_float32x4 v_coeff1 = v_setall_f32(255.0f);
-
-                for( int k = 0; k < 4; k++ )
-                {
-                    v_float32x4 v_src[3];
-                    v_src[0] = v_cvt_f32(v_reinterpret_as_s32(h_u[k]));
-                    v_src[1] = v_cvt_f32(v_reinterpret_as_s32(s_u[k]));
-                    v_src[2] = v_cvt_f32(v_reinterpret_as_s32(v_u[k]));
-
-                    v_src[1] *= v_coeff0;
-                    v_src[2] *= v_coeff0;
-                    HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale);
-
-                    v_src[0] *= v_coeff1;
-                    v_src[1] *= v_coeff1;
-                    v_src[2] *= v_coeff1;
-                    b_i[k] = v_trunc(v_src[0]);
-                    g_i[k] = v_trunc(v_src[1]);
-                    r_i[k] = v_trunc(v_src[2]);
-                }
-
-                v_uint16x8 r_w[2], g_w[2], b_w[2];
-                v_uint8x16 r_b, g_b, b_b;
-
-                r_w[0] = v_pack_u(r_i[0], r_i[1]);
-                r_w[1] = v_pack_u(r_i[2], r_i[3]);
-                r_b = v_pack(r_w[0], r_w[1]);
-                g_w[0] = v_pack_u(g_i[0], g_i[1]);
-                g_w[1] = v_pack_u(g_i[2], g_i[3]);
-                g_b = v_pack(g_w[0], g_w[1]);
-                b_w[0] = v_pack_u(b_i[0], b_i[1]);
-                b_w[1] = v_pack_u(b_i[2], b_i[3]);
-                b_b = v_pack(b_w[0], b_w[1]);
-
-                if( dcn == 3 )
-                {
-                    if( blueIdx == 0 )
-                        v_store_interleave(dst, b_b, g_b, r_b);
-                    else
-                        v_store_interleave(dst, r_b, g_b, b_b);
-                }
-                else
-                {
-                    v_uint8x16 alpha_b = v_setall_u8(alpha);
-                    if( blueIdx == 0 )
-                        v_store_interleave(dst, b_b, g_b, r_b, alpha_b);
-                    else
-                        v_store_interleave(dst, r_b, g_b, b_b, alpha_b);
-                }
-            }
-        }
-        #endif
-        for( ; j < n * 3; j += 3, dst += dcn )
-        {
-            float buf[6];
-            buf[0] = src[j];
-            buf[1] = src[j+1] * (1.0f / 255.0f);
-            buf[2] = src[j+2] * (1.0f / 255.0f);
-            HSV2RGB_native(buf, buf + 3, hscale, blueIdx);
-            dst[0] = saturate_cast<uchar>(buf[3] * 255.0f);
-            dst[1] = saturate_cast<uchar>(buf[4] * 255.0f);
-            dst[2] = saturate_cast<uchar>(buf[5] * 255.0f);
-            if( dcn == 4 )
-                dst[3] = alpha;
-        }
-    }
-
-    int dstcn;
-    int blueIdx;
-    float hscale;
-    #if CV_SIMD128
-    bool hasSIMD;
-    #endif
-};
-
-
-///////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
-
-struct RGB2HLS_f
-{
-    typedef float channel_type;
-
-    RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)
-    : srccn(_srccn), blueIdx(_blueIdx), hscale(_hrange/360.f) {
-        #if CV_SIMD128
-        hasSIMD = hasSIMD128();
-        #endif
-    }
-
-    #if CV_SIMD128
-    inline void process(v_float32x4& v_r, v_float32x4& v_g,
-                        v_float32x4& v_b, v_float32x4& v_hscale) const
-    {
-        v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b);
-        v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b);
-
-        v_float32x4 v_diff = v_max_rgb - v_min_rgb;
-        v_float32x4 v_sum = v_max_rgb + v_min_rgb;
-        v_float32x4 v_half = v_setall_f32(0.5f);
-        v_float32x4 v_l = v_sum * v_half;
-
-        v_float32x4 v_s = v_diff / v_select(v_l < v_half, v_sum, v_setall_f32(2.0f) - v_sum);
-
-        v_float32x4 v_r_eq_max = v_max_rgb == v_r;
-        v_float32x4 v_g_eq_max = v_max_rgb == v_g;
-        v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b,
-                          v_select(v_g_eq_max, v_b - v_r, v_r - v_g));
-        v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f),
-                            v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f)));
-        v_float32x4 v_rev_diff = v_setall_f32(60.0f) / v_diff;
-        v_h = v_muladd(v_h, v_rev_diff, v_res) * v_hscale;
-
-        v_float32x4 v_diff_gt_eps = v_diff > v_setall_f32(FLT_EPSILON);
-        v_r = v_diff_gt_eps & v_h;
-        v_g = v_l;
-        v_b = v_diff_gt_eps & v_s;
-    }
-    #endif
-
-    void operator()(const float* src, float* dst, int n) const
-    {
-        int i = 0, bidx = blueIdx, scn = srccn;
-        n *= 3;
-
-        #if CV_SIMD128
-        if (hasSIMD)
-        {
-            v_float32x4 v_hscale = v_setall_f32(hscale);
-            if (scn == 3) {
-                if (bidx) {
-                    for ( ; i <= n - 12; i += 12, src += scn * 4)
-                    {
-                        v_float32x4 v_r;
-                        v_float32x4 v_g;
-                        v_float32x4 v_b;
-                        v_load_deinterleave(src, v_r, v_g, v_b);
-                        process(v_r, v_g, v_b, v_hscale);
-                        v_store_interleave(dst + i, v_r, v_g, v_b);
-                    }
-                } else {
-                    for ( ; i <= n - 12; i += 12, src += scn * 4)
-                    {
-                        v_float32x4 v_r;
-                        v_float32x4 v_g;
-                        v_float32x4 v_b;
-                        v_load_deinterleave(src, v_r, v_g, v_b);
-                        process(v_b, v_g, v_r, v_hscale);
-                        v_store_interleave(dst + i, v_b, v_g, v_r);
-                    }
-                }
-            } else { // scn == 4
-                if (bidx) {
-                    for ( ; i <= n - 12; i += 12, src += scn * 4)
-                    {
-                        v_float32x4 v_r;
-                        v_float32x4 v_g;
-                        v_float32x4 v_b;
-                        v_float32x4 v_a;
-                        v_load_deinterleave(src, v_r, v_g, v_b, v_a);
-                        process(v_r, v_g, v_b, v_hscale);
-                        v_store_interleave(dst + i, v_r, v_g, v_b);
-                    }
-                } else {
-                    for ( ; i <= n - 12; i += 12, src += scn * 4)
-                    {
-                        v_float32x4 v_r;
-                        v_float32x4 v_g;
-                        v_float32x4 v_b;
-                        v_float32x4 v_a;
-                        v_load_deinterleave(src, v_r, v_g, v_b, v_a);
-                        process(v_b, v_g, v_r, v_hscale);
-                        v_store_interleave(dst + i, v_b, v_g, v_r);
-                    }
-                }
-            }
-        }
-        #endif
-
-        for( ; i < n; i += 3, src += scn )
-        {
-            float b = src[bidx], g = src[1], r = src[bidx^2];
-            float h = 0.f, s = 0.f, l;
-            float vmin, vmax, diff;
-
-            vmax = vmin = r;
-            if( vmax < g ) vmax = g;
-            if( vmax < b ) vmax = b;
-            if( vmin > g ) vmin = g;
-            if( vmin > b ) vmin = b;
-
-            diff = vmax - vmin;
-            l = (vmax + vmin)*0.5f;
-
-            if( diff > FLT_EPSILON )
-            {
-                s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
-                diff = 60.f/diff;
-
-                if( vmax == r )
-                    h = (g - b)*diff;
-                else if( vmax == g )
-                    h = (b - r)*diff + 120.f;
-                else
-                    h = (r - g)*diff + 240.f;
-
-                if( h < 0.f ) h += 360.f;
-            }
-
-            dst[i] = h*hscale;
-            dst[i+1] = l;
-            dst[i+2] = s;
-        }
-    }
+#include "opencl_kernels_imgproc.hpp"
 
-    int srccn, blueIdx;
-    float hscale;
-    #if CV_SIMD128
-    bool hasSIMD;
-    #endif
-};
-
-
-struct RGB2HLS_b
-{
-    typedef uchar channel_type;
-
-    RGB2HLS_b(int _srccn, int _blueIdx, int _hrange)
-    : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange)
-    {
-        #if CV_NEON
-        v_scale_inv = vdupq_n_f32(1.f/255.f);
-        v_scale = vdupq_n_f32(255.f);
-        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
-        #elif CV_SSE2
-        v_scale_inv = _mm_set1_ps(1.f/255.f);
-        v_zero = _mm_setzero_si128();
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-        #endif
-    }
-
-    #if CV_SSE2
-    void process(const float * buf,
-                 __m128 & v_coeffs, uchar * dst) const
-    {
-        __m128 v_l0f = _mm_load_ps(buf);
-        __m128 v_l1f = _mm_load_ps(buf + 4);
-        __m128 v_u0f = _mm_load_ps(buf + 8);
-        __m128 v_u1f = _mm_load_ps(buf + 12);
-
-        v_l0f = _mm_mul_ps(v_l0f, v_coeffs);
-        v_u1f = _mm_mul_ps(v_u1f, v_coeffs);
-        v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92));
-        v_u0f = _mm_mul_ps(v_u0f, v_coeffs);
-        v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92));
-        v_l1f = _mm_mul_ps(v_l1f, v_coeffs);
-
-        __m128i v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
-        __m128i v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f));
-        __m128i v_l0 = _mm_packus_epi16(v_l, v_u);
-
-        _mm_storeu_si128((__m128i *)(dst), v_l0);
-    }
-    #endif
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        int i, j, scn = srccn;
-        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
-        #if CV_SSE2
-        __m128 v_coeffs = _mm_set_ps(1.f, 255.f, 255.f, 1.f);
-        #endif
-
-        for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
-        {
-            int dn = std::min(n - i, (int)BLOCK_SIZE);
-            j = 0;
-
-            #if CV_NEON
-            for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
-            {
-                uint16x8_t v_t0, v_t1, v_t2;
-
-                if (scn == 3)
-                {
-                    uint8x8x3_t v_src = vld3_u8(src);
-                    v_t0 = vmovl_u8(v_src.val[0]);
-                    v_t1 = vmovl_u8(v_src.val[1]);
-                    v_t2 = vmovl_u8(v_src.val[2]);
-                }
-                else
-                {
-                    uint8x8x4_t v_src = vld4_u8(src);
-                    v_t0 = vmovl_u8(v_src.val[0]);
-                    v_t1 = vmovl_u8(v_src.val[1]);
-                    v_t2 = vmovl_u8(v_src.val[2]);
-                }
-
-                float32x4x3_t v_dst;
-                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
-                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
-                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
-                vst3q_f32(buf + j, v_dst);
-
-                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
-                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
-                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
-                vst3q_f32(buf + j + 12, v_dst);
-            }
-            #elif CV_SSE2
-            if (scn == 3 && haveSIMD)
-            {
-                for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
-                {
-                    __m128i v_src = _mm_loadu_si128((__m128i const *)src);
-
-                    __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
-                    _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
-                    _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
-
-                    v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
-                    _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
-                    _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
-                }
-
-                int jr = j % 3;
-                if (jr)
-                    src -= jr, j -= jr;
-            }
-            else if (scn == 4 && haveSIMD)
-            {
-                for ( ; j <= (dn * 3 - 12); j += 12, src += 16)
-                {
-                    __m128i v_src = _mm_loadu_si128((__m128i const *)src);
-
-                    __m128i v_src_lo = _mm_unpacklo_epi8(v_src, v_zero);
-                    __m128i v_src_hi = _mm_unpackhi_epi8(v_src, v_zero);
-                    _mm_storeu_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_lo, v_zero)), v_scale_inv));
-                    _mm_storeu_ps(buf + j + 3, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_lo, v_zero)), v_scale_inv));
-                    _mm_storeu_ps(buf + j + 6, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_hi, v_zero)), v_scale_inv));
-                    float tmp = buf[j + 8];
-                    _mm_storeu_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_unpackhi_epi16(v_src_hi, v_zero), 0x90)), v_scale_inv));
-                    buf[j + 8] = tmp;
-                }
-
-                int jr = j % 3;
-                if (jr)
-                    src -= jr, j -= jr;
-            }
-            #endif
-            for( ; j < dn*3; j += 3, src += scn )
-            {
-                buf[j] = src[0]*(1.f/255.f);
-                buf[j+1] = src[1]*(1.f/255.f);
-                buf[j+2] = src[2]*(1.f/255.f);
-            }
-            cvt(buf, buf, dn);
-
-            j = 0;
-            #if CV_NEON
-            for ( ; j <= (dn - 8) * 3; j += 24)
-            {
-                float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
-
-                uint8x8x3_t v_dst;
-                v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])),
-                                                       vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0]))));
-                v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
-                                                       vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
-                v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
-                                                       vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
-                vst3_u8(dst + j, v_dst);
-            }
-            #elif CV_SSE2
-            if (haveSIMD)
-            {
-                for ( ; j <= (dn - 16) * 3; j += 48)
-                {
-                    process(buf + j,
-                            v_coeffs, dst + j);
-
-                    process(buf + j + 16,
-                            v_coeffs, dst + j + 16);
-
-                    process(buf + j + 32,
-                            v_coeffs, dst + j + 32);
-                }
-            }
-            #endif
-            for( ; j < dn*3; j += 3 )
-            {
-                dst[j] = saturate_cast<uchar>(buf[j]);
-                dst[j+1] = saturate_cast<uchar>(buf[j+1]*255.f);
-                dst[j+2] = saturate_cast<uchar>(buf[j+2]*255.f);
-            }
-        }
-    }
-
-    int srccn;
-    RGB2HLS_f cvt;
-    #if CV_NEON
-    float32x4_t v_scale, v_scale_inv;
-    uint8x8_t v_alpha;
-    #elif CV_SSE2
-    __m128 v_scale_inv;
-    __m128i v_zero;
-    bool haveSIMD;
-    #endif
-};
-
-
-struct HLS2RGB_f
-{
-    typedef float channel_type;
-
-    HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
-    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {
-        #if CV_SIMD128
-        hasSIMD = hasSIMD128();
-        #endif
-    }
-
-    #if CV_SIMD128
-    inline void process(v_float32x4& v_h, v_float32x4& v_l, v_float32x4& v_s) const
-    {
-        v_float32x4 v_one = v_setall_f32(1.0f);
-
-        v_float32x4 v_l_le_half = v_l <= v_setall_f32(0.5f);
-        v_float32x4 v_ls = v_l * v_s;
-        v_float32x4 v_elem0 = v_select(v_l_le_half, v_ls, v_s - v_ls);
-
-        v_float32x4 v_hs_raw = v_h * v_setall_f32(hscale);
-        v_float32x4 v_pre_hs = v_cvt_f32(v_trunc(v_hs_raw));
-        v_float32x4 v_hs = v_hs_raw - v_pre_hs;
-        v_float32x4 v_sector = v_pre_hs - v_setall_f32(6.0f) * v_cvt_f32(v_trunc(v_hs_raw * v_setall_f32(1.0f / 6.0f)));
-        v_float32x4 v_elem1 = v_hs + v_hs;
-
-        v_float32x4 v_tab0 = v_l + v_elem0;
-        v_float32x4 v_tab1 = v_l - v_elem0;
-        v_float32x4 v_tab2 = v_l + v_elem0 - v_elem0 * v_elem1;
-        v_float32x4 v_tab3 = v_l - v_elem0 + v_elem0 * v_elem1;
-
-        v_float32x4 v_two  = v_setall_f32(2.0f);
-        v_float32x4 v_four = v_setall_f32(4.0f);
-
-        v_h = v_select(v_sector <  v_two , v_tab1,
-              v_select(v_sector <= v_two , v_tab3,
-              v_select(v_sector <= v_four, v_tab0, v_tab2)));
-
-        v_l = v_select(v_sector <  v_one , v_tab3,
-              v_select(v_sector <= v_two , v_tab0,
-              v_select(v_sector <  v_four, v_tab2, v_tab1)));
-
-        v_s = v_select(v_sector <  v_one , v_tab0,
-              v_select(v_sector <  v_two , v_tab2,
-              v_select(v_sector <  v_four, v_tab1,
-              v_select(v_sector <= v_four, v_tab3, v_tab0))));
-    }
-    #endif
-
-    void operator()(const float* src, float* dst, int n) const
-    {
-        int i = 0, bidx = blueIdx, dcn = dstcn;
-        float alpha = ColorChannel<float>::max();
-        n *= 3;
-
-        #if CV_SIMD128
-        if (hasSIMD)
-        {
-            if (dcn == 3)
-            {
-                if (bidx)
-                {
-                    for (; i <= n - 12; i += 12, dst += dcn * 4)
-                    {
-                        v_float32x4 v_h;
-                        v_float32x4 v_l;
-                        v_float32x4 v_s;
-                        v_load_deinterleave(src + i, v_h, v_l, v_s);
-                        process(v_h, v_l, v_s);
-                        v_store_interleave(dst, v_s, v_l, v_h);
-                    }
-                } else {
-                    for (; i <= n - 12; i += 12, dst += dcn * 4)
-                    {
-                        v_float32x4 v_h;
-                        v_float32x4 v_l;
-                        v_float32x4 v_s;
-                        v_load_deinterleave(src + i, v_h, v_l, v_s);
-                        process(v_h, v_l, v_s);
-                        v_store_interleave(dst, v_h, v_l, v_s);
-                    }
-                }
-            } else { // dcn == 4
-                if (bidx)
-                {
-                    for (; i <= n - 12; i += 12, dst += dcn * 4)
-                    {
-                        v_float32x4 v_h;
-                        v_float32x4 v_l;
-                        v_float32x4 v_s;
-                        v_load_deinterleave(src + i, v_h, v_l, v_s);
-                        process(v_h, v_l, v_s);
-                        v_float32x4 v_a = v_setall_f32(alpha);
-                        v_store_interleave(dst, v_s, v_l, v_h, v_a);
-                    }
-                } else {
-                    for (; i <= n - 12; i += 12, dst += dcn * 4)
-                    {
-                        v_float32x4 v_h;
-                        v_float32x4 v_l;
-                        v_float32x4 v_s;
-                        v_load_deinterleave(src + i, v_h, v_l, v_s);
-                        process(v_h, v_l, v_s);
-                        v_float32x4 v_a = v_setall_f32(alpha);
-                        v_store_interleave(dst, v_h, v_l, v_s, v_a);
-                    }
-                }
-            }
-        }
-        #endif
-
-        for( ; i < n; i += 3, dst += dcn )
-        {
-            float h = src[i], l = src[i+1], s = src[i+2];
-            float b, g, r;
-
-            if( s == 0 )
-                b = g = r = l;
-            else
-            {
-                static const int sector_data[][3]=
-                {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
-                float tab[4];
-                int sector;
-
-                float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
-                float p1 = 2*l - p2;
-
-                h *= hscale;
-                if( h < 0 )
-                    do h += 6; while( h < 0 );
-                else if( h >= 6 )
-                    do h -= 6; while( h >= 6 );
-
-                assert( 0 <= h && h < 6 );
-                sector = cvFloor(h);
-                h -= sector;
-
-                tab[0] = p2;
-                tab[1] = p1;
-                tab[2] = p1 + (p2 - p1)*(1-h);
-                tab[3] = p1 + (p2 - p1)*h;
-
-                b = tab[sector_data[sector][0]];
-                g = tab[sector_data[sector][1]];
-                r = tab[sector_data[sector][2]];
-            }
-
-            dst[bidx] = b;
-            dst[1] = g;
-            dst[bidx^2] = r;
-            if( dcn == 4 )
-                dst[3] = alpha;
-        }
-    }
-
-    int dstcn, blueIdx;
-    float hscale;
-    #if CV_SIMD128
-    bool hasSIMD;
-    #endif
-};
-
-
-struct HLS2RGB_b
-{
-    typedef uchar channel_type;
-
-    HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange)
-    : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
-    {
-        #if CV_NEON
-        v_scale_inv = vdupq_n_f32(1.f/255.f);
-        v_scale = vdupq_n_f32(255.f);
-        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
-        #elif CV_SSE2
-        v_scale = _mm_set1_ps(255.f);
-        v_alpha = _mm_set1_ps(ColorChannel<uchar>::max());
-        v_zero = _mm_setzero_si128();
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-        #endif
-    }
-
-    #if CV_SSE2
-    void process(__m128i v_r, __m128i v_g, __m128i v_b,
-                 const __m128& v_coeffs_,
-                 float * buf) const
-    {
-        __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
-        __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
-        __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
-
-        __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
-        __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
-        __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
-
-        __m128 v_coeffs = v_coeffs_;
-
-        v_r0 = _mm_mul_ps(v_r0, v_coeffs);
-        v_g1 = _mm_mul_ps(v_g1, v_coeffs);
-
-        v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
-
-        v_r1 = _mm_mul_ps(v_r1, v_coeffs);
-        v_b0 = _mm_mul_ps(v_b0, v_coeffs);
-
-        v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
-
-        v_g0 = _mm_mul_ps(v_g0, v_coeffs);
-        v_b1 = _mm_mul_ps(v_b1, v_coeffs);
-
-        _mm_store_ps(buf, v_r0);
-        _mm_store_ps(buf + 4, v_r1);
-        _mm_store_ps(buf + 8, v_g0);
-        _mm_store_ps(buf + 12, v_g1);
-        _mm_store_ps(buf + 16, v_b0);
-        _mm_store_ps(buf + 20, v_b1);
-    }
-    #endif
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        int i, j, dcn = dstcn;
-        uchar alpha = ColorChannel<uchar>::max();
-        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
-        #if CV_SSE2
-        __m128 v_coeffs = _mm_set_ps(1.f, 1.f/255.f, 1.f/255.f, 1.f);
-        #endif
-
-        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
-        {
-            int dn = std::min(n - i, (int)BLOCK_SIZE);
-            j = 0;
-
-            #if CV_NEON
-            for ( ; j <= (dn - 8) * 3; j += 24)
-            {
-                uint8x8x3_t v_src = vld3_u8(src + j);
-                uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
-                           v_t1 = vmovl_u8(v_src.val[1]),
-                           v_t2 = vmovl_u8(v_src.val[2]);
-
-                float32x4x3_t v_dst;
-                v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
-                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
-                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
-                vst3q_f32(buf + j, v_dst);
-
-                v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
-                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
-                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
-                vst3q_f32(buf + j + 12, v_dst);
-            }
-            #elif CV_SSE2
-            if (haveSIMD)
-            {
-                for ( ; j <= (dn - 8) * 3; j += 24)
-                {
-                    __m128i v_src0 = _mm_loadu_si128((__m128i const *)(src + j));
-                    __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src + j + 16));
-
-                    process(_mm_unpacklo_epi8(v_src0, v_zero),
-                            _mm_unpackhi_epi8(v_src0, v_zero),
-                            _mm_unpacklo_epi8(v_src1, v_zero),
-                            v_coeffs,
-                            buf + j);
-                }
-            }
-            #endif
-            for( ; j < dn*3; j += 3 )
-            {
-                buf[j] = src[j];
-                buf[j+1] = src[j+1]*(1.f/255.f);
-                buf[j+2] = src[j+2]*(1.f/255.f);
-            }
-            cvt(buf, buf, dn);
-
-            j = 0;
-            #if CV_NEON
-            for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
-            {
-                float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
-                uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
-                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
-                uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
-                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
-                uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
-                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
-
-                if (dcn == 4)
-                {
-                    uint8x8x4_t v_dst;
-                    v_dst.val[0] = v_dst0;
-                    v_dst.val[1] = v_dst1;
-                    v_dst.val[2] = v_dst2;
-                    v_dst.val[3] = v_alpha;
-                    vst4_u8(dst, v_dst);
-                }
-                else
-                {
-                    uint8x8x3_t v_dst;
-                    v_dst.val[0] = v_dst0;
-                    v_dst.val[1] = v_dst1;
-                    v_dst.val[2] = v_dst2;
-                    vst3_u8(dst, v_dst);
-                }
-            }
-            #elif CV_SSE2
-            if (dcn == 3 && haveSIMD)
-            {
-                for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
-                {
-                    __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
-                    __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
-                    __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
-                    __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
-
-                    __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
-                                                     _mm_cvtps_epi32(v_src1));
-                    __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
-                                                     _mm_cvtps_epi32(v_src3));
-
-                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
-                }
-
-                int jr = j % 3;
-                if (jr)
-                    dst -= jr, j -= jr;
-            }
-            else if (dcn == 4 && haveSIMD)
-            {
-                for ( ; j <= (dn * 3 - 12); j += 12, dst += 16)
-                {
-                    __m128 v_buf0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
-                    __m128 v_buf1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
-                    __m128 v_buf2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
-
-                    __m128 v_ba0 = _mm_unpackhi_ps(v_buf0, v_alpha);
-                    __m128 v_ba1 = _mm_unpacklo_ps(v_buf2, v_alpha);
-
-                    __m128i v_src0 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf0, v_ba0, 0x44));
-                    __m128i v_src1 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba0, v_buf1, 0x4e)), 0x78);
-                    __m128i v_src2 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf1, v_ba1, 0x4e));
-                    __m128i v_src3 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba1, v_buf2, 0xee)), 0x78);
-
-                    __m128i v_dst0 = _mm_packs_epi32(v_src0, v_src1);
-                    __m128i v_dst1 = _mm_packs_epi32(v_src2, v_src3);
-
-                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
-                }
+#include "color.hpp"
 
-                int jr = j % 3;
-                if (jr)
-                    dst -= jr, j -= jr;
-            }
-            #endif
+#include "color_hsv.simd.hpp"
+#include "color_hsv.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
 
-            for( ; j < dn*3; j += 3, dst += dcn )
-            {
-                dst[0] = saturate_cast<uchar>(buf[j]*255.f);
-                dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
-                dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
-                if( dcn == 4 )
-                    dst[3] = alpha;
-            }
-        }
-    }
-
-    int dstcn;
-    HLS2RGB_f cvt;
-    #if CV_NEON
-    float32x4_t v_scale, v_scale_inv;
-    uint8x8_t v_alpha;
-    #elif CV_SSE2
-    __m128 v_scale;
-    __m128 v_alpha;
-    __m128i v_zero;
-    bool haveSIMD;
-    #endif
-};
+namespace cv {
 
 //
 // IPP functions
@@ -1302,29 +123,15 @@ void cvtBGRtoHSV(const uchar * src_data, size_t src_step,
     }
 #endif
 
-    int hrange = depth == CV_32F ? 360 : isFullRange ? 256 : 180;
-    int blueIdx = swapBlue ? 2 : 0;
-    if(isHSV)
-    {
-        if(depth == CV_8U)
-            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_b(scn, blueIdx, hrange));
-        else
-            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_f(scn, blueIdx, static_cast<float>(hrange)));
-    }
-    else
-    {
-        if( depth == CV_8U )
-            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_b(scn, blueIdx, hrange));
-        else
-            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_f(scn, blueIdx, static_cast<float>(hrange)));
-    }
+    CV_CPU_DISPATCH(cvtBGRtoHSV, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 // 8u, 32f
 void cvtHSVtoBGR(const uchar * src_data, size_t src_step,
-                        uchar * dst_data, size_t dst_step,
-                        int width, int height,
-                        int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV)
+                 uchar * dst_data, size_t dst_step,
+                 int width, int height,
+                 int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV)
 {
     CV_INSTRUMENT_REGION();
 
@@ -1393,22 +200,8 @@ void cvtHSVtoBGR(const uchar * src_data, size_t src_step,
     }
 #endif
 
-    int hrange = depth == CV_32F ? 360 : isFullRange ? 255 : 180;
-    int blueIdx = swapBlue ? 2 : 0;
-    if(isHSV)
-    {
-        if( depth == CV_8U )
-            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_b(dcn, blueIdx, hrange));
-        else
-            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_f(dcn, blueIdx, static_cast<float>(hrange)));
-    }
-    else
-    {
-        if( depth == CV_8U )
-            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_b(dcn, blueIdx, hrange));
-        else
-            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_f(dcn, blueIdx, static_cast<float>(hrange)));
-    }
+    CV_CPU_DISPATCH(cvtHSVtoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isFullRange, isHSV),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 } // namespace hal
diff --git a/modules/imgproc/src/color_hsv.simd.hpp b/modules/imgproc/src/color_hsv.simd.hpp
index f0a4c87558..30ae7064bc 100644
--- a/modules/imgproc/src/color_hsv.simd.hpp
+++ b/modules/imgproc/src/color_hsv.simd.hpp
@@ -3,11 +3,31 @@
 // of this distribution and at http://opencv.org/license.html
 
 #include "precomp.hpp"
-#include "color.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
-namespace cv
-{
+namespace cv {
+namespace hal {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+// forward declarations (definitions follow unless CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY is defined)
+
+void cvtBGRtoHSV(const uchar * src_data, size_t src_step,
+                 uchar * dst_data, size_t dst_step,
+                 int width, int height,
+                 int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV);
+void cvtHSVtoBGR(const uchar * src_data, size_t src_step,
+                 uchar * dst_data, size_t dst_step,
+                 int width, int height,
+                 int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV);
+
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+#if defined(CV_CPU_BASELINE_MODE)
+// color.simd_helpers.hpp is already included in color.hpp
+#else
+#include "color.simd_helpers.hpp"
+#endif
 
+namespace {
 ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
 
 
@@ -1192,46 +1212,7 @@ struct HLS2RGB_b
     #endif
 };
 
-//
-// IPP functions
-//
-
-#if NEED_IPP
-
-#if !IPP_DISABLE_RGB_HSV
-static ippiGeneralFunc ippiRGB2HSVTab[] =
-{
-    (ippiGeneralFunc)ippiRGBToHSV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHSV_16u_C3R, 0,
-    0, 0, 0, 0
-};
-#endif
-
-static ippiGeneralFunc ippiHSV2RGBTab[] =
-{
-    (ippiGeneralFunc)ippiHSVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHSVToRGB_16u_C3R, 0,
-    0, 0, 0, 0
-};
-
-static ippiGeneralFunc ippiRGB2HLSTab[] =
-{
-    (ippiGeneralFunc)ippiRGBToHLS_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHLS_16u_C3R, 0,
-    0, (ippiGeneralFunc)ippiRGBToHLS_32f_C3R, 0, 0
-};
-
-static ippiGeneralFunc ippiHLS2RGBTab[] =
-{
-    (ippiGeneralFunc)ippiHLSToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHLSToRGB_16u_C3R, 0,
-    0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0
-};
-
-#endif
-
-//
-// HAL functions
-//
-
-namespace hal
-{
+} // anonymous namespace
 
 // 8u, 32f
 void cvtBGRtoHSV(const uchar * src_data, size_t src_step,
@@ -1241,67 +1222,6 @@ void cvtBGRtoHSV(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtBGRtoHSV, cv_hal_cvtBGRtoHSV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV);
-
-#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
-    CV_IPP_CHECK()
-    {
-        if(depth == CV_8U && isFullRange)
-        {
-            if (isHSV)
-            {
-#if !IPP_DISABLE_RGB_HSV // breaks OCL accuracy tests
-                if(scn == 3 && !swapBlue)
-                {
-                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height,
-                                            IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
-                        return;
-                }
-                else if(scn == 4 && !swapBlue)
-                {
-                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                        IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
-                        return;
-                }
-                else if(scn == 4 && swapBlue)
-                {
-                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                        IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 0, 1, 2, depth)) )
-                        return;
-                }
-#endif
-            }
-            else
-            {
-                if(scn == 3 && !swapBlue)
-                {
-                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height,
-                                            IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
-                        return;
-                }
-                else if(scn == 4 && !swapBlue)
-                {
-                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                        IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
-                        return;
-                }
-                else if(scn == 3 && swapBlue)
-                {
-                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height,
-                                            IPPGeneralFunctor(ippiRGB2HLSTab[depth])) )
-                        return;
-                }
-                else if(scn == 4 && swapBlue)
-                {
-                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                        IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 0, 1, 2, depth)) )
-                        return;
-                }
-            }
-        }
-    }
-#endif
-
     int hrange = depth == CV_32F ? 360 : isFullRange ? 256 : 180;
     int blueIdx = swapBlue ? 2 : 0;
     if(isHSV)
@@ -1322,77 +1242,12 @@ void cvtBGRtoHSV(const uchar * src_data, size_t src_step,
 
 // 8u, 32f
 void cvtHSVtoBGR(const uchar * src_data, size_t src_step,
-                        uchar * dst_data, size_t dst_step,
-                        int width, int height,
-                        int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV)
+                 uchar * dst_data, size_t dst_step,
+                 int width, int height,
+                 int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV)
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtHSVtoBGR, cv_hal_cvtHSVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isFullRange, isHSV);
-
-#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
-    CV_IPP_CHECK()
-    {
-        if (depth == CV_8U && isFullRange)
-        {
-            if (isHSV)
-            {
-                if(dcn == 3 && !swapBlue)
-                {
-                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
-                                            IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
-                        return;
-                }
-                else if(dcn == 4 && !swapBlue)
-                {
-                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                        IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
-                        return;
-                }
-                else if(dcn == 3 && swapBlue)
-                {
-                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
-                                            IPPGeneralFunctor(ippiHSV2RGBTab[depth])) )
-                        return;
-                }
-                else if(dcn == 4 && swapBlue)
-                {
-                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                        IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
-                        return;
-                }
-            }
-            else
-            {
-                if(dcn == 3 && !swapBlue)
-                {
-                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
-                                            IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
-                        return;
-                }
-                else if(dcn == 4 && !swapBlue)
-                {
-                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                        IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
-                        return;
-                }
-                else if(dcn == 3 && swapBlue)
-                {
-                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
-                                            IPPGeneralFunctor(ippiHLS2RGBTab[depth])) )
-                        return;
-                }
-                else if(dcn == 4 && swapBlue)
-                {
-                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                        IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
-                        return;
-                }
-            }
-        }
-    }
-#endif
-
     int hrange = depth == CV_32F ? 360 : isFullRange ? 255 : 180;
     int blueIdx = swapBlue ? 2 : 0;
     if(isHSV)
@@ -1411,155 +1266,6 @@ void cvtHSVtoBGR(const uchar * src_data, size_t src_step,
     }
 }
 
-} // namespace hal
-
-//
-// OCL calls
-//
-
-#ifdef HAVE_OPENCL
-
-bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full )
-{
-    OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > h(_src, _dst, dcn);
-
-    int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255);
-
-    if(!h.createKernel("HSV2RGB", ocl::imgproc::color_hsv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full )
-{
-    OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > h(_src, _dst, dcn);
-
-    int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255);
-
-    if(!h.createKernel("HLS2RGB", ocl::imgproc::color_hsv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full )
-{
-    OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > h(_src, _dst, 3);
-
-    float hscale = (_src.depth() == CV_32F ? 360.f : (!full ? 180.f : 256.f))/360.f;
-
-    if(!h.createKernel("RGB2HLS", ocl::imgproc::color_hsv_oclsrc,
-                       format("-D hscale=%ff -D bidx=%d -D dcn=3", hscale, bidx)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full )
-{
-    OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > h(_src, _dst, 3);
-
-    int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 256);
-
-    cv::String options = (_src.depth() == CV_8U ?
-                          format("-D hrange=%d -D bidx=%d -D dcn=3", hrange, bidx) :
-                          format("-D hscale=%ff -D bidx=%d -D dcn=3", hrange*(1.f/360.f), bidx));
-
-    if(!h.createKernel("RGB2HSV", ocl::imgproc::color_hsv_oclsrc, options))
-    {
-        return false;
-    }
-
-    if(_src.depth() == CV_8U)
-    {
-        static UMat sdiv_data;
-        static UMat hdiv_data180;
-        static UMat hdiv_data256;
-        static int sdiv_table[256];
-        static int hdiv_table180[256];
-        static int hdiv_table256[256];
-        static volatile bool initialized180 = false, initialized256 = false;
-        volatile bool & initialized = hrange == 180 ? initialized180 : initialized256;
-
-        if (!initialized)
-        {
-            int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12;
-            UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256;
-
-            sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
-
-            int v = 255 << hsv_shift;
-            if (!initialized180 && !initialized256)
-            {
-                for(int i = 1; i < 256; i++ )
-                    sdiv_table[i] = saturate_cast<int>(v/(1.*i));
-                Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data);
-            }
-
-            v = hrange << hsv_shift;
-            for (int i = 1; i < 256; i++ )
-                hdiv_table[i] = saturate_cast<int>(v/(6.*i));
-
-            Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data);
-            initialized = true;
-        }
-
-        h.setArg(ocl::KernelArg::PtrReadOnly(sdiv_data));
-        h.setArg(hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) :
-                                 ocl::KernelArg::PtrReadOnly(hdiv_data180));
-    }
-
-    return h.run();
-}
-
 #endif
-
-//
-// HAL calls
-//
-
-void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange )
-{
-    CvtHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > h(_src, _dst, 3);
-
-    hal::cvtBGRtoHSV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                     h.depth, h.scn, swapb, fullRange, false);
-}
-
-void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange )
-{
-    CvtHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > h(_src, _dst, 3);
-
-    hal::cvtBGRtoHSV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                     h.depth, h.scn, swapb, fullRange, true);
-}
-
-void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange)
-{
-    if(dcn <= 0) dcn = 3;
-    CvtHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > h(_src, _dst, dcn);
-
-    hal::cvtHSVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                     h.depth, dcn, swapb, fullRange, false);
-}
-
-void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange)
-{
-    if(dcn <= 0) dcn = 3;
-    CvtHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > h(_src, _dst, dcn);
-
-    hal::cvtHSVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                     h.depth, dcn, swapb, fullRange, true);
-}
-
-
-} // namespace cv
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}} // namespace cv::hal
diff --git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp
index 0fff89358c..cb5c0fdf53 100644
--- a/modules/imgproc/src/color_lab.cpp
+++ b/modules/imgproc/src/color_lab.cpp
@@ -9,6 +9,10 @@
 \**********************************************************************************/
 
 #include "precomp.hpp"
+#include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+#include "opencv2/core/softfloat.hpp"
+
 #include "color.hpp"
 
 using cv::softfloat;
diff --git a/modules/imgproc/src/color_rgb.dispatch.cpp b/modules/imgproc/src/color_rgb.dispatch.cpp
index 9245f26d05..ed2961f0fb 100644
--- a/modules/imgproc/src/color_rgb.dispatch.cpp
+++ b/modules/imgproc/src/color_rgb.dispatch.cpp
@@ -3,1047 +3,16 @@
 // of this distribution and at http://opencv.org/license.html
 
 #include "precomp.hpp"
-#include "color.hpp"
-
-#define IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 1
-
-namespace cv
-{
-
-////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
-
-template<typename _Tp> struct v_type;
-
-template<>
-struct v_type<uchar>{
-    typedef v_uint8 t;
-};
-
-template<>
-struct v_type<ushort>{
-    typedef v_uint16 t;
-};
-
-template<>
-struct v_type<float>{
-    typedef v_float32 t;
-};
-
-template<typename _Tp> struct v_set;
-
-template<>
-struct v_set<uchar>
-{
-    static inline v_type<uchar>::t set(uchar x)
-    {
-        return vx_setall_u8(x);
-    }
-};
-
-template<>
-struct v_set<ushort>
-{
-    static inline v_type<ushort>::t set(ushort x)
-    {
-        return vx_setall_u16(x);
-    }
-};
-
-template<>
-struct v_set<float>
-{
-    static inline v_type<float>::t set(float x)
-    {
-        return vx_setall_f32(x);
-    }
-};
-
-template<typename _Tp>
-struct RGB2RGB
-{
-    typedef _Tp channel_type;
-    typedef typename v_type<_Tp>::t vt;
-
-    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) :
-        srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx)
-    {
-        CV_Assert(srccn == 3 || srccn == 4);
-        CV_Assert(dstcn == 3 || dstcn == 4);
-    }
-
-    void operator()(const _Tp* src, _Tp* dst, int n) const
-    {
-        int scn = srccn, dcn = dstcn, bi = blueIdx;
-        int i = 0;
-        _Tp alphav = ColorChannel<_Tp>::max();
-
-#if CV_SIMD
-        const int vsize = vt::nlanes;
-
-        for(; i <= n-vsize;
-            i += vsize, src += vsize*scn, dst += vsize*dcn)
-        {
-            vt a, b, c, d;
-            if(scn == 4)
-            {
-                v_load_deinterleave(src, a, b, c, d);
-            }
-            else
-            {
-                v_load_deinterleave(src, a, b, c);
-                d = v_set<_Tp>::set(alphav);
-            }
-            if(bi == 2)
-                swap(a, c);
-
-            if(dcn == 4)
-            {
-                v_store_interleave(dst, a, b, c, d);
-            }
-            else
-            {
-                v_store_interleave(dst, a, b, c);
-            }
-        }
-        vx_cleanup();
-#endif
-        for ( ; i < n; i++, src += scn, dst += dcn )
-        {
-            _Tp t0 = src[0], t1 = src[1], t2 = src[2];
-            dst[bi  ] = t0;
-            dst[1]    = t1;
-            dst[bi^2] = t2;
-            if(dcn == 4)
-            {
-                _Tp d = scn == 4 ? src[3] : alphav;
-                dst[3] = d;
-            }
-        }
-    }
-
-    int srccn, dstcn, blueIdx;
-};
-
-
-/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
-
-struct RGB5x52RGB
-{
-    typedef uchar channel_type;
-
-    RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits)
-        : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits)
-    { }
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        int dcn = dstcn, bidx = blueIdx, gb = greenBits;
-        int i = 0;
-
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
-        v_uint8 vz = vx_setzero_u8(), vn0 = vx_setall_u8(255);
-        for(; i <= n-vsize;
-            i += vsize, src += vsize*sizeof(ushort), dst += vsize*dcn)
-        {
-            v_uint16 t0 = v_reinterpret_as_u16(vx_load(src));
-            v_uint16 t1 = v_reinterpret_as_u16(vx_load(src +
-                                                       sizeof(ushort)*v_uint16::nlanes));
-
-            //TODO: shorten registers use when v_interleave is available
-            v_uint8 r, g, b, a;
-            v_uint16 b0 = (t0 << 11) >> 8;
-            v_uint16 b1 = (t1 << 11) >> 8;
-            b = v_pack(b0, b1);
-
-            v_uint16 g0, g1, r0, r1, a0, a1;
-
-            if( gb == 6 )
-            {
-                g0 = ((t0 >> 5) << 10) >> 8;
-                g1 = ((t1 >> 5) << 10) >> 8;
-
-                r0 = (t0 >> 11) << 3;
-                r1 = (t1 >> 11) << 3;
-
-                a = vn0;
-            }
-            else
-            {
-                g0 = ((t0 >> 5) << 11) >> 8;
-                g1 = ((t1 >> 5) << 11) >> 8;
-
-                r0 = ((t0 >> 10) << 11) >> 8;
-                r1 = ((t1 >> 10) << 11) >> 8;
-
-                a0 = t0 >> 15;
-                a1 = t1 >> 15;
-                a = v_pack(a0, a1);
-                a = a != vz;
-            }
-            g = v_pack(g0, g1);
-            r = v_pack(r0, r1);
-
-            if(bidx == 2)
-                swap(b, r);
-
-            if(dcn == 4)
-            {
-                v_store_interleave(dst, b, g, r, a);
-            }
-            else
-            {
-                v_store_interleave(dst, b, g, r);
-            }
-        }
-        vx_cleanup();
-#endif
-
-        for( ; i < n; i++, src += sizeof(ushort), dst += dcn )
-        {
-            unsigned t = ((const ushort*)src)[0];
-            uchar b, g, r, a;
-
-            b = (uchar)(t << 3);
-
-            if( gb == 6 )
-            {
-                g = (uchar)((t >> 3) & ~3);
-                r = (uchar)((t >> 8) & ~7);
-                a = 255;
-            }
-            else
-            {
-                g = (uchar)((t >> 2) & ~7);
-                r = (uchar)((t >> 7) & ~7);
-                a = (uchar)(((t & 0x8000) >> 15) * 255);
-            }
-
-            dst[bidx]     = b;
-            dst[1]        = g;
-            dst[bidx ^ 2] = r;
-            if( dcn == 4 )
-                dst[3] = a;
-        }
-    }
-
-    int dstcn, blueIdx, greenBits;
-};
-
-
-struct RGB2RGB5x5
-{
-    typedef uchar channel_type;
-
-    RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits)
-        : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits)
-    { }
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        int scn = srccn, bidx = blueIdx, gb = greenBits;
-        int i = 0;
-
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
-        v_uint16 vn3 = vx_setall_u16((ushort)(~3));
-        v_uint16 vn7 = vx_setall_u16((ushort)(~7));
-        v_uint16 vz = vx_setzero_u16();
-        v_uint8 v7 = vx_setall_u8((uchar)(~7));
-        for(; i <= n-vsize;
-            i += vsize, src += vsize*scn, dst += vsize*sizeof(ushort))
-        {
-            v_uint8 r, g, b, a;
-            if(scn == 3)
-            {
-                v_load_deinterleave(src, b, g, r);
-                a = vx_setzero_u8();
-            }
-            else
-            {
-                v_load_deinterleave(src, b, g, r, a);
-            }
-            if(bidx == 2)
-                swap(b, r);
-
-            r = r & v7;
-
-            //TODO: shorten registers use when v_deinterleave is available
-            v_uint16 r0, r1, g0, g1, b0, b1, a0, a1;
-            v_expand(r, r0, r1);
-            v_expand(g, g0, g1);
-            v_expand(b, b0, b1);
-            v_expand(a, a0, a1);
-
-            v_uint16 d0, d1;
-
-            b0 = b0 >> 3;
-            b1 = b1 >> 3;
-            a0 = (a0 != vz) << 15;
-            a1 = (a1 != vz) << 15;
-
-            if(gb == 6)
-            {
-                d0 = b0 | ((g0 & vn3) << 3) | (r0 << 8);
-                d1 = b1 | ((g1 & vn3) << 3) | (r1 << 8);
-            }
-            else
-            {
-                d0 = b0 | ((g0 & vn7) << 2) | (r0 << 7) | a0;
-                d1 = b1 | ((g1 & vn7) << 2) | (r1 << 7) | a1;
-            }
-
-            v_store((ushort*)dst, d0);
-            v_store(((ushort*)dst) + vsize/2, d1);
-        }
-        vx_cleanup();
-#endif
-        for ( ; i < n; i++, src += scn, dst += sizeof(ushort) )
-        {
-            uchar r = src[bidx^2];
-            uchar g = src[1];
-            uchar b = src[bidx];
-            uchar a = scn == 4 ? src[3] : 0;
-
-            ushort d;
-            if (gb == 6)
-            {
-                d = (ushort)((b >> 3)|((g & ~3) << 3)|((r & ~7) << 8));
-            }
-            else
-            {
-                d = (ushort)((b >> 3)|((g & ~7) << 2)|((r & ~7) << 7)|(a ? 0x8000 : 0));
-            }
-            ((ushort*)dst)[0] = d;
-        }
-    }
-
-    int srccn, blueIdx, greenBits;
-};
-
-
-///////////////////////////////// Color to/from Grayscale ////////////////////////////////
-
-template<typename _Tp>
-struct Gray2RGB
-{
-    typedef _Tp channel_type;
-    typedef typename v_type<_Tp>::t vt;
-
-    Gray2RGB(int _dstcn) : dstcn(_dstcn) {}
-    void operator()(const _Tp* src, _Tp* dst, int n) const
-    {
-        int dcn = dstcn;
-        int i = 0;
-        _Tp alpha = ColorChannel<_Tp>::max();
-
-#if CV_SIMD
-        const int vsize = vt::nlanes;
-        vt valpha = v_set<_Tp>::set(alpha);
-        for(; i <= n-vsize;
-            i += vsize, src += vsize, dst += vsize*dcn)
-        {
-            vt g = vx_load(src);
-
-            if(dcn == 3)
-            {
-                v_store_interleave(dst, g, g, g);
-            }
-            else
-            {
-                v_store_interleave(dst, g, g, g, valpha);
-            }
-        }
-        vx_cleanup();
-#endif
-        for ( ; i < n; i++, src++, dst += dcn )
-        {
-            dst[0] = dst[1] = dst[2] = src[0];
-            if(dcn == 4)
-                dst[3] = alpha;
-        }
-    }
-
-    int dstcn;
-};
-
-
-struct Gray2RGB5x5
-{
-    typedef uchar channel_type;
-
-    Gray2RGB5x5(int _greenBits) : greenBits(_greenBits)
-    { }
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        int gb = greenBits;
-        int i = 0;
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
-        v_uint16 v3 = vx_setall_u16((ushort)(~3));
-        for(; i <= n-vsize;
-            i += vsize, src += vsize, dst += vsize*sizeof(ushort))
-        {
-            v_uint8 t8 = vx_load_low(src);
-            v_uint16 t = v_expand_low(t8);
-
-            v_uint16 t3 = t >> 3;
-
-            v_uint16 d = t3;
-            if(gb == 6)
-            {
-                d |= ((t & v3) << 3) | (t3 << 11);
-            }
-            else
-            {
-                d |= (t3 << 5) | (t3 << 10);
-            }
-
-            v_store((ushort*)dst, d);
-        }
-        vx_cleanup();
-#endif
-
-        for( ; i < n; i++, src++, dst += sizeof(ushort))
-        {
-            int t = src[0];
-            int t3 = t >> 3;
-            ushort d;
-            if( gb == 6 )
-            {
-                d = (ushort)(t3 |((t & ~3) << 3)|(t3 << 11));
-            }
-            else
-            {
-                d = (ushort)(t3 |(t3 << 5)|(t3 << 10));
-            }
-            ((ushort*)dst)[0] = d;
-        }
-    }
-    int greenBits;
-};
-
-
-struct RGB5x52Gray
-{
-    typedef uchar channel_type;
+#include "opencl_kernels_imgproc.hpp"
 
-    // can be changed to 15-shift coeffs
-    static const int BY = B2Y;
-    static const int GY = G2Y;
-    static const int RY = R2Y;
-    static const int shift = yuv_shift;
-
-    RGB5x52Gray(int _greenBits) : greenBits(_greenBits)
-    {
-        CV_Assert(BY + GY + RY == (1 << shift));
-    }
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        int gb = greenBits;
-        int i = 0;
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
-
-        v_int16 bg2y;
-        v_int16 r12y;
-        v_int16 dummy;
-        v_zip(vx_setall_s16(BY), vx_setall_s16(GY), bg2y, dummy);
-        v_zip(vx_setall_s16(RY), vx_setall_s16( 1), r12y, dummy);
-        v_int16 delta = vx_setall_s16(1 << (shift-1));
-
-        for(; i <= n-vsize;
-            i += vsize, src += vsize*sizeof(ushort), dst += vsize)
-        {
-            v_uint16 t = vx_load((ushort*)src);
-
-            v_uint16 r, g, b;
-            b = (t << 11) >> 8;
-
-            if(gb == 5)
-            {
-                g = ((t >> 5) << 11) >> 8;
-                r = ((t >> 10) << 11) >> 8;
-            }
-            else
-            {
-                g = ((t >> 5) << 10) >> 8;
-                r = (t >> 11) << 3;
-            }
-
-            v_uint8 d;
-            v_uint16 dx;
-
-            v_int16 sr = v_reinterpret_as_s16(r);
-            v_int16 sg = v_reinterpret_as_s16(g);
-            v_int16 sb = v_reinterpret_as_s16(b);
-
-            v_int16 bg0, bg1;
-            v_int16 rd0, rd1;
-            v_zip(sb, sg, bg0, bg1);
-            v_zip(sr, delta, rd0, rd1);
-
-            v_uint32 d0, d1;
-            d0 = v_reinterpret_as_u32(v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y));
-            d1 = v_reinterpret_as_u32(v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y));
-
-            d0 = d0 >> shift;
-            d1 = d1 >> shift;
-
-            dx = v_pack(d0, d1);
-            // high part isn't used
-            d = v_pack(dx, dx);
-
-            v_store_low(dst, d);
-        }
-        vx_cleanup();
-#endif
-        for( ; i < n; i++, src += sizeof(ushort), dst++)
-        {
-            int t = ((ushort*)src)[0];
-            uchar r, g, b;
-            b = (t << 3) & 0xf8;
-            if( gb == 6 )
-            {
-                g = (t >> 3) & 0xfc;
-                r = (t >> 8) & 0xf8;
-            }
-            else
-            {
-                g = (t >> 2) & 0xf8;
-                r = (t >> 7) & 0xf8;
-            }
-            dst[0] = (uchar)CV_DESCALE(b*BY + g*GY + r*RY, shift);
-        }
-    }
-    int greenBits;
-};
-
-
-template<typename _Tp> struct RGB2Gray
-{
-    typedef _Tp channel_type;
-
-    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
-    {
-        static const float coeffs0[] = { R2YF, G2YF, B2YF };
-        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
-        if(blueIdx == 0)
-            std::swap(coeffs[0], coeffs[2]);
-    }
-
-    void operator()(const _Tp* src, _Tp* dst, int n) const
-    {
-        int scn = srccn;
-        float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
-        for(int i = 0; i < n; i++, src += scn)
-            dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr);
-    }
-    int srccn;
-    float coeffs[3];
-};
-
-
-template <>
-struct RGB2Gray<float>
-{
-    typedef float channel_type;
-
-    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
-    {
-        static const float coeffs0[] = { R2YF, G2YF, B2YF };
-        for(int i = 0; i < 3; i++)
-        {
-            coeffs[i] = _coeffs ? _coeffs[i] : coeffs0[i];
-        }
-        if(blueIdx == 0)
-            std::swap(coeffs[0], coeffs[2]);
-    }
-
-    void operator()(const float * src, float * dst, int n) const
-    {
-        int scn = srccn, i = 0;
-        float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
-
-#if CV_SIMD
-        const int vsize = v_float32::nlanes;
-        v_float32 rv = vx_setall_f32(cr), gv = vx_setall_f32(cg), bv = vx_setall_f32(cb);
-        for(; i <= n-vsize;
-            i += vsize, src += vsize*scn, dst += vsize)
-        {
-            v_float32 r, g, b, a;
-            if(scn == 3)
-            {
-                v_load_deinterleave(src, b, g, r);
-            }
-            else
-            {
-                v_load_deinterleave(src, b, g, r, a);
-            }
-
-            v_float32 d = v_fma(r, rv, v_fma(g, gv, b*bv));
-
-            v_store(dst, d);
-        }
-        vx_cleanup();
-#endif
-
-        for ( ; i < n; i++, src += scn, dst++)
-            dst[0] = src[0]*cb + src[1]*cg + src[2]*cr;
-    }
-
-    int srccn;
-    float coeffs[3];
-};
-
-template<>
-struct RGB2Gray<uchar>
-{
-    typedef uchar channel_type;
-
-    // can be changed to 15-shift coeffs
-    static const int BY = B2Y;
-    static const int GY = G2Y;
-    static const int RY = R2Y;
-    static const int shift = yuv_shift;
-
-    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
-    {
-        const int coeffs0[] = { RY, GY, BY };
-        for(int i = 0; i < 3; i++)
-                coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]);
-        if(blueIdx == 0)
-            std::swap(coeffs[0], coeffs[2]);
-
-        CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift));
-    }
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        int scn = srccn;
-        short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
-        int i = 0;
-
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
-        v_int16 bg2y;
-        v_int16 r12y;
-        v_int16 dummy;
-        v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy);
-        v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy);
-        v_int16 delta = vx_setall_s16(1 << (shift-1));
-
-        for( ; i <= n-vsize;
-             i += vsize, src += scn*vsize, dst += vsize)
-        {
-            v_uint8 r, g, b, a;
-            if(scn == 3)
-            {
-                v_load_deinterleave(src, b, g, r);
-            }
-            else
-            {
-                v_load_deinterleave(src, b, g, r, a);
-            }
-
-            //TODO: shorten registers use when v_deinterleave is available
-
-            v_uint16 r0, r1, g0, g1, b0, b1;
-            v_expand(r, r0, r1);
-            v_expand(g, g0, g1);
-            v_expand(b, b0, b1);
-
-            v_int16 bg00, bg01, bg10, bg11;
-            v_int16 rd00, rd01, rd10, rd11;
-            v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01);
-            v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11);
-            v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01);
-            v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11);
-
-            v_uint32 y00, y01, y10, y11;
-            y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift;
-            y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift;
-            y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift;
-            y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift;
-
-            v_uint16 y0, y1;
-            y0 = v_pack(y00, y01);
-            y1 = v_pack(y10, y11);
-
-            v_uint8 y = v_pack(y0, y1);
-            v_store(dst, y);
-        }
-        vx_cleanup();
-#endif
-
-        for( ; i < n; i++, src += scn, dst++)
-        {
-            int b = src[0], g = src[1], r = src[2];
-            uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift);
-            dst[0] = y;
-        }
-    }
-
-    int srccn;
-    short coeffs[3];
-};
-
-
-template<>
-struct RGB2Gray<ushort>
-{
-    typedef ushort channel_type;
-
-    // can be changed to 15-shift coeffs
-    static const int BY = B2Y;
-    static const int GY = G2Y;
-    static const int RY = R2Y;
-    static const int shift = yuv_shift;
-    static const int fix_shift = (int)(sizeof(short)*8 - shift);
-
-    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
-    {
-        const int coeffs0[] = { RY, GY, BY };
-        for(int i = 0; i < 3; i++)
-                coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]);
-        if(blueIdx == 0)
-            std::swap(coeffs[0], coeffs[2]);
-
-        CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift));
-    }
-
-    void operator()(const ushort* src, ushort* dst, int n) const
-    {
-        int scn = srccn;
-        short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
-        int i = 0;
-
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
-
-        v_int16 b2y = vx_setall_s16(cb);
-        v_int16 g2y = vx_setall_s16(cg);
-        v_int16 r2y = vx_setall_s16(cr);
-        v_int16 one = vx_setall_s16(1);
-        v_int16 z = vx_setzero_s16();
-
-        v_int16 bg2y, r12y;
-        v_int16 dummy;
-        v_zip(b2y, g2y, bg2y, dummy);
-        v_zip(r2y, one, r12y, dummy);
-
-        v_int16 delta = vx_setall_s16(1 << (shift-1));
-
-        for( ; i <= n-vsize;
-             i += vsize, src += scn*vsize, dst += vsize)
-        {
-            v_uint16 r, g, b, a;
-            if(scn == 3)
-            {
-                v_load_deinterleave(src, b, g, r);
-            }
-            else
-            {
-                v_load_deinterleave(src, b, g, r, a);
-            }
-
-            v_int16 sb = v_reinterpret_as_s16(b);
-            v_int16 sr = v_reinterpret_as_s16(r);
-            v_int16 sg = v_reinterpret_as_s16(g);
-
-            v_int16 bg0, bg1;
-            v_int16 rd0, rd1;
-            v_zip(sb, sg, bg0, bg1);
-            v_zip(sr, delta, rd0, rd1);
-
-            // fixing 16bit signed multiplication
-            v_int16 mr, mg, mb;
-            mr = (sr < z) & r2y;
-            mg = (sg < z) & g2y;
-            mb = (sb < z) & b2y;
-            v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift;
-
-            v_int32 sy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift;
-            v_int32 sy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift;
-
-            v_int16 y = v_add_wrap(v_pack(sy0, sy1), fixmul);
-
-            v_store((short*)dst, y);
-        }
-        vx_cleanup();
-#endif
-        for( ; i < n; i++, src += scn, dst++)
-        {
-            int b = src[0], g = src[1], r = src[2];
-            ushort d = (ushort)CV_DESCALE((unsigned)(b*cb + g*cg + r*cr), shift);
-            dst[0] = d;
-        }
-    }
-
-    int srccn;
-    short coeffs[3];
-};
-
-
-/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
-
-template<typename _Tp>
-struct RGBA2mRGBA
-{
-    typedef _Tp channel_type;
-
-    void operator()(const _Tp* src, _Tp* dst, int n) const
-    {
-        _Tp max_val  = ColorChannel<_Tp>::max();
-        _Tp half_val = ColorChannel<_Tp>::half();
-        for( int i = 0; i < n; i++ )
-        {
-            _Tp v0 = *src++;
-            _Tp v1 = *src++;
-            _Tp v2 = *src++;
-            _Tp v3 = *src++;
-
-            *dst++ = (v0 * v3 + half_val) / max_val;
-            *dst++ = (v1 * v3 + half_val) / max_val;
-            *dst++ = (v2 * v3 + half_val) / max_val;
-            *dst++ = v3;
-        }
-    }
-};
-
-
-template<>
-struct RGBA2mRGBA<uchar>
-{
-    typedef uchar channel_type;
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        const uchar max_val  = 255;
-        const uchar half_val = 128;
-
-        int i = 0;
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
-        v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000));
-        v_uint16 vh = vx_setall_u16(half_val+1);
-
-        // processing 4 registers per loop cycle is about 10% faster
-        // than processing 1 register
-        for( ; i <= n-vsize;
-             i += vsize, src += 4*vsize, dst += 4*vsize)
-        {
-            v_uint8 v[4];
-            for(int j = 0; j < 4; j++)
-                v[j] = vx_load(src + j*vsize);
-
-            // r0,g0,b0,a0,r1,g1,b1,a1 => 00,00,00,a0,00,00,00,a1 =>
-            // => 00,00,a0,a0,00,00,a1,a1
-            // => a0,a0,a0,a0,a1,a1,a1,a1
-
-            v_uint16 a16[4];
-            for(int j = 0; j < 4; j++)
-                a16[j] = v_reinterpret_as_u16(v[j] & amask);
-
-            v_uint32 a32[4];
-            for(int j = 0; j < 4; j++)
-                a32[j] = v_reinterpret_as_u32(a16[j] | (a16[j] >> 8));
-
-            v_uint8 a[4];
-            for(int j = 0; j < 4; j++)
-                a[j] = v_reinterpret_as_u8(a32[j] | (a32[j] >> 16));
-
-            v_uint16 m[8];
-            for(int j = 0; j < 4; j++)
-                v_mul_expand(v[j], a[j], m[j], m[j+4]);
-
-            for(int j = 0; j < 8; j++)
-                m[j] += vh;
-
-            // div 255: (v+1+(v>>8))>8
-            // +1 is in vh, has no effect on (v>>8)
-            for(int j = 0; j < 8; j++)
-                m[j] = (m[j] + (m[j] >> 8)) >> 8;
-
-            v_uint8 d[4];
-            for(int j = 0; j < 4; j++)
-                d[j] = v_pack(m[j], m[j+4]);
-
-            for(int j = 0; j < 4; j++)
-                d[j] = v_select(amask, a[j], d[j]);
-
-            for(int j = 0; j < 4; j++)
-                v_store(dst + j*vsize, d[j]);
-        }
-
-        vx_cleanup();
-#endif
-        for(; i < n; i++, src += 4, dst += 4 )
-        {
-            uchar v0 = src[0];
-            uchar v1 = src[1];
-            uchar v2 = src[2];
-            uchar v3 = src[3];
-
-            dst[0] = (v0 * v3 + half_val) / max_val;
-            dst[1] = (v1 * v3 + half_val) / max_val;
-            dst[2] = (v2 * v3 + half_val) / max_val;
-            dst[3] = v3;
-        }
-    }
-};
-
-
-template<typename _Tp>
-struct mRGBA2RGBA
-{
-    typedef _Tp channel_type;
-
-    void operator()(const _Tp* src, _Tp* dst, int n) const
-    {
-        _Tp max_val = ColorChannel<_Tp>::max();
-        for( int i = 0; i < n; i++ )
-        {
-            _Tp v0 = *src++;
-            _Tp v1 = *src++;
-            _Tp v2 = *src++;
-            _Tp v3 = *src++;
-            _Tp v3_half = v3 / 2;
-
-            *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v0 * max_val + v3_half) / v3);
-            *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v1 * max_val + v3_half) / v3);
-            *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v2 * max_val + v3_half) / v3);
-            *dst++ = v3;
-        }
-    }
-};
-
-
-template<>
-struct mRGBA2RGBA<uchar>
-{
-    typedef uchar channel_type;
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        uchar max_val = ColorChannel<uchar>::max();
-        int i = 0;
+#include "color.hpp"
 
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
-        v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000));
-        v_uint8 vmax = vx_setall_u8(max_val);
+#include "color_rgb.simd.hpp"
+#include "color_rgb.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
 
-        for( ; i <= n-vsize/4;
-             i += vsize/4, src += vsize, dst += vsize)
-        {
-            v_uint8 s = vx_load(src + 0*vsize);
-
-            // r0,g0,b0,a0,r1,g1,b1,a1 => 00,00,00,a0,00,00,00,a1 =>
-            // => 00,00,a0,a0,00,00,a1,a1
-            // => a0,a0,a0,a0,a1,a1,a1,a1
-            v_uint8 a;
-            v_uint16 a16;
-            v_uint32 a32;
-            a16 = v_reinterpret_as_u16(s & amask);
-            a32 = v_reinterpret_as_u32(a16 | (a16 >> 8));
-            a = v_reinterpret_as_u8(a32 | (a32 >> 16));
-
-            // s *= max_val
-            v_uint16 s0, s1;
-            v_mul_expand(s, vmax, s0, s1);
-
-            // s += a/2
-            v_uint16 ae0, ae1;
-            v_expand(a, ae0, ae1);
-            s0 += ae0 >> 1; s1 += ae1 >> 1;
-
-            // s, a -> u32 -> float
-            v_uint32 u00, u01, u10, u11;
-            v_int32 s00, s01, s10, s11;
-            v_expand(s0, u00, u01);
-            v_expand(s1, u10, u11);
-            s00 = v_reinterpret_as_s32(u00);
-            s01 = v_reinterpret_as_s32(u01);
-            s10 = v_reinterpret_as_s32(u10);
-            s11 = v_reinterpret_as_s32(u11);
-
-            v_uint32 ua00, ua01, ua10, ua11;
-            v_int32 a00, a01, a10, a11;
-            v_expand(ae0, ua00, ua01);
-            v_expand(ae1, ua10, ua11);
-            a00 = v_reinterpret_as_s32(ua00);
-            a01 = v_reinterpret_as_s32(ua01);
-            a10 = v_reinterpret_as_s32(ua10);
-            a11 = v_reinterpret_as_s32(ua11);
-
-            v_float32 fs00, fs01, fs10, fs11;
-            fs00 = v_cvt_f32(s00);
-            fs01 = v_cvt_f32(s01);
-            fs10 = v_cvt_f32(s10);
-            fs11 = v_cvt_f32(s11);
-
-            v_float32 fa00, fa01, fa10, fa11;
-            fa00 = v_cvt_f32(a00);
-            fa01 = v_cvt_f32(a01);
-            fa10 = v_cvt_f32(a10);
-            fa11 = v_cvt_f32(a11);
-
-            // float d = (float)s/(float)a
-            v_float32 fd00, fd01, fd10, fd11;
-            fd00 = fs00/fa00;
-            fd01 = fs01/fa01;
-            fd10 = fs10/fa10;
-            fd11 = fs11/fa11;
-
-            // d -> u32 -> u8
-            v_uint32 ud00, ud01, ud10, ud11;
-            ud00 = v_reinterpret_as_u32(v_trunc(fd00));
-            ud01 = v_reinterpret_as_u32(v_trunc(fd01));
-            ud10 = v_reinterpret_as_u32(v_trunc(fd10));
-            ud11 = v_reinterpret_as_u32(v_trunc(fd11));
-            v_uint16 ud0, ud1;
-            ud0 = v_pack(ud00, ud01);
-            ud1 = v_pack(ud10, ud11);
-            v_uint8 d;
-            d = v_pack(ud0, ud1);
-
-            // if a == 0 then d = 0
-            v_uint8 am;
-            am = a != vx_setzero_u8();
-            d = d & am;
-
-            // put alpha values
-            d = v_select(amask, a, d);
-
-            v_store(dst, d);
-        }
+#define IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 1
 
-        vx_cleanup();
-#endif
-        for(; i < n; i++, src += 4, dst += 4 )
-        {
-            uchar v0 = src[0];
-            uchar v1 = src[1];
-            uchar v2 = src[2];
-            uchar v3 = src[3];
-
-            uchar v3_half = v3 / 2;
-
-            dst[0] = (v3==0)? 0 : (v0 * max_val + v3_half) / v3;
-            dst[1] = (v3==0)? 0 : (v1 * max_val + v3_half) / v3;
-            dst[2] = (v3==0)? 0 : (v2 * max_val + v3_half) / v3;
-            dst[3] = v3;
-
-            dst[0] = (v3==0)? 0 : saturate_cast<uchar>((v0 * max_val + v3_half) / v3);
-            dst[1] = (v3==0)? 0 : saturate_cast<uchar>((v1 * max_val + v3_half) / v3);
-            dst[2] = (v3==0)? 0 : saturate_cast<uchar>((v2 * max_val + v3_half) / v3);
-            dst[3] = v3;
-        }
-    }
-};
+namespace cv {
 
 //
 // IPP functions
@@ -1051,25 +20,25 @@ struct mRGBA2RGBA<uchar>
 
 #if NEED_IPP
 
-static ippiColor2GrayFunc ippiColor2GrayC3Tab[] =
+static const ippiColor2GrayFunc ippiColor2GrayC3Tab[] =
 {
     (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0,
     0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0
 };
 
-static ippiColor2GrayFunc ippiColor2GrayC4Tab[] =
+static const ippiColor2GrayFunc ippiColor2GrayC4Tab[] =
 {
     (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0,
     0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0
 };
 
-static ippiGeneralFunc ippiRGB2GrayC3Tab[] =
+static const ippiGeneralFunc ippiRGB2GrayC3Tab[] =
 {
     (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0,
     0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0
 };
 
-static ippiGeneralFunc ippiRGB2GrayC4Tab[] =
+static const ippiGeneralFunc ippiRGB2GrayC4Tab[] =
 {
     (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0,
     0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0
@@ -1208,8 +177,7 @@ static ippiReorderFunc ippiSwapChannelsC4RTab[] =
 // HAL functions
 //
 
-namespace hal
-{
+namespace hal {
 
 // 8u, 16u, 32f
 void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
@@ -1265,13 +233,8 @@ void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
 #endif
 #endif
 
-    int blueIdx = swapBlue ? 2 : 0;
-    if( depth == CV_8U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<uchar>(scn, dcn, blueIdx));
-    else if( depth == CV_16U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<ushort>(scn, dcn, blueIdx));
-    else
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<float>(scn, dcn, blueIdx));
+    CV_CPU_DISPATCH(cvtBGRtoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 // only 8u
@@ -1284,7 +247,8 @@ void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step,
 
     CALL_HAL(cvtBGRtoBGR5x5, cv_hal_cvtBGRtoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits);
 
-    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB5x5(scn, swapBlue ? 2 : 0, greenBits));
+    CV_CPU_DISPATCH(cvtBGRtoBGR5x5, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 // only 8u
@@ -1297,7 +261,8 @@ void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step,
 
     CALL_HAL(cvtBGR5x5toBGR, cv_hal_cvtBGR5x5toBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits);
 
-    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52RGB(dcn, swapBlue ? 2 : 0, greenBits));
+    CV_CPU_DISPATCH(cvtBGR5x5toBGR, (src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 // 8u, 16u, 32f
@@ -1340,13 +305,8 @@ void cvtBGRtoGray(const uchar * src_data, size_t src_step,
     }
 #endif
 
-    int blueIdx = swapBlue ? 2 : 0;
-    if( depth == CV_8U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<uchar>(scn, blueIdx, 0));
-    else if( depth == CV_16U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<ushort>(scn, blueIdx, 0));
-    else
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<float>(scn, blueIdx, 0));
+    CV_CPU_DISPATCH(cvtBGRtoGray, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 // 8u, 16u, 32f
@@ -1390,12 +350,8 @@ void cvtGraytoBGR(const uchar * src_data, size_t src_step,
     }
 #endif
 
-    if( depth == CV_8U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB<uchar>(dcn));
-    else if( depth == CV_16U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB<ushort>(dcn));
-    else
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB<float>(dcn));
+    CV_CPU_DISPATCH(cvtGraytoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, dcn),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 // only 8u
@@ -1407,7 +363,9 @@ void cvtBGR5x5toGray(const uchar * src_data, size_t src_step,
     CV_INSTRUMENT_REGION();
 
     CALL_HAL(cvtBGR5x5toGray, cv_hal_cvtBGR5x5toGray, src_data, src_step, dst_data, dst_step, width, height, greenBits);
-    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52Gray(greenBits));
+
+    CV_CPU_DISPATCH(cvtBGR5x5toGray, (src_data, src_step, dst_data, dst_step, width, height, greenBits),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 // only 8u
@@ -1419,7 +377,9 @@ void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step,
     CV_INSTRUMENT_REGION();
 
     CALL_HAL(cvtGraytoBGR5x5, cv_hal_cvtGraytoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, greenBits);
-    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB5x5(greenBits));
+
+    CV_CPU_DISPATCH(cvtGraytoBGR5x5, (src_data, src_step, dst_data, dst_step, width, height, greenBits),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step,
@@ -1439,7 +399,8 @@ void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step,
     }
 #endif
 
-    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGBA2mRGBA<uchar>());
+    CV_CPU_DISPATCH(cvtRGBAtoMultipliedRGBA, (src_data, src_step, dst_data, dst_step, width, height),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step,
@@ -1449,7 +410,9 @@ void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step,
     CV_INSTRUMENT_REGION();
 
     CALL_HAL(cvtMultipliedRGBAtoRGBA, cv_hal_cvtMultipliedRGBAtoRGBA, src_data, src_step, dst_data, dst_step, width, height);
-    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, mRGBA2RGBA<uchar>());
+
+    CV_CPU_DISPATCH(cvtMultipliedRGBAtoRGBA, (src_data, src_step, dst_data, dst_step, width, height),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 } // namespace hal
diff --git a/modules/imgproc/src/color_rgb.simd.hpp b/modules/imgproc/src/color_rgb.simd.hpp
index 9245f26d05..76dc4e5e1e 100644
--- a/modules/imgproc/src/color_rgb.simd.hpp
+++ b/modules/imgproc/src/color_rgb.simd.hpp
@@ -3,13 +3,58 @@
 // of this distribution and at http://opencv.org/license.html
 
 #include "precomp.hpp"
-#include "color.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
-#define IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 1
+namespace cv {
+namespace hal {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+// forward declarations
 
-namespace cv
-{
+void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
+                 uchar * dst_data, size_t dst_step,
+                 int width, int height,
+                 int depth, int scn, int dcn, bool swapBlue);
+void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step,
+                    uchar * dst_data, size_t dst_step,
+                    int width, int height,
+                    int scn, bool swapBlue, int greenBits);
+void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step,
+                    uchar * dst_data, size_t dst_step,
+                    int width, int height,
+                    int dcn, bool swapBlue, int greenBits);
+void cvtBGRtoGray(const uchar * src_data, size_t src_step,
+                  uchar * dst_data, size_t dst_step,
+                  int width, int height,
+                  int depth, int scn, bool swapBlue);
+void cvtGraytoBGR(const uchar * src_data, size_t src_step,
+                  uchar * dst_data, size_t dst_step,
+                  int width, int height,
+                  int depth, int dcn);
+void cvtBGR5x5toGray(const uchar * src_data, size_t src_step,
+                     uchar * dst_data, size_t dst_step,
+                     int width, int height,
+                     int greenBits);
+void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step,
+                     uchar * dst_data, size_t dst_step,
+                     int width, int height,
+                     int greenBits);
+void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step,
+                             uchar * dst_data, size_t dst_step,
+                             int width, int height);
+void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step,
+                             uchar * dst_data, size_t dst_step,
+                             int width, int height);
 
+
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+#if defined(CV_CPU_BASELINE_MODE)
+// color.simd_helpers.hpp is already included in color.hpp
+#else
+#include "color.simd_helpers.hpp"
+#endif
+
+namespace {
 ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
 
 template<typename _Tp> struct v_type;
@@ -1044,172 +1089,7 @@ struct mRGBA2RGBA<uchar>
         }
     }
 };
-
-//
-// IPP functions
-//
-
-#if NEED_IPP
-
-static ippiColor2GrayFunc ippiColor2GrayC3Tab[] =
-{
-    (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0,
-    0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0
-};
-
-static ippiColor2GrayFunc ippiColor2GrayC4Tab[] =
-{
-    (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0,
-    0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0
-};
-
-static ippiGeneralFunc ippiRGB2GrayC3Tab[] =
-{
-    (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0,
-    0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0
-};
-
-static ippiGeneralFunc ippiRGB2GrayC4Tab[] =
-{
-    (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0,
-    0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0
-};
-
-
-#if !IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3
-static IppStatus ippiGrayToRGB_C1C3R(const Ipp8u*  pSrc, int srcStep, Ipp8u*  pDst, int dstStep, IppiSize roiSize)
-{
-    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize);
-}
-#endif
-static IppStatus ippiGrayToRGB_C1C3R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, IppiSize roiSize)
-{
-    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize);
-}
-static IppStatus ippiGrayToRGB_C1C3R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize)
-{
-    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize);
-}
-
-static IppStatus ippiGrayToRGB_C1C4R(const Ipp8u*  pSrc, int srcStep, Ipp8u*  pDst, int dstStep, IppiSize roiSize, Ipp8u  aval)
-{
-    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval);
-}
-static IppStatus ippiGrayToRGB_C1C4R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, IppiSize roiSize, Ipp16u aval)
-{
-    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval);
-}
-static IppStatus ippiGrayToRGB_C1C4R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize, Ipp32f aval)
-{
-    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval);
-}
-
-struct IPPColor2GrayFunctor
-{
-    IPPColor2GrayFunctor(ippiColor2GrayFunc _func) :
-        ippiColorToGray(_func)
-    {
-        coeffs[0] = B2YF;
-        coeffs[1] = G2YF;
-        coeffs[2] = R2YF;
-    }
-    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
-    {
-        return ippiColorToGray ? CV_INSTRUMENT_FUN_IPP(ippiColorToGray, src, srcStep, dst, dstStep, ippiSize(cols, rows), coeffs) >= 0 : false;
-    }
-private:
-    ippiColor2GrayFunc ippiColorToGray;
-    Ipp32f coeffs[3];
-};
-
-template <typename T>
-struct IPPGray2BGRFunctor
-{
-    IPPGray2BGRFunctor(){}
-
-    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
-    {
-        return ippiGrayToRGB_C1C3R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows)) >= 0;
-    }
-};
-
-template <typename T>
-struct IPPGray2BGRAFunctor
-{
-    IPPGray2BGRAFunctor()
-    {
-        alpha = ColorChannel<T>::max();
-    }
-
-    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
-    {
-        return ippiGrayToRGB_C1C4R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows), alpha) >= 0;
-    }
-
-    T alpha;
-};
-
-static IppStatus CV_STDCALL ippiSwapChannels_8u_C3C4Rf(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep,
-         IppiSize roiSize, const int *dstOrder)
-{
-    return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_8u_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP8u);
-}
-
-static IppStatus CV_STDCALL ippiSwapChannels_16u_C3C4Rf(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep,
-         IppiSize roiSize, const int *dstOrder)
-{
-    return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_16u_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP16u);
-}
-
-static IppStatus CV_STDCALL ippiSwapChannels_32f_C3C4Rf(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep,
-         IppiSize roiSize, const int *dstOrder)
-{
-    return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_32f_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP32f);
-}
-
-// shared
-ippiReorderFunc ippiSwapChannelsC3C4RTab[] =
-{
-    (ippiReorderFunc)ippiSwapChannels_8u_C3C4Rf, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3C4Rf, 0,
-    0, (ippiReorderFunc)ippiSwapChannels_32f_C3C4Rf, 0, 0
-};
-
-static ippiGeneralFunc ippiCopyAC4C3RTab[] =
-{
-    (ippiGeneralFunc)ippiCopy_8u_AC4C3R, 0, (ippiGeneralFunc)ippiCopy_16u_AC4C3R, 0,
-    0, (ippiGeneralFunc)ippiCopy_32f_AC4C3R, 0, 0
-};
-
-// shared
-ippiReorderFunc ippiSwapChannelsC4C3RTab[] =
-{
-    (ippiReorderFunc)ippiSwapChannels_8u_C4C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4C3R, 0,
-    0, (ippiReorderFunc)ippiSwapChannels_32f_C4C3R, 0, 0
-};
-
-// shared
-ippiReorderFunc ippiSwapChannelsC3RTab[] =
-{
-    (ippiReorderFunc)ippiSwapChannels_8u_C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3R, 0,
-    0, (ippiReorderFunc)ippiSwapChannels_32f_C3R, 0, 0
-};
-
-#if IPP_VERSION_X100 >= 810
-static ippiReorderFunc ippiSwapChannelsC4RTab[] =
-{
-    (ippiReorderFunc)ippiSwapChannels_8u_C4R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4R, 0,
-    0, (ippiReorderFunc)ippiSwapChannels_32f_C4R, 0, 0
-};
-#endif
-
-#endif
-
-//
-// HAL functions
-//
-
-namespace hal
-{
+} // namespace anon
 
 // 8u, 16u, 32f
 void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
@@ -1219,52 +1099,6 @@ void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtBGRtoBGR, cv_hal_cvtBGRtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue);
-
-#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
-    CV_IPP_CHECK()
-    {
-    if(scn == 3 && dcn == 4 && !swapBlue)
-    {
-        if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                             IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) )
-            return;
-    }
-    else if(scn == 4 && dcn == 3 && !swapBlue)
-    {
-        if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                             IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) )
-            return;
-    }
-    else if(scn == 3 && dcn == 4 && swapBlue)
-    {
-        if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                            IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) )
-            return;
-    }
-    else if(scn == 4 && dcn == 3 && swapBlue)
-    {
-        if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                            IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) )
-            return;
-    }
-    else if(scn == 3 && dcn == 3 && swapBlue)
-    {
-        if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height,
-                                IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) )
-            return;
-    }
-#if IPP_VERSION_X100 >= 810
-    else if(scn == 4 && dcn == 4 && swapBlue)
-    {
-        if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height,
-                                IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) )
-            return;
-    }
-    }
-#endif
-#endif
-
     int blueIdx = swapBlue ? 2 : 0;
     if( depth == CV_8U )
         CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<uchar>(scn, dcn, blueIdx));
@@ -1282,8 +1116,6 @@ void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtBGRtoBGR5x5, cv_hal_cvtBGRtoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits);
-
     CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB5x5(scn, swapBlue ? 2 : 0, greenBits));
 }
 
@@ -1295,8 +1127,6 @@ void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtBGR5x5toBGR, cv_hal_cvtBGR5x5toBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits);
-
     CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52RGB(dcn, swapBlue ? 2 : 0, greenBits));
 }
 
@@ -1308,38 +1138,6 @@ void cvtBGRtoGray(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtBGRtoGray, cv_hal_cvtBGRtoGray, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue);
-
-#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
-    CV_IPP_CHECK()
-    {
-        if(depth == CV_32F && scn == 3 && !swapBlue)
-        {
-            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPColor2GrayFunctor(ippiColor2GrayC3Tab[depth])) )
-                return;
-        }
-        else if(depth == CV_32F && scn == 3 && swapBlue)
-        {
-            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPGeneralFunctor(ippiRGB2GrayC3Tab[depth])) )
-                return;
-        }
-        else if(depth == CV_32F && scn == 4 && !swapBlue)
-        {
-            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPColor2GrayFunctor(ippiColor2GrayC4Tab[depth])) )
-                return;
-        }
-        else if(depth == CV_32F && scn == 4 && swapBlue)
-        {
-            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPGeneralFunctor(ippiRGB2GrayC4Tab[depth])) )
-                return;
-        }
-    }
-#endif
-
     int blueIdx = swapBlue ? 2 : 0;
     if( depth == CV_8U )
         CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<uchar>(scn, blueIdx, 0));
@@ -1357,39 +1155,6 @@ void cvtGraytoBGR(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtGraytoBGR, cv_hal_cvtGraytoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn);
-
-#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
-    CV_IPP_CHECK()
-    {
-        bool ippres = false;
-        if(dcn == 3)
-        {
-            if( depth == CV_8U )
-            {
-#if !IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3
-                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor<Ipp8u>());
-#endif
-            }
-            else if( depth == CV_16U )
-                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor<Ipp16u>());
-            else
-                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor<Ipp32f>());
-        }
-        else if(dcn == 4)
-        {
-            if( depth == CV_8U )
-                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor<Ipp8u>());
-            else if( depth == CV_16U )
-                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor<Ipp16u>());
-            else
-                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor<Ipp32f>());
-        }
-        if(ippres)
-            return;
-    }
-#endif
-
     if( depth == CV_8U )
         CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB<uchar>(dcn));
     else if( depth == CV_16U )
@@ -1406,7 +1171,6 @@ void cvtBGR5x5toGray(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtBGR5x5toGray, cv_hal_cvtBGR5x5toGray, src_data, src_step, dst_data, dst_step, width, height, greenBits);
     CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52Gray(greenBits));
 }
 
@@ -1418,7 +1182,6 @@ void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtGraytoBGR5x5, cv_hal_cvtGraytoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, greenBits);
     CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB5x5(greenBits));
 }
 
@@ -1428,17 +1191,6 @@ void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtRGBAtoMultipliedRGBA, cv_hal_cvtRGBAtoMultipliedRGBA, src_data, src_step, dst_data, dst_step, width, height);
-
-#ifdef HAVE_IPP
-    CV_IPP_CHECK()
-    {
-    if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                        IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R)))
-        return;
-    }
-#endif
-
     CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGBA2mRGBA<uchar>());
 }
 
@@ -1448,209 +1200,9 @@ void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtMultipliedRGBAtoRGBA, cv_hal_cvtMultipliedRGBAtoRGBA, src_data, src_step, dst_data, dst_step, width, height);
     CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, mRGBA2RGBA<uchar>());
 }
 
-} // namespace hal
-
-//
-// OCL calls
-//
-
-#ifdef HAVE_OPENCL
-
-bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse )
-{
-    OclHelper< Set<3, 4>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
-
-    if(!h.createKernel("RGB", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D dcn=%d -D bidx=0 -D %s", dcn, reverse ? "REVERSE" : "ORDER")))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits )
-{
-    OclHelper< Set<3, 4>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);
-
-    if(!h.createKernel("RGB2RGB5x5", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, gbits)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits)
-{
-    OclHelper< Set<2>, Set<3, 4>, Set<CV_8U> > h(_src, _dst, dcn);
-
-    if(!h.createKernel("RGB5x52RGB", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, gbits)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits)
-{
-    OclHelper< Set<2>, Set<1>, Set<CV_8U> > h(_src, _dst, 1);
-
-    if(!h.createKernel("BGR5x52Gray", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D dcn=1 -D bidx=0 -D greenbits=%d", gbits)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits)
-{
-    OclHelper< Set<1>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);
-
-    if(!h.createKernel("Gray2BGR5x5", ocl::imgproc::color_rgb_oclsrc,
-                        format("-D dcn=2 -D bidx=0 -D greenbits=%d", gbits)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx)
-{
-    OclHelper< Set<3, 4>, Set<1>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 1);
-
-    int stripeSize = 1;
-    if(!h.createKernel("RGB2Gray", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", bidx, stripeSize)))
-    {
-        return false;
-    }
-
-    h.globalSize[0] = (h.src.cols + stripeSize - 1)/stripeSize;
-    return h.run();
-}
-
-bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn)
-{
-    OclHelper< Set<1>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
-    if(!h.createKernel("Gray2RGB", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D bidx=0 -D dcn=%d", dcn)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst)
-{
-    OclHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);
-
-    if(!h.createKernel("RGBA2mRGBA", ocl::imgproc::color_rgb_oclsrc,
-                       "-D dcn=4 -D bidx=3"))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst)
-{
-    OclHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);
-
-    if(!h.createKernel("mRGBA2RGBA", ocl::imgproc::color_rgb_oclsrc,
-                       "-D dcn=4 -D bidx=3"))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
 #endif
-
-//
-// HAL calls
-//
-
-void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb)
-{
-    CvtHelper< Set<3, 4>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
-
-    hal::cvtBGRtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                     h.depth, h.scn, dcn, swapb);
-}
-
-void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits)
-{
-    CvtHelper< Set<3, 4>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);
-
-    hal::cvtBGRtoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                        h.scn, swapb, gbits);
-}
-
-void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits)
-{
-    if(dcn <= 0) dcn = 3;
-    CvtHelper< Set<2>, Set<3, 4>, Set<CV_8U> > h(_src, _dst, dcn);
-
-    hal::cvtBGR5x5toBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                        dcn, swapb, gbits);
-}
-
-void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb)
-{
-    CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 1);
-
-    hal::cvtBGRtoGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                      h.depth, h.scn, swapb);
-}
-
-void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn)
-{
-    if(dcn <= 0) dcn = 3;
-    CvtHelper< Set<1>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
-
-    hal::cvtGraytoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, h.depth, dcn);
-}
-
-void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits)
-{
-    CvtHelper< Set<2>, Set<1>, Set<CV_8U> > h(_src, _dst, 1);
-
-    hal::cvtBGR5x5toGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits);
-}
-
-void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits)
-{
-    CvtHelper< Set<1>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);
-
-    hal::cvtGraytoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits);
-}
-
-void cvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst)
-{
-    CvtHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);
-
-    hal::cvtRGBAtoMultipliedRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows);
-}
-
-void cvtColormRGBA2RGBA( InputArray _src, OutputArray _dst)
-{
-    CvtHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);
-
-    hal::cvtMultipliedRGBAtoRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows);
-}
-
-} // namespace cv
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}} // namespace cv::hal
diff --git a/modules/imgproc/src/color_yuv.dispatch.cpp b/modules/imgproc/src/color_yuv.dispatch.cpp
index 7d731378e2..6cb508f980 100644
--- a/modules/imgproc/src/color_yuv.dispatch.cpp
+++ b/modules/imgproc/src/color_yuv.dispatch.cpp
@@ -3,1747 +3,19 @@
 // of this distribution and at http://opencv.org/license.html
 
 #include "precomp.hpp"
-#include "color.hpp"
-
-namespace cv
-{
-
-//constants for conversion from/to RGB and YUV, YCrCb according to BT.601
-
-//to YCbCr
-static const float YCBF = 0.564f; // == 1/2/(1-B2YF)
-static const float YCRF = 0.713f; // == 1/2/(1-R2YF)
-static const int YCBI = 9241;  // == YCBF*16384
-static const int YCRI = 11682; // == YCRF*16384
-//to YUV
-static const float B2UF = 0.492f;
-static const float R2VF = 0.877f;
-static const int B2UI = 8061;  // == B2UF*16384
-static const int R2VI = 14369; // == R2VF*16384
-//from YUV
-static const float U2BF = 2.032f;
-static const float U2GF = -0.395f;
-static const float V2GF = -0.581f;
-static const float V2RF = 1.140f;
-static const int U2BI = 33292;
-static const int U2GI = -6472;
-static const int V2GI = -9519;
-static const int V2RI = 18678;
-//from YCrCb
-static const float CB2BF = 1.773f;
-static const float CB2GF = -0.344f;
-static const float CR2GF = -0.714f;
-static const float CR2RF = 1.403f;
-static const int CB2BI = 29049;
-static const int CB2GI = -5636;
-static const int CR2GI = -11698;
-static const int CR2RI = 22987;
-
-///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
-
-template<typename _Tp> struct RGB2YCrCb_f
-{
-    typedef _Tp channel_type;
-
-    RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) :
-        srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
-    {
-        static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF };
-        static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF };
-        for(int i = 0; i < 5; i++)
-        {
-            coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i];
-        }
-        if(blueIdx == 0)
-            std::swap(coeffs[0], coeffs[2]);
-    }
-
-    void operator()(const _Tp* src, _Tp* dst, int n) const
-    {
-        int scn = srccn, bidx = blueIdx;
-        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
-        const _Tp delta = ColorChannel<_Tp>::half();
-        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
-        n *= 3;
-        for(int i = 0; i < n; i += 3, src += scn)
-        {
-            _Tp Y = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
-            _Tp Cr = saturate_cast<_Tp>((src[bidx^2] - Y)*C3 + delta);
-            _Tp Cb = saturate_cast<_Tp>((src[bidx] - Y)*C4 + delta);
-            dst[i] = Y; dst[i+1+yuvOrder] = Cr; dst[i+2-yuvOrder] = Cb;
-        }
-    }
-    int srccn, blueIdx;
-    bool isCrCb;
-    float coeffs[5];
-};
-
-template <>
-struct RGB2YCrCb_f<float>
-{
-    typedef float channel_type;
-
-    RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) :
-        srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
-    {
-        static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF };
-        static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF };
-        for(int i = 0; i < 5; i++)
-        {
-            coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i];
-        }
-        if(blueIdx == 0)
-            std::swap(coeffs[0], coeffs[2]);
-    }
-
-    void operator()(const float * src, float * dst, int n) const
-    {
-        int scn = srccn, bidx = blueIdx;
-        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
-        const float delta = ColorChannel<float>::half();
-        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
-
-        int i = 0;
-#if CV_SIMD
-        v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
-        v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4);
-        v_float32 vdelta = vx_setall_f32(delta);
-        const int vsize = v_float32::nlanes;
-        for( ; i <= n-vsize;
-             i += vsize, src += vsize*scn, dst += vsize*3)
-        {
-            v_float32 b, g, r, dummy;
-            if(scn == 3)
-            {
-                v_load_deinterleave(src, b, g, r);
-            }
-            else
-            {
-                v_load_deinterleave(src, b, g, r, dummy);
-            }
-
-            v_float32 y, cr, cb;
-            y = v_fma(b, vc0, v_fma(g, vc1, r*vc2));
-
-            if(bidx)
-                std::swap(r, b);
-
-            cr = v_fma(r - y, vc3, vdelta);
-            cb = v_fma(b - y, vc4, vdelta);
-
-            if(yuvOrder)
-            {
-                v_store_interleave(dst, y, cb, cr);
-            }
-            else
-            {
-                v_store_interleave(dst, y, cr, cb);
-            }
-        }
-        vx_cleanup();
-#endif
-        for ( ; i < n; i ++, src += scn, dst += 3)
-        {
-            float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
-            float Cr = (src[bidx^2] - Y)*C3 + delta;
-            float Cb = (src[bidx] - Y)*C4 + delta;
-            dst[0         ] = Y;
-            dst[1+yuvOrder] = Cr;
-            dst[2-yuvOrder] = Cb;
-        }
-    }
-
-    int srccn, blueIdx;
-    bool isCrCb;
-    float coeffs[5];
-};
-
-
-template<typename _Tp> struct RGB2YCrCb_i
-{
-    typedef _Tp channel_type;
-    static const int shift = yuv_shift;
-
-    RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb)
-        : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
-    {
-        static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI };
-        static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI };
-
-        for(int i = 0; i < 5; i++)
-        {
-            coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i];
-        }
-        if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
-    }
-    void operator()(const _Tp* src, _Tp* dst, int n) const
-    {
-        int scn = srccn, bidx = blueIdx;
-        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
-        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
-        int delta = ColorChannel<_Tp>::half()*(1 << shift);
-        n *= 3;
-        for(int i = 0; i < n; i += 3, src += scn)
-        {
-            int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift);
-            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, shift);
-            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, shift);
-            dst[i] = saturate_cast<_Tp>(Y);
-            dst[i+1+yuvOrder] = saturate_cast<_Tp>(Cr);
-            dst[i+2-yuvOrder] = saturate_cast<_Tp>(Cb);
-        }
-    }
-    int srccn, blueIdx;
-    bool isCrCb;
-    int coeffs[5];
-};
-
-
-template<>
-struct RGB2YCrCb_i<ushort>
-{
-    typedef ushort channel_type;
-    static const int shift = yuv_shift;
-    static const int fix_shift = (int)(sizeof(short)*8 - shift);
-
-    RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb)
-        : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
-    {
-        static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI };
-        static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI };
-
-        for(int i = 0; i < 5; i++)
-        {
-            coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i];
-        }
-        if(blueIdx==0)
-            std::swap(coeffs[0], coeffs[2]);
-    }
-
-    void operator()(const ushort* src, ushort* dst, int n) const
-    {
-        int scn = srccn, bidx = blueIdx;
-        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
-        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
-        int sdelta = ColorChannel<ushort>::half()*(1 << shift);
-        int i = 0;
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
-        const int descale = 1 << (shift-1);
-
-        v_int16 b2y = vx_setall_s16((short)C0);
-        v_int16 g2y = vx_setall_s16((short)C1);
-        v_int16 r2y = vx_setall_s16((short)C2);
-        v_int16 one = vx_setall_s16(1);
-        v_int16 z = vx_setzero_s16();
-
-        v_int16 bg2y, r12y;
-        v_int16 dummy;
-        v_zip(b2y, g2y, bg2y, dummy);
-        v_zip(r2y, one, r12y, dummy);
-
-        v_int16 vdescale = vx_setall_s16(1 << (shift-1));
-        v_int32 vc3 = vx_setall_s32(C3);
-        v_int32 vc4 = vx_setall_s32(C4);
-        v_int32 vdd = vx_setall_s32(sdelta + descale);
-
-        for(; i <= n-vsize;
-            i += vsize, src += vsize*scn, dst += vsize*3)
-        {
-            v_uint16 r, g, b, a;
-            if(scn == 3)
-            {
-                v_load_deinterleave(src, b, g, r);
-            }
-            else
-            {
-                v_load_deinterleave(src, b, g, r, a);
-            }
-
-            v_uint16 y, cr, cb;
-
-            v_int16 sb = v_reinterpret_as_s16(b);
-            v_int16 sr = v_reinterpret_as_s16(r);
-            v_int16 sg = v_reinterpret_as_s16(g);
-
-            v_int16 bg0, bg1;
-            v_int16 rd0, rd1;
-            v_zip(sb, sg, bg0, bg1);
-            v_zip(sr, vdescale, rd0, rd1);
-
-            // fixing 16bit signed multiplication
-            v_int16 mr, mg, mb;
-            mr = (sr < z) & r2y;
-            mg = (sg < z) & g2y;
-            mb = (sb < z) & b2y;
-            v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift;
-
-            v_int32 ssy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift;
-            v_int32 ssy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift;
-
-            y = v_reinterpret_as_u16(v_add_wrap(v_pack(ssy0, ssy1), fixmul));
-
-            if(bidx)
-                swap(r, b);
-
-            // (r-Y) and (b-Y) don't fit into int16 or uint16 range
-            v_uint32 r0, r1, b0, b1;
-            v_expand(r, r0, r1);
-            v_expand(b, b0, b1);
-
-            v_uint32 uy0, uy1;
-            v_expand(y, uy0, uy1);
-
-            v_int32 sr0 = v_reinterpret_as_s32(r0);
-            v_int32 sr1 = v_reinterpret_as_s32(r1);
-            v_int32 sb0 = v_reinterpret_as_s32(b0);
-            v_int32 sb1 = v_reinterpret_as_s32(b1);
-            v_int32 sy0 = v_reinterpret_as_s32(uy0);
-            v_int32 sy1 = v_reinterpret_as_s32(uy1);
-
-            sr0 = sr0 - sy0; sr1 = sr1 - sy1;
-            sb0 = sb0 - sy0; sb1 = sb1 - sy1;
-
-            v_int32 scr0, scr1, scb0, scb1;
-
-            scr0 = (sr0*vc3 + vdd) >> shift;
-            scr1 = (sr1*vc3 + vdd) >> shift;
-            scb0 = (sb0*vc4 + vdd) >> shift;
-            scb1 = (sb1*vc4 + vdd) >> shift;
-
-            // saturate and pack
-            cr = v_pack_u(scr0, scr1);
-            cb = v_pack_u(scb0, scb1);
-
-            if(yuvOrder)
-            {
-                v_store_interleave(dst, y, cb, cr);
-            }
-            else
-            {
-                v_store_interleave(dst, y, cr, cb);
-            }
-        }
-        vx_cleanup();
-#endif
-        for( ; i < n; i++, src += scn, dst += 3)
-        {
-            int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift);
-            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + sdelta, shift);
-            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + sdelta, shift);
-            dst[0]          = saturate_cast<ushort>(Y);
-            dst[1+yuvOrder] = saturate_cast<ushort>(Cr);
-            dst[2-yuvOrder] = saturate_cast<ushort>(Cb);
-        }
-    }
-    int srccn, blueIdx;
-    bool isCrCb;
-    int coeffs[5];
-};
-
-
-template <>
-struct RGB2YCrCb_i<uchar>
-{
-    typedef uchar channel_type;
-    static const int shift = yuv_shift;
-
-    RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb)
-        : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
-    {
-        static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI };
-        static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI };
-        for(int i = 0; i < 5; i++)
-        {
-            coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i];
-        }
-        if (blueIdx==0)
-            std::swap(coeffs[0], coeffs[2]);
-    }
-
-    void operator()(const uchar * src, uchar * dst, int n) const
-    {
-        int scn = srccn, bidx = blueIdx, i = 0;
-        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
-        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
-        int delta = ColorChannel<uchar>::half()*(1 << shift);
-
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
-        const int descaleShift = 1 << (shift-1);
-        v_int16 bg2y;
-        v_int16 r12y;
-        v_int16 dummy;
-        v_zip(vx_setall_s16((short)C0), vx_setall_s16((short)C1), bg2y, dummy);
-        v_zip(vx_setall_s16((short)C2), vx_setall_s16( 1), r12y, dummy);
-
-        // delta + descaleShift == descaleShift*(half*2+1)
-        v_int16 c3h, c4h;
-        const short h21 = (short)(ColorChannel<uchar>::half()*2+1);
-        v_zip(vx_setall_s16((short)C3), vx_setall_s16(h21), c3h, dummy);
-        v_zip(vx_setall_s16((short)C4), vx_setall_s16(h21), c4h, dummy);
-
-        v_int16 vdescale = vx_setall_s16(descaleShift);
-
-        for( ; i <= n-vsize;
-             i += vsize, src += scn*vsize, dst += 3*vsize)
-        {
-            v_uint8 r, g, b, a;
-            if(scn == 3)
-            {
-                v_load_deinterleave(src, b, g, r);
-            }
-            else
-            {
-                v_load_deinterleave(src, b, g, r, a);
-            }
-
-            v_uint8 y;
-
-            v_uint16 r0, r1, g0, g1, b0, b1;
-            v_expand(r, r0, r1);
-            v_expand(g, g0, g1);
-            v_expand(b, b0, b1);
-
-            v_int16 sr0, sr1, sg0, sg1, sb0, sb1;
-            sr0 = v_reinterpret_as_s16(r0); sr1 = v_reinterpret_as_s16(r1);
-            sg0 = v_reinterpret_as_s16(g0); sg1 = v_reinterpret_as_s16(g1);
-            sb0 = v_reinterpret_as_s16(b0); sb1 = v_reinterpret_as_s16(b1);
-
-            v_uint32 y00, y01, y10, y11;
-            {
-                v_int16 bg00, bg01, bg10, bg11;
-                v_int16 rd00, rd01, rd10, rd11;
-                v_zip(sb0, sg0, bg00, bg01);
-                v_zip(sb1, sg1, bg10, bg11);
-                v_zip(sr0, vdescale, rd00, rd01);
-                v_zip(sr1, vdescale, rd10, rd11);
-
-                y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift;
-                y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift;
-                y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift;
-                y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift;
-            }
-
-            v_uint16 y0, y1;
-            y0 = v_pack(y00, y01);
-            y1 = v_pack(y10, y11);
-
-            y = v_pack(y0, y1);
-
-            v_int16 sy0, sy1;
-            sy0 = v_reinterpret_as_s16(y0);
-            sy1 = v_reinterpret_as_s16(y1);
-
-            // (r-Y) and (b-Y) don't fit into 8 bit, use 16 bits instead
-            sr0 = v_sub_wrap(sr0, sy0);
-            sr1 = v_sub_wrap(sr1, sy1);
-            sb0 = v_sub_wrap(sb0, sy0);
-            sb1 = v_sub_wrap(sb1, sy1);
-
-            if(bidx)
-            {
-                swap(sr0, sb0); swap(sr1, sb1);
-            }
-
-            v_int32 cr00, cr01, cr10, cr11;
-            v_int32 cb00, cb01, cb10, cb11;
-
-            // delta + descaleShift == descaleShift*(half*2+1)
-            {
-                v_int16 rd00, rd01, rd10, rd11;
-                v_int16 bd00, bd01, bd10, bd11;
-
-                v_zip(sr0, vdescale, rd00, rd01);
-                v_zip(sr1, vdescale, rd10, rd11);
-
-                v_zip(sb0, vdescale, bd00, bd01);
-                v_zip(sb1, vdescale, bd10, bd11);
-
-                cr00 = v_dotprod(rd00, c3h);
-                cr01 = v_dotprod(rd01, c3h);
-                cr10 = v_dotprod(rd10, c3h);
-                cr11 = v_dotprod(rd11, c3h);
-
-                cb00 = v_dotprod(bd00, c4h);
-                cb01 = v_dotprod(bd01, c4h);
-                cb10 = v_dotprod(bd10, c4h);
-                cb11 = v_dotprod(bd11, c4h);
-            }
-
-            v_uint8 cr, cb;
-
-            cr00 = cr00 >> shift;
-            cr01 = cr01 >> shift;
-            cr10 = cr10 >> shift;
-            cr11 = cr11 >> shift;
-
-            cb00 = cb00 >> shift;
-            cb01 = cb01 >> shift;
-            cb10 = cb10 >> shift;
-            cb11 = cb11 >> shift;
-
-            v_int16 cr0, cr1, cb0, cb1;
-            cr0 = v_pack(cr00, cr01); cr1 = v_pack(cr10, cr11);
-            cb0 = v_pack(cb00, cb01); cb1 = v_pack(cb10, cb11);
-
-            cr = v_pack_u(cr0, cr1);
-            cb = v_pack_u(cb0, cb1);
-
-            if(yuvOrder)
-            {
-                v_store_interleave(dst, y, cb, cr);
-            }
-            else
-            {
-                v_store_interleave(dst, y, cr, cb);
-            }
-        }
-        vx_cleanup();
-#endif
-
-        for ( ; i < n; i++, src += scn, dst += 3)
-        {
-            int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift);
-            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, shift);
-            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, shift);
-            dst[0] = saturate_cast<uchar>(Y);
-            dst[1+yuvOrder] = saturate_cast<uchar>(Cr);
-            dst[2-yuvOrder] = saturate_cast<uchar>(Cb);
-        }
-    }
-
-    int srccn, blueIdx, coeffs[5];
-    bool isCrCb;
-};
-
-
-template<typename _Tp> struct YCrCb2RGB_f
-{
-    typedef _Tp channel_type;
-
-    YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb)
-        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
-    {
-        static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF};
-        static const float coeffs_yuv[] = { V2RF,  V2GF,  U2GF,  U2BF};
-        for(int i = 0; i < 4; i++)
-        {
-            coeffs[i] = isCrCb ? coeffs_cbr[i] : coeffs_yuv[i];
-        }
-    }
-    void operator()(const _Tp* src, _Tp* dst, int n) const
-    {
-        int dcn = dstcn, bidx = blueIdx;
-        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
-        const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
-        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
-        n *= 3;
-        for(int i = 0; i < n; i += 3, dst += dcn)
-        {
-            _Tp Y = src[i];
-            _Tp Cr = src[i+1+yuvOrder];
-            _Tp Cb = src[i+2-yuvOrder];
-
-            _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3);
-            _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1);
-            _Tp r = saturate_cast<_Tp>(Y + (Cr - delta)*C0);
-
-            dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
-            if( dcn == 4 )
-                dst[3] = alpha;
-        }
-    }
-    int dstcn, blueIdx;
-    bool isCrCb;
-    float coeffs[4];
-};
-
-
-template<>
-struct YCrCb2RGB_f<float>
-{
-    typedef float channel_type;
-
-    YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb)
-        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
-    {
-        static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF};
-        static const float coeffs_yuv[] = { V2RF,  V2GF,  U2GF,  U2BF};
-        for(int i = 0; i < 4; i++)
-        {
-            coeffs[i] = isCrCb ? coeffs_cbr[i] : coeffs_yuv[i];
-        }
-    }
-
-    void operator()(const float* src, float* dst, int n) const
-    {
-        int dcn = dstcn, bidx = blueIdx;
-        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
-        const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
-        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
-
-        int i = 0;
-#if CV_SIMD
-        v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1);
-        v_float32 vc2 = vx_setall_f32(C2), vc3 = vx_setall_f32(C3);
-        v_float32 vdelta = vx_setall_f32(delta);
-        v_float32 valpha = vx_setall_f32(alpha);
-        const int vsize = v_float32::nlanes;
-        for( ; i <= n-vsize;
-             i += vsize, src += vsize*3, dst += vsize*dcn)
-        {
-            v_float32 y, cr, cb;
-            if(yuvOrder)
-                v_load_deinterleave(src, y, cb, cr);
-            else
-                v_load_deinterleave(src, y, cr, cb);
-
-            v_float32 b, g, r;
-
-            cb -= vdelta; cr -= vdelta;
-            b = v_fma(cb, vc3, y);
-            g = v_fma(cr, vc1, v_fma(cb, vc2, y));
-            r = v_fma(cr, vc0, y);
-
-            if(bidx)
-                swap(r, b);
-
-            if(dcn == 3)
-                v_store_interleave(dst, b, g, r);
-            else
-                v_store_interleave(dst, b, g, r, valpha);
-        }
-        vx_cleanup();
-#endif
-        for(; i < n; i++, src += 3, dst += dcn)
-        {
-            float Y  = src[0];
-            float Cr = src[1+yuvOrder];
-            float Cb = src[2-yuvOrder];
-
-            float b = Y + (Cb - delta)*C3;
-            float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
-            float r = Y + (Cr - delta)*C0;
-
-            dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
-            if( dcn == 4 )
-                dst[3] = alpha;
-        }
-    }
-    int dstcn, blueIdx;
-    bool isCrCb;
-    float coeffs[4];
-};
-
-
-template<typename _Tp> struct YCrCb2RGB_i
-{
-    typedef _Tp channel_type;
-    static const int shift = yuv_shift;
-
-    YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb)
-        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
-    {
-        static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI};
-        static const int coeffs_yuv[] = {  V2RI,  V2GI,  U2GI, U2BI };
-        for(int i = 0; i < 4; i++)
-        {
-            coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i];
-        }
-    }
-
-    void operator()(const _Tp* src, _Tp* dst, int n) const
-    {
-        int dcn = dstcn, bidx = blueIdx;
-        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
-        const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
-        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
-        n *= 3;
-        for(int i = 0; i < n; i += 3, dst += dcn)
-        {
-            _Tp Y = src[i];
-            _Tp Cr = src[i+1+yuvOrder];
-            _Tp Cb = src[i+2-yuvOrder];
-
-            int b = Y + CV_DESCALE((Cb - delta)*C3, shift);
-            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift);
-            int r = Y + CV_DESCALE((Cr - delta)*C0, shift);
-
-            dst[bidx] = saturate_cast<_Tp>(b);
-            dst[1] = saturate_cast<_Tp>(g);
-            dst[bidx^2] = saturate_cast<_Tp>(r);
-            if( dcn == 4 )
-                dst[3] = alpha;
-        }
-    }
-    int dstcn, blueIdx;
-    bool isCrCb;
-    int coeffs[4];
-};
-
-
-template <>
-struct YCrCb2RGB_i<uchar>
-{
-    typedef uchar channel_type;
-    static const int shift = yuv_shift;
-
-    YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb)
-        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
-    {
-        static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI};
-        static const int coeffs_yuv[] = {  V2RI,  V2GI,  U2GI, U2BI };
-        for(int i = 0; i < 4; i++)
-        {
-            coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i];
-        }
-    }
-
-    void operator()(const uchar* src, uchar* dst, int n) const
-    {
-        int dcn = dstcn, bidx = blueIdx, i = 0;
-        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
-        const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
-        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
-
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
-        v_uint8 valpha = vx_setall_u8(alpha);
-        v_uint8 vdelta = vx_setall_u8(delta);
-        const int descaleShift = 1 << (shift - 1);
-        v_int32 vdescale = vx_setall_s32(descaleShift);
-
-        v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2);
-        // if YUV then C3 > 2^15, need to subtract it
-        // to fit in short by short multiplication
-        v_int16 vc3 = vx_setall_s16(yuvOrder ? (short)(C3-(1 << 15)) : (short)C3);
-
-        for( ; i <= n-vsize;
-             i += vsize, src += 3*vsize, dst += dcn*vsize)
-        {
-            v_uint8 y, cr, cb;
-            if(yuvOrder)
-            {
-                v_load_deinterleave(src, y, cb, cr);
-            }
-            else
-            {
-                v_load_deinterleave(src, y, cr, cb);
-            }
-
-            cr = v_sub_wrap(cr, vdelta);
-            cb = v_sub_wrap(cb, vdelta);
-
-            v_int8 scr = v_reinterpret_as_s8(cr);
-            v_int8 scb = v_reinterpret_as_s8(cb);
-
-            v_int16 scr0, scr1, scb0, scb1;
-            v_expand(scr, scr0, scr1);
-            v_expand(scb, scb0, scb1);
-
-            v_int32 b00, b01, b10, b11;
-            v_int32 g00, g01, g10, g11;
-            v_int32 r00, r01, r10, r11;
-
-            v_mul_expand(scb0, vc3, b00, b01);
-            v_mul_expand(scb1, vc3, b10, b11);
-            if(yuvOrder)
-            {
-                // if YUV then C3 > 2^15
-                // so we fix the multiplication
-                v_int32 cb00, cb01, cb10, cb11;
-                v_expand(scb0, cb00, cb01);
-                v_expand(scb1, cb10, cb11);
-                b00 += cb00 << 15; b01 += cb01 << 15;
-                b10 += cb10 << 15; b11 += cb11 << 15;
-            }
-
-            v_int32 t00, t01, t10, t11;
-            v_mul_expand(scb0, vc2, t00, t01);
-            v_mul_expand(scb1, vc2, t10, t11);
-            v_mul_expand(scr0, vc1, g00, g01);
-            v_mul_expand(scr1, vc1, g10, g11);
-            g00 += t00; g01 += t01;
-            g10 += t10; g11 += t11;
-            v_mul_expand(scr0, vc0, r00, r01);
-            v_mul_expand(scr1, vc0, r10, r11);
-
-            b00 = (b00 + vdescale) >> shift; b01 = (b01 + vdescale) >> shift;
-            b10 = (b10 + vdescale) >> shift; b11 = (b11 + vdescale) >> shift;
-            g00 = (g00 + vdescale) >> shift; g01 = (g01 + vdescale) >> shift;
-            g10 = (g10 + vdescale) >> shift; g11 = (g11 + vdescale) >> shift;
-            r00 = (r00 + vdescale) >> shift; r01 = (r01 + vdescale) >> shift;
-            r10 = (r10 + vdescale) >> shift; r11 = (r11 + vdescale) >> shift;
-
-            v_int16 b0, b1, g0, g1, r0, r1;
-            b0 = v_pack(b00, b01); b1 = v_pack(b10, b11);
-            g0 = v_pack(g00, g01); g1 = v_pack(g10, g11);
-            r0 = v_pack(r00, r01); r1 = v_pack(r10, r11);
-
-            v_uint16 y0, y1;
-            v_expand(y, y0, y1);
-            v_int16 sy0, sy1;
-            sy0 = v_reinterpret_as_s16(y0);
-            sy1 = v_reinterpret_as_s16(y1);
-
-            b0 = v_add_wrap(b0, sy0); b1 = v_add_wrap(b1, sy1);
-            g0 = v_add_wrap(g0, sy0); g1 = v_add_wrap(g1, sy1);
-            r0 = v_add_wrap(r0, sy0); r1 = v_add_wrap(r1, sy1);
-
-            v_uint8 b, g, r;
-            b = v_pack_u(b0, b1);
-            g = v_pack_u(g0, g1);
-            r = v_pack_u(r0, r1);
-
-            if(bidx)
-                swap(r, b);
-
-            if(dcn == 3)
-            {
-                v_store_interleave(dst, b, g, r);
-            }
-            else
-            {
-                v_store_interleave(dst, b, g, r, valpha);
-            }
-        }
-        vx_cleanup();
-#endif
-
-        for ( ; i < n; i++, src += 3, dst += dcn)
-        {
-            uchar Y  = src[0];
-            uchar Cr = src[1+yuvOrder];
-            uchar Cb = src[2-yuvOrder];
-
-            int b = Y + CV_DESCALE((Cb - delta)*C3, shift);
-            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift);
-            int r = Y + CV_DESCALE((Cr - delta)*C0, shift);
-
-            dst[bidx] = saturate_cast<uchar>(b);
-            dst[1] = saturate_cast<uchar>(g);
-            dst[bidx^2] = saturate_cast<uchar>(r);
-            if( dcn == 4 )
-                dst[3] = alpha;
-        }
-    }
-    int dstcn, blueIdx;
-    bool isCrCb;
-    int coeffs[4];
-};
+#include "opencl_kernels_imgproc.hpp"
 
+#include "color.hpp"
 
-template <>
-struct YCrCb2RGB_i<ushort>
-{
-    typedef ushort channel_type;
-    static const int shift = yuv_shift;
-
-    YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb)
-        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
-    {
-        static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI};
-        static const int coeffs_yuv[] = {  V2RI,  V2GI,  U2GI, U2BI };
-        for(int i = 0; i < 4; i++)
-        {
-            coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i];
-        }
-    }
-
-    void operator()(const ushort* src, ushort* dst, int n) const
-    {
-        int dcn = dstcn, bidx = blueIdx, i = 0;
-        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
-        const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
-        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
-
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
-        const int descaleShift = 1 << (shift-1);
-        v_uint16 valpha = vx_setall_u16(alpha);
-        v_uint16 vdelta = vx_setall_u16(delta);
-        v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2);
-        // if YUV then C3 > 2^15, need to subtract it
-        // to fit in short by short multiplication
-        v_int16 vc3 = vx_setall_s16(yuvOrder ? (short)(C3-(1 << 15)) : (short)C3);
-        v_int32 vdescale = vx_setall_s32(descaleShift);
-        for(; i <= n-vsize;
-            i += vsize, src += vsize*3, dst += vsize*dcn)
-        {
-            v_uint16 y, cr, cb;
-            if(yuvOrder)
-            {
-                v_load_deinterleave(src, y, cb, cr);
-            }
-            else
-            {
-                v_load_deinterleave(src, y, cr, cb);
-            }
-
-            v_uint32 uy0, uy1;
-            v_expand(y, uy0, uy1);
-            v_int32 y0 = v_reinterpret_as_s32(uy0);
-            v_int32 y1 = v_reinterpret_as_s32(uy1);
-
-            cr = v_sub_wrap(cr, vdelta);
-            cb = v_sub_wrap(cb, vdelta);
-
-            v_int32 b0, b1, g0, g1, r0, r1;
-
-            v_int16 scb = v_reinterpret_as_s16(cb);
-            v_int16 scr = v_reinterpret_as_s16(cr);
-            v_mul_expand(scb, vc3, b0, b1);
-            if(yuvOrder)
-            {
-                // if YUV then C3 > 2^15
-                // so we fix the multiplication
-                v_int32 cb0, cb1;
-                v_expand(scb, cb0, cb1);
-                b0 += cb0 << 15;
-                b1 += cb1 << 15;
-            }
-            v_int32 t0, t1;
-            v_mul_expand(scb, vc2, t0, t1);
-            v_mul_expand(scr, vc1, g0, g1);
-            g0 += t0; g1 += t1;
-            v_mul_expand(scr, vc0, r0, r1);
-
-            // shifted term doesn't fit into 16 bits, addition is to be done in 32 bits
-            b0 = ((b0 + vdescale) >> shift) + y0;
-            b1 = ((b1 + vdescale) >> shift) + y1;
-            g0 = ((g0 + vdescale) >> shift) + y0;
-            g1 = ((g1 + vdescale) >> shift) + y1;
-            r0 = ((r0 + vdescale) >> shift) + y0;
-            r1 = ((r1 + vdescale) >> shift) + y1;
-
-            // saturate and pack
-            v_uint16 b, g, r;
-            b = v_pack_u(b0, b1);
-            g = v_pack_u(g0, g1);
-            r = v_pack_u(r0, r1);
-
-            if(bidx)
-                swap(r, b);
-
-            if(dcn == 3)
-            {
-                v_store_interleave(dst, b, g, r);
-            }
-            else
-            {
-                v_store_interleave(dst, b, g, r, valpha);
-            }
-        }
-        vx_cleanup();
-#endif
-
-        for ( ; i < n; i++, src += 3, dst += dcn)
-        {
-            ushort Y  = src[0];
-            ushort Cr = src[1+yuvOrder];
-            ushort Cb = src[2-yuvOrder];
-
-            int b = Y + CV_DESCALE((Cb - delta)*C3, shift);
-            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift);
-            int r = Y + CV_DESCALE((Cr - delta)*C0, shift);
-
-            dst[bidx]   = saturate_cast<ushort>(b);
-            dst[1]      = saturate_cast<ushort>(g);
-            dst[bidx^2] = saturate_cast<ushort>(r);
-            if( dcn == 4 )
-                dst[3] = alpha;
-        }
-    }
-    int dstcn, blueIdx;
-    bool isCrCb;
-    int coeffs[4];
-};
-
-
-///////////////////////////////////// YUV420 -> RGB /////////////////////////////////////
-
-static const int ITUR_BT_601_CY = 1220542;
-static const int ITUR_BT_601_CUB = 2116026;
-static const int ITUR_BT_601_CUG = -409993;
-static const int ITUR_BT_601_CVG = -852492;
-static const int ITUR_BT_601_CVR = 1673527;
-static const int ITUR_BT_601_SHIFT = 20;
-
-// Coefficients for RGB to YUV420p conversion
-static const int ITUR_BT_601_CRY =  269484;
-static const int ITUR_BT_601_CGY =  528482;
-static const int ITUR_BT_601_CBY =  102760;
-static const int ITUR_BT_601_CRU = -155188;
-static const int ITUR_BT_601_CGU = -305135;
-static const int ITUR_BT_601_CBU =  460324;
-static const int ITUR_BT_601_CGV = -385875;
-static const int ITUR_BT_601_CBV = -74448;
-
-//R = 1.164(Y - 16) + 1.596(V - 128)
-//G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
-//B = 1.164(Y - 16)                  + 2.018(U - 128)
-
-//R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
-//G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
-//B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
-
-static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv)
-{
-    int uu, vv;
-    uu = int(u) - 128;
-    vv = int(v) - 128;
-
-    ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * vv;
-    guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * vv + ITUR_BT_601_CUG * uu;
-    buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
-}
-
-static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
-                             v_int32 (&ruv)[4],
-                             v_int32 (&guv)[4],
-                             v_int32 (&buv)[4])
-{
-    v_uint8 v128 = vx_setall_u8(128);
-    v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128));
-    v_int8 sv = v_reinterpret_as_s8(v_sub_wrap(v, v128));
-
-    v_int16 uu0, uu1, vv0, vv1;
-    v_expand(su, uu0, uu1);
-    v_expand(sv, vv0, vv1);
-    v_int32 uu[4], vv[4];
-    v_expand(uu0, uu[0], uu[1]); v_expand(uu1, uu[2], uu[3]);
-    v_expand(vv0, vv[0], vv[1]); v_expand(vv1, vv[2], vv[3]);
-
-    v_int32 vshift = vx_setall_s32(1 << (ITUR_BT_601_SHIFT - 1));
-    v_int32 vr = vx_setall_s32(ITUR_BT_601_CVR);
-    v_int32 vg = vx_setall_s32(ITUR_BT_601_CVG);
-    v_int32 ug = vx_setall_s32(ITUR_BT_601_CUG);
-    v_int32 ub = vx_setall_s32(ITUR_BT_601_CUB);
-
-    for (int k = 0; k < 4; k++)
-    {
-        ruv[k] = vshift + vr * vv[k];
-        guv[k] = vshift + vg * vv[k] + ug * uu[k];
-        buv[k] = vshift + ub * uu[k];
-    }
-}
-
-static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, const int buv,
-                                uchar& r, uchar& g, uchar& b, uchar& a)
-{
-    int yy = int(vy);
-    int y = std::max(0, yy - 16) * ITUR_BT_601_CY;
-    r = saturate_cast<uchar>((y + ruv) >> ITUR_BT_601_SHIFT);
-    g = saturate_cast<uchar>((y + guv) >> ITUR_BT_601_SHIFT);
-    b = saturate_cast<uchar>((y + buv) >> ITUR_BT_601_SHIFT);
-    a = uchar(0xff);
-}
-
-static inline void yRGBuvToRGBA(const v_uint8& vy,
-                                const v_int32 (&ruv)[4],
-                                const v_int32 (&guv)[4],
-                                const v_int32 (&buv)[4],
-                                v_uint8& rr, v_uint8& gg, v_uint8& bb)
-{
-    v_uint8 v16 = vx_setall_u8(16);
-    v_uint8 posY = vy - v16;
-    v_uint16 yy0, yy1;
-    v_expand(posY, yy0, yy1);
-    v_int32 yy[4];
-    v_int32 yy00, yy01, yy10, yy11;
-    v_expand(v_reinterpret_as_s16(yy0), yy[0], yy[1]);
-    v_expand(v_reinterpret_as_s16(yy1), yy[2], yy[3]);
-
-    v_int32 vcy = vx_setall_s32(ITUR_BT_601_CY);
-
-    v_int32 y[4], r[4], g[4], b[4];
-    for(int k = 0; k < 4; k++)
-    {
-        y[k] = yy[k]*vcy;
-        r[k] = (y[k] + ruv[k]) >> ITUR_BT_601_SHIFT;
-        g[k] = (y[k] + guv[k]) >> ITUR_BT_601_SHIFT;
-        b[k] = (y[k] + buv[k]) >> ITUR_BT_601_SHIFT;
-    }
-
-    v_int16 r0, r1, g0, g1, b0, b1;
-    r0 = v_pack(r[0], r[1]);
-    r1 = v_pack(r[2], r[3]);
-    g0 = v_pack(g[0], g[1]);
-    g1 = v_pack(g[2], g[3]);
-    b0 = v_pack(b[0], b[1]);
-    b1 = v_pack(b[2], b[3]);
-
-    rr = v_pack_u(r0, r1);
-    gg = v_pack_u(g0, g1);
-    bb = v_pack_u(b0, b1);
-}
-
-template<int bIdx, int dcn, bool is420>
-static inline void cvtYuv42xxp2RGB8(const uchar u, const uchar v,
-                                    const uchar vy01, const uchar vy11, const uchar vy02, const uchar vy12,
-                                    uchar* row1, uchar* row2)
-{
-    int ruv, guv, buv;
-    uvToRGBuv(u, v, ruv, guv, buv);
-
-    uchar r00, g00, b00, a00;
-    uchar r01, g01, b01, a01;
-
-    yRGBuvToRGBA(vy01, ruv, guv, buv, r00, g00, b00, a00);
-    yRGBuvToRGBA(vy11, ruv, guv, buv, r01, g01, b01, a01);
-
-    row1[2-bIdx] = r00;
-    row1[1]      = g00;
-    row1[bIdx]   = b00;
-    if(dcn == 4)
-        row1[3] = a00;
-
-    row1[dcn+2-bIdx] = r01;
-    row1[dcn+1]      = g01;
-    row1[dcn+0+bIdx] = b01;
-    if(dcn == 4)
-        row1[7] = a01;
-
-    if(is420)
-    {
-        uchar r10, g10, b10, a10;
-        uchar r11, g11, b11, a11;
-
-        yRGBuvToRGBA(vy02, ruv, guv, buv, r10, g10, b10, a10);
-        yRGBuvToRGBA(vy12, ruv, guv, buv, r11, g11, b11, a11);
-
-        row2[2-bIdx] = r10;
-        row2[1]      = g10;
-        row2[bIdx]   = b10;
-        if(dcn == 4)
-            row2[3] = a10;
-
-        row2[dcn+2-bIdx] = r11;
-        row2[dcn+1]      = g11;
-        row2[dcn+0+bIdx] = b11;
-        if(dcn == 4)
-            row2[7] = a11;
-    }
-}
-
-// bIdx is 0 or 2, uIdx is 0 or 1, dcn is 3 or 4
-template<int bIdx, int uIdx, int dcn>
-struct YUV420sp2RGB8Invoker : ParallelLoopBody
-{
-    uchar * dst_data;
-    size_t dst_step;
-    int width;
-    const uchar* my1, *muv;
-    size_t stride;
-
-    YUV420sp2RGB8Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _uv)
-        : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), muv(_uv), stride(_stride) {}
-
-    void operator()(const Range& range) const CV_OVERRIDE
-    {
-        const int rangeBegin = range.start * 2;
-        const int rangeEnd   = range.end   * 2;
-
-        const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
-
-        for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
-        {
-            uchar* row1 = dst_data + dst_step * j;
-            uchar* row2 = dst_data + dst_step * (j + 1);
-            const uchar* y2 = y1 + stride;
-
-            int i = 0;
-#if CV_SIMD
-            const int vsize = v_uint8::nlanes;
-            v_uint8 a = vx_setall_u8(uchar(0xff));
-            for( ; i <= width - 2*vsize;
-                 i += 2*vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2)
-            {
-                v_uint8 u, v;
-                v_load_deinterleave(uv + i, u, v);
-
-                if(uIdx)
-                {
-                    swap(u, v);
-                }
-
-                v_uint8 vy[4];
-                v_load_deinterleave(y1 + i, vy[0], vy[1]);
-                v_load_deinterleave(y2 + i, vy[2], vy[3]);
-
-                v_int32 ruv[4], guv[4], buv[4];
-                uvToRGBuv(u, v, ruv, guv, buv);
-
-                v_uint8 r[4], g[4], b[4];
-
-                for(int k = 0; k < 4; k++)
-                {
-                    yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]);
-                }
-
-                if(bIdx)
-                {
-                    for(int k = 0; k < 4; k++)
-                        swap(r[k], b[k]);
-                }
-
-                // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
-                v_uint8 r0_0, r0_1, r1_0, r1_1;
-                v_zip(r[0], r[1], r0_0, r0_1);
-                v_zip(r[2], r[3], r1_0, r1_1);
-                v_uint8 g0_0, g0_1, g1_0, g1_1;
-                v_zip(g[0], g[1], g0_0, g0_1);
-                v_zip(g[2], g[3], g1_0, g1_1);
-                v_uint8 b0_0, b0_1, b1_0, b1_1;
-                v_zip(b[0], b[1], b0_0, b0_1);
-                v_zip(b[2], b[3], b1_0, b1_1);
-
-                if(dcn == 4)
-                {
-                    v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a);
-                    v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a);
-
-                    v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a);
-                    v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a);
-                }
-                else //dcn == 3
-                {
-                    v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0);
-                    v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1);
-
-                    v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0);
-                    v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1);
-                }
-            }
-            vx_cleanup();
-#endif
-            for ( ; i < width; i += 2, row1 += dcn*2, row2 += dcn*2)
-            {
-                uchar u = uv[i + 0 + uIdx];
-                uchar v = uv[i + 1 - uIdx];
-
-                uchar vy01 = y1[i];
-                uchar vy11 = y1[i + 1];
-                uchar vy02 = y2[i];
-                uchar vy12 = y2[i + 1];
-
-                cvtYuv42xxp2RGB8<bIdx, dcn, true>(u, v, vy01, vy11, vy02, vy12, row1, row2);
-            }
-        }
-    }
-};
-
-template<int bIdx, int dcn>
-struct YUV420p2RGB8Invoker : ParallelLoopBody
-{
-    uchar * dst_data;
-    size_t dst_step;
-    int width;
-    const uchar* my1, *mu, *mv;
-    size_t stride;
-    int ustepIdx, vstepIdx;
-
-    YUV420p2RGB8Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
-        : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), mu(_u), mv(_v), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
-
-    void operator()(const Range& range) const CV_OVERRIDE
-    {
-        const int rangeBegin = range.start * 2;
-        const int rangeEnd = range.end * 2;
-
-        int uvsteps[2] = {width/2, static_cast<int>(stride) - width/2};
-        int usIdx = ustepIdx, vsIdx = vstepIdx;
-
-        const uchar* y1 = my1 + rangeBegin * stride;
-        const uchar* u1 = mu + (range.start / 2) * stride;
-        const uchar* v1 = mv + (range.start / 2) * stride;
-
-        if(range.start % 2 == 1)
-        {
-            u1 += uvsteps[(usIdx++) & 1];
-            v1 += uvsteps[(vsIdx++) & 1];
-        }
-
-        for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
-        {
-            uchar* row1 = dst_data + dst_step * j;
-            uchar* row2 = dst_data + dst_step * (j + 1);
-            const uchar* y2 = y1 + stride;
-            int i = 0;
-
-#if CV_SIMD
-            const int vsize = v_uint8::nlanes;
-            v_uint8 a = vx_setall_u8(uchar(0xff));
-            for( ; i <= width/2 - vsize;
-                 i += vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2)
-            {
-                v_uint8 u, v;
-                u = vx_load(u1 + i);
-                v = vx_load(v1 + i);
-
-                v_uint8 vy[4];
-                v_load_deinterleave(y1 + 2*i, vy[0], vy[1]);
-                v_load_deinterleave(y2 + 2*i, vy[2], vy[3]);
-
-                v_int32 ruv[4], guv[4], buv[4];
-                uvToRGBuv(u, v, ruv, guv, buv);
-
-                v_uint8 r[4], g[4], b[4];
-
-                for(int k = 0; k < 4; k++)
-                {
-                    yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]);
-                }
-
-                if(bIdx)
-                {
-                    for(int k = 0; k < 4; k++)
-                        swap(r[k], b[k]);
-                }
-
-                // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
-                v_uint8 r0_0, r0_1, r1_0, r1_1;
-                v_zip(r[0], r[1], r0_0, r0_1);
-                v_zip(r[2], r[3], r1_0, r1_1);
-                v_uint8 g0_0, g0_1, g1_0, g1_1;
-                v_zip(g[0], g[1], g0_0, g0_1);
-                v_zip(g[2], g[3], g1_0, g1_1);
-                v_uint8 b0_0, b0_1, b1_0, b1_1;
-                v_zip(b[0], b[1], b0_0, b0_1);
-                v_zip(b[2], b[3], b1_0, b1_1);
-
-                if(dcn == 4)
-                {
-                    v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a);
-                    v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a);
-
-                    v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a);
-                    v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a);
-                }
-                else //dcn == 3
-                {
-                    v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0);
-                    v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1);
-
-                    v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0);
-                    v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1);
-                }
-            }
-            vx_cleanup();
-#endif
-            for (; i < width / 2; i += 1, row1 += dcn*2, row2 += dcn*2)
-            {
-                uchar u = u1[i];
-                uchar v = v1[i];
-
-                uchar vy01 = y1[2 * i];
-                uchar vy11 = y1[2 * i + 1];
-                uchar vy02 = y2[2 * i];
-                uchar vy12 = y2[2 * i + 1];
-
-                cvtYuv42xxp2RGB8<bIdx, dcn, true>(u, v, vy01, vy11, vy02, vy12, row1, row2);
-            }
-        }
-    }
-};
-
-
-#define MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION (320*240)
-
-template<int bIdx, int uIdx, int dcn>
-inline void cvtYUV420sp2RGB(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _uv)
-{
-    YUV420sp2RGB8Invoker<bIdx, uIdx, dcn> converter(dst_data, dst_step, dst_width, _stride, _y1,  _uv);
-    if (dst_width * dst_height >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for_(Range(0, dst_height/2), converter);
-    else
-        converter(Range(0, dst_height/2));
-}
-
-template<int bIdx, int dcn>
-inline void cvtYUV420p2RGB(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
-{
-    YUV420p2RGB8Invoker<bIdx, dcn> converter(dst_data, dst_step, dst_width, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
-    if (dst_width * dst_height >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for_(Range(0, dst_height/2), converter);
-    else
-        converter(Range(0, dst_height/2));
-}
-
-///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
-
-static inline uchar rgbToY42x(uchar r, uchar g, uchar b)
-{
-    const int shifted16 = (16 << ITUR_BT_601_SHIFT);
-    const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
-    int yy = ITUR_BT_601_CRY * r + ITUR_BT_601_CGY * g + ITUR_BT_601_CBY * b + halfShift + shifted16;
-
-    return saturate_cast<uchar>(yy >> ITUR_BT_601_SHIFT);
-}
-
-static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b)
-{
-    const int shifted16 = (16 << ITUR_BT_601_SHIFT);
-    const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
-    v_uint16 r0, r1, g0, g1, b0, b1;
-    v_expand(r, r0, r1);
-    v_expand(g, g0, g1);
-    v_expand(b, b0, b1);
-
-    v_uint32 rq[4], gq[4], bq[4];
-    v_expand(r0, rq[0], rq[1]); v_expand(r1, rq[2], rq[3]);
-    v_expand(g0, gq[0], gq[1]); v_expand(g1, gq[2], gq[3]);
-    v_expand(b0, bq[0], bq[1]); v_expand(b1, bq[2], bq[3]);
-
-    v_uint32 ry = vx_setall_u32(ITUR_BT_601_CRY), gy = vx_setall_u32(ITUR_BT_601_CGY);
-    v_uint32 by = vx_setall_u32(ITUR_BT_601_CBY), shift = vx_setall_u32(halfShift + shifted16);
-
-    v_uint32 y[4];
-    for(int k = 0; k < 4; k++)
-    {
-        y[k] = (rq[k]*ry + gq[k]*gy + bq[k]*by + shift) >> ITUR_BT_601_SHIFT;
-    }
-
-    v_uint16 y0, y1;
-    y0 = v_pack(y[0], y[1]);
-    y1 = v_pack(y[2], y[3]);
-
-    return v_pack(y0, y1);
-}
-
-static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
-{
-    const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
-    const int shifted128 = (128 << ITUR_BT_601_SHIFT);
-    int uu = ITUR_BT_601_CRU * r + ITUR_BT_601_CGU * g + ITUR_BT_601_CBU * b + halfShift + shifted128;
-    int vv = ITUR_BT_601_CBU * r + ITUR_BT_601_CGV * g + ITUR_BT_601_CBV * b + halfShift + shifted128;
-
-    u = saturate_cast<uchar>(uu >> ITUR_BT_601_SHIFT);
-    v = saturate_cast<uchar>(vv >> ITUR_BT_601_SHIFT);
-}
-
-static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1,
-                              const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v)
-{
-    // [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..]
-    v_int16 vlowByte = vx_setall_s16(0x00ff);
-    v_int16 rd0, rd1, gd0, gd1, bd0, bd1;
-    rd0 = v_reinterpret_as_s16(r0) & vlowByte;
-    rd1 = v_reinterpret_as_s16(r1) & vlowByte;
-    gd0 = v_reinterpret_as_s16(g0) & vlowByte;
-    gd1 = v_reinterpret_as_s16(g1) & vlowByte;
-    bd0 = v_reinterpret_as_s16(b0) & vlowByte;
-    bd1 = v_reinterpret_as_s16(b1) & vlowByte;
-
-    v_int32 rq[4], gq[4], bq[4];
-    v_expand(rd0, rq[0], rq[1]);
-    v_expand(rd1, rq[2], rq[3]);
-    v_expand(gd0, gq[0], gq[1]);
-    v_expand(gd1, gq[2], gq[3]);
-    v_expand(bd0, bq[0], bq[1]);
-    v_expand(bd1, bq[2], bq[3]);
-
-    const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
-    const int shifted128 = (128 << ITUR_BT_601_SHIFT);
-    v_int32 shift = vx_setall_s32(halfShift + shifted128);
-    v_int32 ru, gu, bu, gv, bv;
-    ru = vx_setall_s32(ITUR_BT_601_CRU);
-    gu = vx_setall_s32(ITUR_BT_601_CGU);
-    gv = vx_setall_s32(ITUR_BT_601_CGV);
-    bu = vx_setall_s32(ITUR_BT_601_CBU);
-    bv = vx_setall_s32(ITUR_BT_601_CBV);
-
-    v_int32 uq[4], vq[4];
-    for(int k = 0; k < 4; k++)
-    {
-        uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT;
-        vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT;
-    }
-
-    v_int16 u0, u1, v0, v1;
-    u0 = v_pack(uq[0], uq[1]);
-    u1 = v_pack(uq[2], uq[3]);
-    v0 = v_pack(vq[0], vq[1]);
-    v1 = v_pack(vq[2], vq[3]);
-
-    u = v_pack_u(u0, u1);
-    v = v_pack_u(v0, v1);
-}
-
-
-struct RGB8toYUV420pInvoker: public ParallelLoopBody
-{
-    RGB8toYUV420pInvoker(const uchar * _srcData, size_t _srcStep,
-                           uchar * _yData, uchar * _uvData, size_t _dstStep,
-                           int _srcWidth, int _srcHeight, int _scn, bool _swapBlue, bool _swapUV, bool _interleave)
-        : srcData(_srcData), srcStep(_srcStep),
-          yData(_yData), uvData(_uvData), dstStep(_dstStep),
-          srcWidth(_srcWidth), srcHeight(_srcHeight),
-          srcCn(_scn), swapBlue(_swapBlue), swapUV(_swapUV), interleave(_interleave) { }
-
-    void operator()(const Range& rowRange) const CV_OVERRIDE
-    {
-        const int w = srcWidth;
-        const int h = srcHeight;
-        const int scn = srcCn;
-        const uchar* srcRow = (uchar*)0;
-        uchar* yRow = (uchar*)0, *uRow = (uchar*)0, *vRow = (uchar*)0, *uvRow = (uchar*)0;
-        for( int sRow = rowRange.start*2; sRow < rowRange.end*2; sRow++)
-        {
-            srcRow = srcData + srcStep*sRow;
-            yRow = yData + dstStep * sRow;
-            bool evenRow = (sRow % 2) == 0;
-            if(evenRow)
-            {
-                if (interleave)
-                {
-                    uvRow = uvData + dstStep*(sRow/2);
-                }
-                else
-                {
-                    uRow = uvData + dstStep * (sRow/4) + ((sRow/2) % 2) * (w/2);
-                    vRow = uvData + dstStep * ((sRow + h)/4) + (((sRow + h)/2) % 2) * (w/2);
-                }
-            }
-            int i = 0;
-#if CV_SIMD
-            const int vsize = v_uint8::nlanes;
-
-            for( ; i <= w/2 - vsize;
-                 i += vsize)
-            {
-                // processing (2*vsize) pixels at once
-                v_uint8 b0, b1, g0, g1, r0, r1, a0, a1;
-                if(scn == 4)
-                {
-                    v_load_deinterleave(srcRow + 2*4*i + 0*vsize, b0, g0, r0, a0);
-                    v_load_deinterleave(srcRow + 2*4*i + 4*vsize, b1, g1, r1, a1);
-                }
-                else // scn == 3
-                {
-                    v_load_deinterleave(srcRow + 2*3*i + 0*vsize, b0, g0, r0);
-                    v_load_deinterleave(srcRow + 2*3*i + 3*vsize, b1, g1, r1);
-                }
-
-                if(swapBlue)
-                {
-                    swap(b0, r0); swap(b1, r1);
-                }
-
-                v_uint8 y0, y1;
-
-                y0 = rgbToY42x(r0, g0, b0);
-                y1 = rgbToY42x(r1, g1, b1);
-
-                v_store(yRow + 2*i + 0*vsize, y0);
-                v_store(yRow + 2*i + 1*vsize, y1);
-
-                if(evenRow)
-                {
-                    v_uint8 u, v;
-                    rgbToUV42x(r0, r1, g0, g1, b0, b1, u, v);
-
-                    if(swapUV)
-                    {
-                        swap(u, v);
-                    }
-
-                    if(interleave)
-                    {
-                        v_store_interleave(uvRow + 2*i, u, v);
-                    }
-                    else
-                    {
-                        v_store(uRow + i, u);
-                        v_store(vRow + i, v);
-                    }
-                }
-            }
-            vx_cleanup();
-#endif
-            // processing two pixels at once
-            for( ; i < w/2; i++)
-            {
-                uchar b0, g0, r0;
-                uchar b1, g1, r1;
-                b0 = srcRow[(2*i+0)*scn + 0];
-                g0 = srcRow[(2*i+0)*scn + 1];
-                r0 = srcRow[(2*i+0)*scn + 2];
-                b1 = srcRow[(2*i+1)*scn + 0];
-                g1 = srcRow[(2*i+1)*scn + 1];
-                r1 = srcRow[(2*i+1)*scn + 2];
-
-                if(swapBlue)
-                {
-                    swap(b0, r0); swap(b1, r1);
-                }
-
-                uchar y0 = rgbToY42x(r0, g0, b0);
-                uchar y1 = rgbToY42x(r1, g1, b1);
-
-                yRow[2*i+0] = y0;
-                yRow[2*i+1] = y1;
-
-                if(evenRow)
-                {
-                    uchar uu, vv;
-                    rgbToUV42x(r0, g0, b0, uu, vv);
-                    if(swapUV)
-                    {
-                        swap(uu, vv);
-                    }
-
-                    if(interleave)
-                    {
-                        uvRow[2*i+0] = uu;
-                        uvRow[2*i+1] = vv;
-                    }
-                    else
-                    {
-                        uRow[i] = uu;
-                        vRow[i] = vv;
-                    }
-                }
-            }
-        }
-    }
-
-    const uchar * srcData;
-    size_t srcStep;
-    uchar *yData, *uvData;
-    size_t dstStep;
-    int srcWidth;
-    int srcHeight;
-    const int srcCn;
-    bool swapBlue;
-    bool swapUV;
-    bool interleave;
-};
-
-
-///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
-
-// bIdx is 0 or 2; [uIdx, yIdx] is [0, 0], [0, 1], [1, 0]; dcn is 3 or 4
-template<int bIdx, int uIdx, int yIdx, int dcn>
-struct YUV422toRGB8Invoker : ParallelLoopBody
-{
-    uchar * dst_data;
-    size_t dst_step;
-    const uchar * src_data;
-    size_t src_step;
-    int width;
-
-    YUV422toRGB8Invoker(uchar * _dst_data, size_t _dst_step,
-                        const uchar * _src_data, size_t _src_step,
-                        int _width)
-        : dst_data(_dst_data), dst_step(_dst_step), src_data(_src_data), src_step(_src_step), width(_width) {}
-
-    void operator()(const Range& range) const CV_OVERRIDE
-    {
-        int rangeBegin = range.start;
-        int rangeEnd = range.end;
-
-        // [yIdx, uIdx] | [uidx, vidx]:
-        //     0, 0     |     1, 3
-        //     0, 1     |     3, 1
-        //     1, 0     |     0, 2
-        const int uidx = 1 - yIdx + uIdx * 2;
-        const int vidx = (2 + uidx) % 4;
-        const uchar* yuv_src = src_data + rangeBegin * src_step;
-
-        for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += src_step)
-        {
-            uchar* row = dst_data + dst_step * j;
-            int i = 0;
-#if CV_SIMD
-            const int vsize = v_uint8::nlanes;
-            v_uint8 a = vx_setall_u8(uchar(0xff));
-            for(; i <= 2*width - 4*vsize;
-                i += 4*vsize, row += vsize*dcn*2)
-            {
-                v_uint8 u, v, vy[2];
-                if(yIdx == 1) // UYVY
-                {
-                    v_load_deinterleave(yuv_src + i, u, vy[0], v, vy[1]);
-                }
-                else // YUYV or YVYU
-                {
-                    v_load_deinterleave(yuv_src + i, vy[0], u, vy[1], v);
-                    if(uIdx == 1) // YVYU
-                    {
-                        swap(u, v);
-                    }
-                }
-
-                v_int32 ruv[4], guv[4], buv[4];
-                uvToRGBuv(u, v, ruv, guv, buv);
-
-                v_uint8 r[2], g[2], b[2];
-
-                yRGBuvToRGBA(vy[0], ruv, guv, buv, r[0], g[0], b[0]);
-                yRGBuvToRGBA(vy[1], ruv, guv, buv, r[1], g[1], b[1]);
-
-                if(bIdx)
-                {
-                    swap(r[0], b[0]);
-                    swap(r[1], b[1]);
-                }
-
-                // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
-                v_uint8 r0_0, r0_1;
-                v_zip(r[0], r[1], r0_0, r0_1);
-                v_uint8 g0_0, g0_1;
-                v_zip(g[0], g[1], g0_0, g0_1);
-                v_uint8 b0_0, b0_1;
-                v_zip(b[0], b[1], b0_0, b0_1);
-
-                if(dcn == 4)
-                {
-                    v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0, a);
-                    v_store_interleave(row + 4*vsize, b0_1, g0_1, r0_1, a);
-                }
-                else //dcn == 3
-                {
-                    v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0);
-                    v_store_interleave(row + 3*vsize, b0_1, g0_1, r0_1);
-                }
-            }
-            vx_cleanup();
-#endif
-            for (; i < 2 * width; i += 4, row += dcn*2)
-            {
-                uchar u = yuv_src[i + uidx];
-                uchar v = yuv_src[i + vidx];
-
-                uchar vy0 = yuv_src[i + yIdx];
-                uchar vy1 = yuv_src[i + yIdx + 2];
-
-                cvtYuv42xxp2RGB8<bIdx, dcn, false>(u, v, vy0, vy1, 0, 0, row, (uchar*)(0));
-            }
-        }
-    }
-};
-
-#define MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION (320*240)
+#include "color_yuv.simd.hpp"
+#include "color_yuv.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
 
-template<int bIdx, int uIdx, int yIdx, int dcn>
-inline void cvtYUV422toRGB(uchar * dst_data, size_t dst_step, const uchar * src_data, size_t src_step,
-                           int width, int height)
-{
-    YUV422toRGB8Invoker<bIdx, uIdx, yIdx, dcn> converter(dst_data, dst_step, src_data, src_step, width);
-    if (width * height >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
-        parallel_for_(Range(0, height), converter);
-    else
-        converter(Range(0, height));
-}
+namespace cv {
 
 //
 // HAL functions
 //
-
-namespace hal
-{
+namespace hal {
 
 // 8u, 16u, 32f
 void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
@@ -1790,13 +62,8 @@ void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
 #endif
 #endif
 
-    int blueIdx = swapBlue ? 2 : 0;
-    if( depth == CV_8U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i<uchar>(scn, blueIdx, isCbCr));
-    else if( depth == CV_16U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i<ushort>(scn, blueIdx, isCbCr));
-    else
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_f<float>(scn, blueIdx, isCbCr));
+    CV_CPU_DISPATCH(cvtBGRtoYUV, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
@@ -1844,13 +111,8 @@ void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
 #endif
 #endif
 
-    int blueIdx = swapBlue ? 2 : 0;
-    if( depth == CV_8U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i<uchar>(dcn, blueIdx, isCbCr));
-    else if( depth == CV_16U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i<ushort>(dcn, blueIdx, isCbCr));
-    else
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_f<float>(dcn, blueIdx, isCbCr));
+    CV_CPU_DISPATCH(cvtYUVtoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isCbCr),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
@@ -1861,17 +123,10 @@ void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
     CV_INSTRUMENT_REGION();
 
     CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
-    const uchar* uv = src_data + src_step * static_cast<size_t>(dst_height);
-    cvtTwoPlaneYUVtoBGR(src_data, uv, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
-}
 
-typedef void (*cvt_2plane_yuv_ptr_t)(uchar * /* dst_data*/,
-                       size_t /* dst_step */,
-                       int /* dst_width */,
-                       int /* dst_height */,
-                       size_t /* _stride */,
-                       const uchar* /* _y1 */,
-                       const uchar* /* _uv */);
+    CV_CPU_DISPATCH(cvtTwoPlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx),
+        CV_CPU_DISPATCH_MODES_ALL);
+}
 
 void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
@@ -1880,66 +135,21 @@ void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src
 {
     CV_INSTRUMENT_REGION();
 
-    // TODO: add hal replacement method
-
-    int blueIdx = swapBlue ? 2 : 0;
-
-    cvt_2plane_yuv_ptr_t cvtPtr;
-    switch(dcn*100 + blueIdx * 10 + uIdx)
-    {
-    case 300: cvtPtr = cvtYUV420sp2RGB<0, 0, 3>; break;
-    case 301: cvtPtr = cvtYUV420sp2RGB<0, 1, 3>; break;
-    case 320: cvtPtr = cvtYUV420sp2RGB<2, 0, 3>; break;
-    case 321: cvtPtr = cvtYUV420sp2RGB<2, 1, 3>; break;
-    case 400: cvtPtr = cvtYUV420sp2RGB<0, 0, 4>; break;
-    case 401: cvtPtr = cvtYUV420sp2RGB<0, 1, 4>; break;
-    case 420: cvtPtr = cvtYUV420sp2RGB<2, 0, 4>; break;
-    case 421: cvtPtr = cvtYUV420sp2RGB<2, 1, 4>; break;
-    default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
-    };
-
-    cvtPtr(dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data);
+    CV_CPU_DISPATCH(cvtTwoPlaneYUVtoBGR, (y_data, uv_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
-typedef void (*cvt_3plane_yuv_ptr_t)(uchar * /* dst_data */,
-                                     size_t /* dst_step */,
-                                     int /* dst_width */,
-                                     int /* dst_height */,
-                                     size_t /* _stride */,
-                                     const uchar* /* _y1 */,
-                                     const uchar* /* _u */,
-                                     const uchar* /* _v */,
-                                     int /* ustepIdx */,
-                                     int /* vstepIdx */);
-
 void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
-                                  uchar * dst_data, size_t dst_step,
-                                  int dst_width, int dst_height,
-                                  int dcn, bool swapBlue, int uIdx)
+                           uchar * dst_data, size_t dst_step,
+                           int dst_width, int dst_height,
+                           int dcn, bool swapBlue, int uIdx)
 {
     CV_INSTRUMENT_REGION();
 
     CALL_HAL(cvtThreePlaneYUVtoBGR, cv_hal_cvtThreePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
-    const uchar* u = src_data + src_step * static_cast<size_t>(dst_height);
-    const uchar* v = src_data + src_step * static_cast<size_t>(dst_height + dst_height/4) + (dst_width/2) * ((dst_height % 4)/2);
-
-    int ustepIdx = 0;
-    int vstepIdx = dst_height % 4 == 2 ? 1 : 0;
 
-    if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); }
-    int blueIdx = swapBlue ? 2 : 0;
-
-    cvt_3plane_yuv_ptr_t cvtPtr;
-    switch(dcn*10 + blueIdx)
-    {
-    case 30: cvtPtr = cvtYUV420p2RGB<0, 3>; break;
-    case 32: cvtPtr = cvtYUV420p2RGB<2, 3>; break;
-    case 40: cvtPtr = cvtYUV420p2RGB<0, 4>; break;
-    case 42: cvtPtr = cvtYUV420p2RGB<2, 4>; break;
-    default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
-    };
-
-    cvtPtr(dst_data, dst_step, dst_width, dst_height, src_step, src_data, u, v, ustepIdx, vstepIdx);
+    CV_CPU_DISPATCH(cvtThreePlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
@@ -1950,15 +160,9 @@ void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
     CV_INSTRUMENT_REGION();
 
     CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx);
-    uchar * uv_data = dst_data + dst_step * height;
 
-    RGB8toYUV420pInvoker cvt(src_data, src_step, dst_data, uv_data, dst_step, width, height,
-                             scn, swapBlue, uIdx == 2, false);
-
-    if( width * height >= 320*240 )
-        parallel_for_(Range(0, height/2), cvt);
-    else
-        cvt(Range(0, height/2));
+    CV_CPU_DISPATCH(cvtBGRtoThreePlaneYUV, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
@@ -1970,22 +174,10 @@ void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
 
     // TODO: add hal replacement method
 
-    RGB8toYUV420pInvoker cvt(src_data, src_step, y_data, uv_data, dst_step, width, height,
-                             scn, swapBlue, uIdx == 2, true);
-
-    if( width * height >= 320*240 )
-        parallel_for_(Range(0, height/2), cvt);
-    else
-        cvt(Range(0, height/2));
+    CV_CPU_DISPATCH(cvtBGRtoTwoPlaneYUV, (src_data, src_step, y_data, uv_data, dst_step, width, height, scn, swapBlue, uIdx),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
-typedef void (*cvt_1plane_yuv_ptr_t)(uchar * /* dst_data */,
-                                     size_t /* dst_step */,
-                                     const uchar * /* src_data */,
-                                     size_t /* src_step */,
-                                     int /* width */,
-                                     int /* height */);
-
 void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
                          int width, int height,
@@ -1995,26 +187,8 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
 
     CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn);
 
-    cvt_1plane_yuv_ptr_t cvtPtr;
-    int blueIdx = swapBlue ? 2 : 0;
-    switch(dcn*1000 + blueIdx*100 + uIdx*10 + ycn)
-    {
-    case 3000: cvtPtr = cvtYUV422toRGB<0,0,0,3>; break;
-    case 3001: cvtPtr = cvtYUV422toRGB<0,0,1,3>; break;
-    case 3010: cvtPtr = cvtYUV422toRGB<0,1,0,3>; break;
-    case 3200: cvtPtr = cvtYUV422toRGB<2,0,0,3>; break;
-    case 3201: cvtPtr = cvtYUV422toRGB<2,0,1,3>; break;
-    case 3210: cvtPtr = cvtYUV422toRGB<2,1,0,3>; break;
-    case 4000: cvtPtr = cvtYUV422toRGB<0,0,0,4>; break;
-    case 4001: cvtPtr = cvtYUV422toRGB<0,0,1,4>; break;
-    case 4010: cvtPtr = cvtYUV422toRGB<0,1,0,4>; break;
-    case 4200: cvtPtr = cvtYUV422toRGB<2,0,0,4>; break;
-    case 4201: cvtPtr = cvtYUV422toRGB<2,0,1,4>; break;
-    case 4210: cvtPtr = cvtYUV422toRGB<2,1,0,4>; break;
-    default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
-    };
-
-    cvtPtr(dst_data, dst_step, src_data, src_step, width, height);
+    CV_CPU_DISPATCH(cvtOnePlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 } // namespace hal
diff --git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp
index 7d731378e2..8bbd78b244 100644
--- a/modules/imgproc/src/color_yuv.simd.hpp
+++ b/modules/imgproc/src/color_yuv.simd.hpp
@@ -3,11 +3,54 @@
 // of this distribution and at http://opencv.org/license.html
 
 #include "precomp.hpp"
-#include "color.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
-namespace cv
-{
+namespace cv {
+namespace hal {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+// forward declarations
+void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
+                 uchar * dst_data, size_t dst_step,
+                 int width, int height,
+                 int depth, int scn, bool swapBlue, bool isCbCr);
+void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
+                 uchar * dst_data, size_t dst_step,
+                 int width, int height,
+                 int depth, int dcn, bool swapBlue, bool isCbCr);
+void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+                         uchar * dst_data, size_t dst_step,
+                         int dst_width, int dst_height,
+                         int dcn, bool swapBlue, int uIdx);
+void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step,
+                         uchar * dst_data, size_t dst_step,
+                         int dst_width, int dst_height,
+                         int dcn, bool swapBlue, int uIdx);
+void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+                           uchar * dst_data, size_t dst_step,
+                           int dst_width, int dst_height,
+                           int dcn, bool swapBlue, int uIdx);
+void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
+                           uchar * dst_data, size_t dst_step,
+                           int width, int height,
+                           int scn, bool swapBlue, int uIdx);
+void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
+                         uchar * y_data, uchar * uv_data, size_t dst_step,
+                         int width, int height,
+                         int scn, bool swapBlue, int uIdx);
+void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+                         uchar * dst_data, size_t dst_step,
+                         int width, int height,
+                         int dcn, bool swapBlue, int uIdx, int ycn);
+
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+#if defined(CV_CPU_BASELINE_MODE)
+// color.simd_helpers.hpp is already included via color.hpp
+#else
+#include "color.simd_helpers.hpp"
+#endif
 
+namespace {
 //constants for conversion from/to RGB and YUV, YCrCb according to BT.601
 
 //to YCbCr
@@ -1738,12 +1781,8 @@ inline void cvtYUV422toRGB(uchar * dst_data, size_t dst_step, const uchar * src_
         converter(Range(0, height));
 }
 
-//
-// HAL functions
-//
+} // namespace anon
 
-namespace hal
-{
 
 // 8u, 16u, 32f
 void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
@@ -1753,43 +1792,6 @@ void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtBGRtoYUV, cv_hal_cvtBGRtoYUV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr);
-
-#if defined(HAVE_IPP)
-#if !IPP_DISABLE_RGB_YUV
-    CV_IPP_CHECK()
-    {
-        if (scn == 3 && depth == CV_8U && swapBlue && !isCbCr)
-        {
-            if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPGeneralFunctor((ippiGeneralFunc)ippiRGBToYUV_8u_C3R)))
-                return;
-        }
-        else if (scn == 3 && depth == CV_8U && !swapBlue && !isCbCr)
-        {
-            if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
-                                                         (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
-                return;
-        }
-        else if (scn == 4 && depth == CV_8U && swapBlue && !isCbCr)
-        {
-            if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
-                                                         (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 0, 1, 2, depth)))
-                return;
-        }
-        else if (scn == 4 && depth == CV_8U && !swapBlue && !isCbCr)
-        {
-            if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
-                                                         (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
-                return;
-        }
-    }
-#endif
-#endif
-
     int blueIdx = swapBlue ? 2 : 0;
     if( depth == CV_8U )
         CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i<uchar>(scn, blueIdx, isCbCr));
@@ -1806,44 +1808,6 @@ void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtYUVtoBGR, cv_hal_cvtYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isCbCr);
-
-
-#if defined(HAVE_IPP)
-#if !IPP_DISABLE_YUV_RGB
-    CV_IPP_CHECK()
-    {
-        if (dcn == 3 && depth == CV_8U && swapBlue && !isCbCr)
-        {
-            if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPGeneralFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R)))
-                return;
-        }
-        else if (dcn == 3 && depth == CV_8U && !swapBlue && !isCbCr)
-        {
-            if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
-                                                                   ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)))
-                return;
-        }
-        else if (dcn == 4 && depth == CV_8U && swapBlue && !isCbCr)
-        {
-            if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
-                                                                   ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)))
-                return;
-        }
-        else if (dcn == 4 && depth == CV_8U && !swapBlue && !isCbCr)
-        {
-            if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
-                                IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
-                                                                   ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)))
-                return;
-        }
-    }
-#endif
-#endif
-
     int blueIdx = swapBlue ? 2 : 0;
     if( depth == CV_8U )
         CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i<uchar>(dcn, blueIdx, isCbCr));
@@ -1860,7 +1824,6 @@ void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
     const uchar* uv = src_data + src_step * static_cast<size_t>(dst_height);
     cvtTwoPlaneYUVtoBGR(src_data, uv, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
 }
@@ -1880,8 +1843,6 @@ void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src
 {
     CV_INSTRUMENT_REGION();
 
-    // TODO: add hal replacement method
-
     int blueIdx = swapBlue ? 2 : 0;
 
     cvt_2plane_yuv_ptr_t cvtPtr;
@@ -1919,7 +1880,6 @@ void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtThreePlaneYUVtoBGR, cv_hal_cvtThreePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
     const uchar* u = src_data + src_step * static_cast<size_t>(dst_height);
     const uchar* v = src_data + src_step * static_cast<size_t>(dst_height + dst_height/4) + (dst_width/2) * ((dst_height % 4)/2);
 
@@ -1949,7 +1909,6 @@ void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx);
     uchar * uv_data = dst_data + dst_step * height;
 
     RGB8toYUV420pInvoker cvt(src_data, src_step, dst_data, uv_data, dst_step, width, height,
@@ -1968,8 +1927,6 @@ void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    // TODO: add hal replacement method
-
     RGB8toYUV420pInvoker cvt(src_data, src_step, y_data, uv_data, dst_step, width, height,
                              scn, swapBlue, uIdx == 2, true);
 
@@ -1993,8 +1950,6 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn);
-
     cvt_1plane_yuv_ptr_t cvtPtr;
     int blueIdx = swapBlue ? 2 : 0;
     switch(dcn*1000 + blueIdx*100 + uIdx*10 + ycn)
@@ -2017,227 +1972,6 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
     cvtPtr(dst_data, dst_step, src_data, src_step, width, height);
 }
 
-} // namespace hal
-
-//
-// OCL calls
-//
-
-#ifdef HAVE_OPENCL
-
-bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx )
-{
-    OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
-
-    if(!h.createKernel("YUV2RGB", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d", dcn, bidx)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx )
-{
-    OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 3);
-
-    if(!h.createKernel("RGB2YUV", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=3 -D bidx=%d", bidx)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx)
-{
-    OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
-
-    if(!h.createKernel("YCrCb2RGB", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d", dcn, bidx)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorBGR2YCrCb( InputArray _src, OutputArray _dst, int bidx)
-{
-    OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 3);
-
-    if(!h.createKernel("RGB2YCrCb", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=3 -D bidx=%d", bidx)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx )
-{
-    OclHelper< Set<2>, Set<3, 4>, Set<CV_8U> > h(_src, _dst, dcn);
-
-    bool optimized = _src.offset() % 4 == 0 && _src.step() % 4 == 0;
-    if(!h.createKernel("YUV2RGB_422", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d%s", dcn, bidx, uidx, yidx,
-                       optimized ? " -D USE_OPTIMIZED_LOAD" : "")))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst )
-{
-    OclHelper< Set<1>, Set<1>, Set<CV_8U>, FROM_YUV> h(_src, _dst, 1);
-
-    h.src.rowRange(0, _dst.rows()).copyTo(_dst);
-    return true;
-}
-
-bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx )
-{
-    OclHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV > h(_src, _dst, dcn);
-
-    if(!h.createKernel("YUV2RGB_NVx", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx )
-{
-    OclHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV > h(_src, _dst, dcn);
-
-    if(!h.createKernel("YUV2RGB_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx,
-                       _src.isContinuous() ? " -D SRC_CONT" : "")))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
-bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx )
-{
-    OclHelper< Set<3, 4>, Set<1>, Set<CV_8U>, TO_YUV > h(_src, _dst, 1);
-
-    if(!h.createKernel("RGB2YUV_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=1 -D bidx=%d -D uidx=%d", bidx, uidx)))
-    {
-        return false;
-    }
-
-    return h.run();
-}
-
 #endif
-
-//
-// HAL calls
-//
-
-void cvtColorBGR2YUV(InputArray _src, OutputArray _dst, bool swapb, bool crcb)
-{
-    CvtHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 3);
-
-    hal::cvtBGRtoYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                     h.depth, h.scn, swapb, crcb);
-}
-
-void cvtColorYUV2BGR(InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb)
-{
-    if(dcn <= 0) dcn = 3;
-    CvtHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
-
-    hal::cvtYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                     h.depth, dcn, swapb, crcb);
-}
-
-void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn)
-{
-    CvtHelper< Set<2>, Set<3, 4>, Set<CV_8U> > h(_src, _dst, dcn);
-
-    hal::cvtOnePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                             dcn, swapb, uidx, ycn);
-}
-
-void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi )
-{
-    CV_Assert( _src.channels() == 2 && _src.depth() == CV_8U );
-
-    extractChannel(_src, _dst, coi);
-}
-
-void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx)
-{
-    CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U>, TO_YUV > h(_src, _dst, 1);
-
-    hal::cvtBGRtoThreePlaneYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                               h.scn, swapb, uidx);
-}
-
-void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst )
-{
-    CvtHelper< Set<1>, Set<1>, Set<CV_8U>, FROM_YUV > h(_src, _dst, 1);
-
-#ifdef HAVE_IPP
-#if IPP_VERSION_X100 >= 201700
-    if (CV_INSTRUMENT_FUN_IPP(ippiCopy_8u_C1R_L, h.src.data, (IppSizeL)h.src.step, h.dst.data, (IppSizeL)h.dst.step,
-                              ippiSizeL(h.dstSz.width, h.dstSz.height)) >= 0)
-        return;
-#endif
-#endif
-    h.src(Range(0, h.dstSz.height), Range::all()).copyTo(h.dst);
-}
-
-void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx)
-{
-    if(dcn <= 0) dcn = 3;
-    CvtHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV> h(_src, _dst, dcn);
-
-    hal::cvtThreePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows,
-                               dcn, swapb, uidx);
-}
-
-// http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples
-// http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples
-
-void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx )
-{
-    if(dcn <= 0) dcn = 3;
-    CvtHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV> h(_src, _dst, dcn);
-
-    hal::cvtTwoPlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows,
-                             dcn, swapb, uidx);
-}
-
-void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx )
-{
-    int stype = _ysrc.type();
-    int depth = CV_MAT_DEPTH(stype);
-    Size ysz = _ysrc.size(), uvs = _uvsrc.size();
-    CV_Assert( dcn == 3 || dcn == 4 );
-    CV_Assert( depth == CV_8U );
-    CV_Assert( ysz.width == uvs.width * 2 && ysz.height == uvs.height * 2 );
-
-    Mat ysrc = _ysrc.getMat(), uvsrc = _uvsrc.getMat();
-
-    _dst.create( ysz, CV_MAKETYPE(depth, dcn));
-    Mat dst = _dst.getMat();
-
-    hal::cvtTwoPlaneYUVtoBGR(ysrc.data, uvsrc.data, ysrc.step,
-                             dst.data, dst.step, dst.cols, dst.rows,
-                             dcn, swapb, uidx);
-}
-
-} // namespace cv
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}} // namespace cv::hal