diff --git a/cmake/OpenCVFindLibsGUI.cmake b/cmake/OpenCVFindLibsGUI.cmake
index 5bb6d57f52..70752c3fba 100644
--- a/cmake/OpenCVFindLibsGUI.cmake
+++ b/cmake/OpenCVFindLibsGUI.cmake
@@ -82,7 +82,7 @@ endif(WITH_OPENGL)
 if(APPLE)
   if(WITH_CARBON)
     set(HAVE_CARBON YES)
-  elseif(NOT IOS)
+  elseif(NOT IOS AND CMAKE_COMPILER_IS_CLANGCXX)
     set(HAVE_COCOA YES)
   endif()
 endif()
diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake
index 93cce2b7ad..5520d05521 100644
--- a/cmake/OpenCVFindLibsVideo.cmake
+++ b/cmake/OpenCVFindLibsVideo.cmake
@@ -273,7 +273,7 @@ endif()
 if (NOT IOS)
   if(WITH_QUICKTIME)
     set(HAVE_QUICKTIME YES)
-  elseif(APPLE)
+  elseif(APPLE AND CMAKE_COMPILER_IS_CLANGCXX)
     set(HAVE_QTKIT YES)
   endif()
 endif()
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index aa9469c04c..b74678ca67 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -448,11 +448,13 @@ template<typename T> struct OpNot
     T operator()( T a, T ) const { return ~a; }
 };
 
+#if (ARITHM_USE_IPP == 1)
 static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
 {
     if( sz.height == 1 )
         step1 = step2 = step = sz.width*elemSize;
 }
+#endif
 
 static void add8u( const uchar* src1, size_t step1,
                    const uchar* src2, size_t step2,
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 65f78de085..bb33e952dd 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -46,7 +46,6 @@
 namespace cv
 {
 
-static const int MAX_BLOCK_SIZE = 1024;
 typedef void (*MathFunc)(const void* src, void* dst, int len);
 
 static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI);
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index ecc0f76cb8..6d590f9075 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -680,7 +680,8 @@ static bool ocl_countNonZero( InputArray _src, int & res )
 
 int cv::countNonZero( InputArray _src )
 {
-    CV_Assert( _src.channels() == 1 );
+    int type = _src.type(), cn = CV_MAT_CN(type);
+    CV_Assert( cn == 1 );
 
 #ifdef HAVE_OPENCL
     int res = -1;
@@ -690,8 +691,33 @@ int cv::countNonZero( InputArray _src )
 #endif
 
     Mat src = _src.getMat();
-    CountNonZeroFunc func = getCountNonZeroTab(src.depth());
 
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+    if (src.dims <= 2 || src.isContinuous())
+    {
+        IppiSize roiSize = { src.cols, src.rows };
+        Ipp32s count, srcstep = (Ipp32s)src.step;
+        IppStatus status = (IppStatus)-1;
+
+        if (src.isContinuous())
+        {
+            roiSize.width = (Ipp32s)src.total();
+            roiSize.height = 1;
+            srcstep = (Ipp32s)src.total() * CV_ELEM_SIZE(type);
+        }
+
+        int depth = CV_MAT_DEPTH(type);
+        if (depth == CV_8U)
+            status = ippiCountInRange_8u_C1R((const Ipp8u *)src.data, srcstep, roiSize, &count, 0, 0);
+        else if (depth == CV_32F)
+            status = ippiCountInRange_32f_C1R((const Ipp32f *)src.data, srcstep, roiSize, &count, 0, 0);
+
+        if (status >= 0)
+            return (Ipp32s)src.total() - count;
+    }
+#endif
+
+    CountNonZeroFunc func = getCountNonZeroTab(src.depth());
     CV_Assert( func != 0 );
 
     const Mat* arrays[] = {&src, 0};
diff --git a/modules/core/test/test_countnonzero.cpp b/modules/core/test/test_countnonzero.cpp
index 176d324c9e..0b8210322d 100644
--- a/modules/core/test/test_countnonzero.cpp
+++ b/modules/core/test/test_countnonzero.cpp
@@ -52,9 +52,6 @@ using namespace std;
 
 #define sign(a) a > 0 ? 1 : a == 0 ? 0 : -1
 
-const int FLOAT_TYPE [2] = {CV_32F, CV_64F};
-const int INT_TYPE [5] = {CV_8U, CV_8S, CV_16U, CV_16S, CV_32S};
-
 #define MAX_WIDTH 100
 #define MAX_HEIGHT 100
 
diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index 190e8ac665..4129705033 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -405,8 +405,16 @@ public:
 protected:
     virtual void computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
     void buildPattern();
-    uchar meanIntensity( InputArray image, InputArray integral, const float kp_x, const float kp_y,
-                         const unsigned int scale, const unsigned int rot, const unsigned int point ) const;
+
+    template <typename imgType, typename iiType>
+    imgType meanIntensity( InputArray image, InputArray integral, const float kp_x, const float kp_y,
+                           const unsigned int scale, const unsigned int rot, const unsigned int point ) const;
+
+    template <typename srcMatType, typename iiMatType>
+    void computeDescriptors( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
+
+    template <typename srcMatType>
+    void extractDescriptor(srcMatType *pointsValue, void ** ptr) const;
 
     bool orientationNormalized; //true if the orientation is normalized, false otherwise
     bool scaleNormalized; //true if the scale is normalized, false otherwise
diff --git a/modules/features2d/src/freak.cpp b/modules/features2d/src/freak.cpp
index 8759efa2e5..00c0e35ae8 100644
--- a/modules/features2d/src/freak.cpp
+++ b/modules/features2d/src/freak.cpp
@@ -239,13 +239,129 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
 
     ((FREAK*)this)->buildPattern();
 
+    // Gray conversion is currently disabled (see the commented-out code below); the input image is used as-is
+    Mat grayImage = image;
+//    if( image.channels() > 1 )
+//        cvtColor( image, grayImage, COLOR_BGR2GRAY );
+
+    // Use 32-bit integers if we won't overflow in the integral image
+    if ((image.depth() == CV_8U || image.depth() == CV_8S) &&
+        (image.rows * image.cols) < 8388608 ) // 8388608 = 2 ^ (32 - 8(bit depth) - 1(sign bit))
+    {
+        // Dispatch on the image depth, using a 32-bit integer integral image
+        if (image.depth() == CV_8U)
+            computeDescriptors<uchar, int>(grayImage, keypoints, _descriptors);
+        else if (image.depth() == CV_8S)
+            computeDescriptors<char, int>(grayImage, keypoints, _descriptors);
+        else
+            CV_Error( Error::StsUnsupportedFormat, "" );
+    } else {
+        // Dispatch on the image depth, using a double-precision integral image
+        if ( image.depth() == CV_8U )
+            computeDescriptors<uchar, double>(grayImage, keypoints, _descriptors);
+        else if ( image.depth() == CV_8S )
+            computeDescriptors<char, double>(grayImage, keypoints, _descriptors);
+        else if ( image.depth() == CV_16U )
+            computeDescriptors<ushort, double>(grayImage, keypoints, _descriptors);
+        else if ( image.depth() == CV_16S )
+            computeDescriptors<short, double>(grayImage, keypoints, _descriptors);
+        else
+            CV_Error( Error::StsUnsupportedFormat, "" );
+    }
+}
+
+template <typename srcMatType> // generic scalar path: packs one keypoint's pair comparisons into a bitset
+void FREAK::extractDescriptor(srcMatType *pointsValue, void ** ptr) const
+{
+    std::bitset<FREAK_NB_PAIRS>** ptrScalar = (std::bitset<FREAK_NB_PAIRS>**) ptr; // *ptr is treated as the bitset to fill
+
+    // extract the descriptor, writing bits in the same order as the SSE version
+    int cnt = 0; // index of the next entry of descriptionPairs to compare
+    for( int n = 7; n < FREAK_NB_PAIRS; n += 128)
+    {
+        for( int m = 8; m--; )
+        {
+            int nm = n-m;
+            for(int kk = nm+15*8; kk >= nm; kk-=8, ++cnt) // 16 bits per pass, spaced 8 apart, highest index first
+            {
+                (*ptrScalar)->set(kk, pointsValue[descriptionPairs[cnt].i] >= pointsValue[descriptionPairs[cnt].j]);
+            }
+        }
+    }
+    --(*ptrScalar); // step the caller's pointer back by one bitset; the caller in this patch walks keypoints from last to first
+}
+
+#if CV_SSE2
+template <> // SSE2 specialization used when the sampled values are uchar
+void FREAK::extractDescriptor(uchar *pointsValue, void ** ptr) const
+{
+    __m128i** ptrSSE = (__m128i**) ptr; // *ptr is treated as the 128-bit destination to write
+
+    // note that the comparison order is modified in each block (but the first 128 comparisons remain globally the same --> does not affect the 128,384 bits segmented matching strategy)
+    int cnt = 0; // index of the next entry of descriptionPairs to compare
+    for( int n = FREAK_NB_PAIRS/128; n-- ; ) // one 128-bit block per iteration
+    {
+        __m128i result128 = _mm_setzero_si128();
+        for( int m = 128/16; m--; cnt += 16 ) // 16 pair comparisons per pass, 8 passes per block
+        {
+            __m128i operand1 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].i],
+                                            pointsValue[descriptionPairs[cnt+1].i],
+                                            pointsValue[descriptionPairs[cnt+2].i],
+                                            pointsValue[descriptionPairs[cnt+3].i],
+                                            pointsValue[descriptionPairs[cnt+4].i],
+                                            pointsValue[descriptionPairs[cnt+5].i],
+                                            pointsValue[descriptionPairs[cnt+6].i],
+                                            pointsValue[descriptionPairs[cnt+7].i],
+                                            pointsValue[descriptionPairs[cnt+8].i],
+                                            pointsValue[descriptionPairs[cnt+9].i],
+                                            pointsValue[descriptionPairs[cnt+10].i],
+                                            pointsValue[descriptionPairs[cnt+11].i],
+                                            pointsValue[descriptionPairs[cnt+12].i],
+                                            pointsValue[descriptionPairs[cnt+13].i],
+                                            pointsValue[descriptionPairs[cnt+14].i],
+                                            pointsValue[descriptionPairs[cnt+15].i]);
+
+            __m128i operand2 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].j],
+                                            pointsValue[descriptionPairs[cnt+1].j],
+                                            pointsValue[descriptionPairs[cnt+2].j],
+                                            pointsValue[descriptionPairs[cnt+3].j],
+                                            pointsValue[descriptionPairs[cnt+4].j],
+                                            pointsValue[descriptionPairs[cnt+5].j],
+                                            pointsValue[descriptionPairs[cnt+6].j],
+                                            pointsValue[descriptionPairs[cnt+7].j],
+                                            pointsValue[descriptionPairs[cnt+8].j],
+                                            pointsValue[descriptionPairs[cnt+9].j],
+                                            pointsValue[descriptionPairs[cnt+10].j],
+                                            pointsValue[descriptionPairs[cnt+11].j],
+                                            pointsValue[descriptionPairs[cnt+12].j],
+                                            pointsValue[descriptionPairs[cnt+13].j],
+                                            pointsValue[descriptionPairs[cnt+14].j],
+                                            pointsValue[descriptionPairs[cnt+15].j]);
+
+            __m128i workReg = _mm_min_epu8(operand1, operand2); // emulated "not less than" for 8-bit UNSIGNED integers: min(a, b) ...
+            workReg = _mm_cmpeq_epi8(workReg, operand2);        // ... equals b exactly when a >= b (byte becomes 0xFF, else 0x00)
+
+            workReg = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), workReg); // keep a single bit (selected by m) of every byte so the 8 passes fill distinct bits of the 128-bit result
+            result128 = _mm_or_si128(result128, workReg);
+        }
+        (**ptrSSE) = result128; // store the finished 128-bit block
+        ++(*ptrSSE);
+    }
+    (*ptrSSE) -= 8; // rewind the caller's pointer; presumably lands on the previous descriptor row when FREAK_NB_PAIRS/128 == 4 (TODO confirm)
+}
+#endif
+
+template <typename srcMatType, typename iiMatType>
+void FREAK::computeDescriptors( InputArray _image, std::vector<KeyPoint>& keypoints, OutputArray _descriptors ) const {
+
+    Mat image = _image.getMat();
     Mat imgIntegral;
-    integral(image, imgIntegral);
+    integral(image, imgIntegral, DataType<iiMatType>::type);
     std::vector<int> kpScaleIdx(keypoints.size()); // used to save pattern scale index corresponding to each keypoints
     const std::vector<int>::iterator ScaleIdxBegin = kpScaleIdx.begin(); // used in std::vector erase function
     const std::vector<cv::KeyPoint>::iterator kpBegin = keypoints.begin(); // used in std::vector erase function
     const float sizeCst = static_cast<float>(FREAK_NB_SCALES/(FREAK_LOG2* nOctaves));
-    uchar pointsValue[FREAK_NB_POINTS];
+    srcMatType pointsValue[FREAK_NB_POINTS];
     int thetaIdx = 0;
     int direction0;
     int direction1;
@@ -300,13 +416,10 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
         _descriptors.create((int)keypoints.size(), FREAK_NB_PAIRS/8, CV_8U);
         _descriptors.setTo(Scalar::all(0));
         Mat descriptors = _descriptors.getMat();
-#if CV_SSE2
-        __m128i* ptr= (__m128i*) (descriptors.data+(keypoints.size()-1)*descriptors.step[0]);
-#else
-        std::bitset<FREAK_NB_PAIRS>* ptr = (std::bitset<FREAK_NB_PAIRS>*) (descriptors.data+(keypoints.size()-1)*descriptors.step[0]);
-#endif
-        for( size_t k = keypoints.size(); k--; )
-        {
+
+        void *ptr = descriptors.data+(keypoints.size()-1)*descriptors.step[0];
+
+        for( size_t k = keypoints.size(); k--; ) {
             // estimate orientation (gradient)
             if( !orientationNormalized )
             {
@@ -316,9 +429,10 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
             else
             {
                 // get the points intensity value in the un-rotated pattern
-                for( int i = FREAK_NB_POINTS; i--; )
-                {
-                    pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,keypoints[k].pt.y, kpScaleIdx[k], 0, i);
+                for( int i = FREAK_NB_POINTS; i--; ) {
+                    pointsValue[i] = meanIntensity<srcMatType, iiMatType>(image, imgIntegral,
+                                                                          keypoints[k].pt.x, keypoints[k].pt.y,
+                                                                          kpScaleIdx[k], 0, i);
                 }
                 direction0 = 0;
                 direction1 = 0;
@@ -339,80 +453,14 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
                     thetaIdx -= FREAK_NB_ORIENTATION;
             }
             // extract descriptor at the computed orientation
-            for( int i = FREAK_NB_POINTS; i--; )
-            {
-                pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,keypoints[k].pt.y, kpScaleIdx[k], thetaIdx, i);
+            for( int i = FREAK_NB_POINTS; i--; ) {
+                pointsValue[i] = meanIntensity<srcMatType, iiMatType>(image, imgIntegral,
+                                                                      keypoints[k].pt.x, keypoints[k].pt.y,
+                                                                      kpScaleIdx[k], thetaIdx, i);
             }
-#if CV_SSE2
-            // note that comparisons order is modified in each block (but first 128 comparisons remain globally the same-->does not affect the 128,384 bits segmanted matching strategy)
-            int cnt = 0;
-            for( int n = FREAK_NB_PAIRS/128; n-- ; )
-            {
-                __m128i result128 = _mm_setzero_si128();
-                for( int m = 128/16; m--; cnt += 16 )
-                {
-                    __m128i operand1 = _mm_set_epi8(
-                        pointsValue[descriptionPairs[cnt+0].i],
-                        pointsValue[descriptionPairs[cnt+1].i],
-                        pointsValue[descriptionPairs[cnt+2].i],
-                        pointsValue[descriptionPairs[cnt+3].i],
-                        pointsValue[descriptionPairs[cnt+4].i],
-                        pointsValue[descriptionPairs[cnt+5].i],
-                        pointsValue[descriptionPairs[cnt+6].i],
-                        pointsValue[descriptionPairs[cnt+7].i],
-                        pointsValue[descriptionPairs[cnt+8].i],
-                        pointsValue[descriptionPairs[cnt+9].i],
-                        pointsValue[descriptionPairs[cnt+10].i],
-                        pointsValue[descriptionPairs[cnt+11].i],
-                        pointsValue[descriptionPairs[cnt+12].i],
-                        pointsValue[descriptionPairs[cnt+13].i],
-                        pointsValue[descriptionPairs[cnt+14].i],
-                        pointsValue[descriptionPairs[cnt+15].i]);
-
-                    __m128i operand2 = _mm_set_epi8(
-                        pointsValue[descriptionPairs[cnt+0].j],
-                        pointsValue[descriptionPairs[cnt+1].j],
-                        pointsValue[descriptionPairs[cnt+2].j],
-                        pointsValue[descriptionPairs[cnt+3].j],
-                        pointsValue[descriptionPairs[cnt+4].j],
-                        pointsValue[descriptionPairs[cnt+5].j],
-                        pointsValue[descriptionPairs[cnt+6].j],
-                        pointsValue[descriptionPairs[cnt+7].j],
-                        pointsValue[descriptionPairs[cnt+8].j],
-                        pointsValue[descriptionPairs[cnt+9].j],
-                        pointsValue[descriptionPairs[cnt+10].j],
-                        pointsValue[descriptionPairs[cnt+11].j],
-                        pointsValue[descriptionPairs[cnt+12].j],
-                        pointsValue[descriptionPairs[cnt+13].j],
-                        pointsValue[descriptionPairs[cnt+14].j],
-                        pointsValue[descriptionPairs[cnt+15].j]);
-
-                    __m128i workReg = _mm_min_epu8(operand1, operand2); // emulated "not less than" for 8-bit UNSIGNED integers
-                    workReg = _mm_cmpeq_epi8(workReg, operand2);        // emulated "not less than" for 8-bit UNSIGNED integers
-
-                    workReg = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), workReg); // merge the last 16 bits with the 128bits std::vector until full
-                    result128 = _mm_or_si128(result128, workReg);
-                }
-                (*ptr) = result128;
-                ++ptr;
-            }
-            ptr -= 8;
-#else
-            // extracting descriptor preserving the order of SSE version
-            int cnt = 0;
-            for( int n = 7; n < FREAK_NB_PAIRS; n += 128)
-            {
-                for( int m = 8; m--; )
-                {
-                    int nm = n-m;
-                    for(int kk = nm+15*8; kk >= nm; kk-=8, ++cnt)
-                    {
-                        ptr->set(kk, pointsValue[descriptionPairs[cnt].i] >= pointsValue[descriptionPairs[cnt].j]);
-                    }
-                }
-            }
-            --ptr;
-#endif
+
+            // Extract descriptor
+            extractDescriptor<srcMatType>(pointsValue, &ptr);
         }
     }
     else // extract all possible comparisons for selection
@@ -434,7 +482,9 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
             {
                 //get the points intensity value in the un-rotated pattern
                 for( int i = FREAK_NB_POINTS;i--; )
-                    pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,keypoints[k].pt.y, kpScaleIdx[k], 0, i);
+                    pointsValue[i] = meanIntensity<srcMatType, iiMatType>(image, imgIntegral,
+                                                                          keypoints[k].pt.x,keypoints[k].pt.y,
+                                                                          kpScaleIdx[k], 0, i);
 
                 direction0 = 0;
                 direction1 = 0;
@@ -456,10 +506,10 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
                     thetaIdx -= FREAK_NB_ORIENTATION;
             }
             // get the points intensity value in the rotated pattern
-            for( int i = FREAK_NB_POINTS; i--; )
-            {
-                pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,
-                                             keypoints[k].pt.y, kpScaleIdx[k], thetaIdx, i);
+            for( int i = FREAK_NB_POINTS; i--; ) {
+                pointsValue[i] = meanIntensity<srcMatType, iiMatType>(image, imgIntegral,
+                                                                      keypoints[k].pt.x, keypoints[k].pt.y,
+                                                                      kpScaleIdx[k], thetaIdx, i);
             }
 
             int cnt(0);
@@ -478,13 +528,13 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
 }
 
 // simply take average on a square patch, not even gaussian approx
-uchar FREAK::meanIntensity( InputArray _image, InputArray _integral,
-                            const float kp_x,
-                            const float kp_y,
-                            const unsigned int scale,
-                            const unsigned int rot,
-                            const unsigned int point) const
-{
+template <typename imgType, typename iiType>
+imgType FREAK::meanIntensity( InputArray _image, InputArray _integral,
+                              const float kp_x,
+                              const float kp_y,
+                              const unsigned int scale,
+                              const unsigned int rot,
+                              const unsigned int point) const {
     Mat image = _image.getMat(), integral = _integral.getMat();
     // get point position in image
     const PatternPoint& FreakPoint = patternLookup[scale*FREAK_NB_ORIENTATION*FREAK_NB_POINTS + rot*FREAK_NB_POINTS + point];
@@ -492,7 +542,6 @@ uchar FREAK::meanIntensity( InputArray _image, InputArray _integral,
     const float yf = FreakPoint.y+kp_y;
     const int x = int(xf);
     const int y = int(yf);
-    const int& imagecols = image.cols;
 
     // get the sigma:
     const float radius = FreakPoint.sigma;
@@ -505,19 +554,15 @@ uchar FREAK::meanIntensity( InputArray _image, InputArray _integral,
         const int r_y = static_cast<int>((yf-y)*1024);
         const int r_x_1 = (1024-r_x);
         const int r_y_1 = (1024-r_y);
-        uchar* ptr = image.data+x+y*imagecols;
         unsigned int ret_val;
         // linear interpolation:
-        ret_val = (r_x_1*r_y_1*int(*ptr));
-        ptr++;
-        ret_val += (r_x*r_y_1*int(*ptr));
-        ptr += imagecols;
-        ret_val += (r_x*r_y*int(*ptr));
-        ptr--;
-        ret_val += (r_x_1*r_y*int(*ptr));
+        ret_val = r_x_1*r_y_1*int(image.at<imgType>(y  , x  ))
+                + r_x  *r_y_1*int(image.at<imgType>(y  , x+1))
+                + r_x_1*r_y  *int(image.at<imgType>(y+1, x  ))
+                + r_x  *r_y  *int(image.at<imgType>(y+1, x+1));
         //return the rounded mean
         ret_val += 2 * 1024 * 1024;
-        return static_cast<uchar>(ret_val / (4 * 1024 * 1024));
+        return static_cast<imgType>(ret_val / (4 * 1024 * 1024));
     }
 
     // expected case:
@@ -527,15 +572,15 @@ uchar FREAK::meanIntensity( InputArray _image, InputArray _integral,
     const int y_top = int(yf-radius+0.5);
     const int x_right = int(xf+radius+1.5);//integral image is 1px wider
     const int y_bottom = int(yf+radius+1.5);//integral image is 1px higher
-    int ret_val;
+    iiType ret_val;
 
-    ret_val = integral.at<int>(y_bottom,x_right);//bottom right corner
-    ret_val -= integral.at<int>(y_bottom,x_left);
-    ret_val += integral.at<int>(y_top,x_left);
-    ret_val -= integral.at<int>(y_top,x_right);
+    ret_val = integral.at<iiType>(y_bottom,x_right);//bottom right corner
+    ret_val -= integral.at<iiType>(y_bottom,x_left);
+    ret_val += integral.at<iiType>(y_top,x_left);
+    ret_val -= integral.at<iiType>(y_top,x_right);
     ret_val = ret_val/( (x_right-x_left)* (y_bottom-y_top) );
     //~ std::cout<<integral.step[1]<<std::endl;
-    return static_cast<uchar>(ret_val);
+    return static_cast<imgType>(ret_val);
 }
 
 // pair selection algorithm from a set of training images and corresponding keypoints
diff --git a/modules/features2d/src/stardetector.cpp b/modules/features2d/src/stardetector.cpp
index 1e00ee6047..0b09b8678b 100644
--- a/modules/features2d/src/stardetector.cpp
+++ b/modules/features2d/src/stardetector.cpp
@@ -44,20 +44,24 @@
 namespace cv
 {
 
-static void
-computeIntegralImages( const Mat& matI, Mat& matS, Mat& matT, Mat& _FT )
+template <typename inMatType, typename outMatType> static void
+computeIntegralImages( const Mat& matI, Mat& matS, Mat& matT, Mat& _FT,
+                       int iiType )
 {
-    CV_Assert( matI.type() == CV_8U );
-
     int x, y, rows = matI.rows, cols = matI.cols;
 
-    matS.create(rows + 1, cols + 1, CV_32S);
-    matT.create(rows + 1, cols + 1, CV_32S);
-    _FT.create(rows + 1, cols + 1, CV_32S);
+    matS.create(rows + 1, cols + 1, iiType );
+    matT.create(rows + 1, cols + 1, iiType );
+    _FT.create(rows + 1, cols + 1, iiType );
+
+    const inMatType* I = matI.ptr<inMatType>();
+
+    outMatType *S = matS.ptr<outMatType>();
+    outMatType *T = matT.ptr<outMatType>();
+    outMatType *FT = _FT.ptr<outMatType>();
 
-    const uchar* I = matI.ptr<uchar>();
-    int *S = matS.ptr<int>(), *T = matT.ptr<int>(), *FT = _FT.ptr<int>();
-    int istep = (int)matI.step, step = (int)(matS.step/sizeof(S[0]));
+    int istep = (int)(matI.step/matI.elemSize());
+    int step = (int)(matS.step/matS.elemSize());
 
     for( x = 0; x <= cols; x++ )
         S[x] = T[x] = FT[x] = 0;
@@ -95,14 +99,9 @@ computeIntegralImages( const Mat& matI, Mat& matS, Mat& matT, Mat& _FT )
     }
 }
 
-struct StarFeature
-{
-    int area;
-    int* p[8];
-};
-
-static int
-StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int maxSize )
+template <typename iiMatType> static int
+StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes,
+                              int maxSize, int iiType )
 {
     const int MAX_PATTERN = 17;
     static const int sizes0[] = {1, 2, 3, 4, 6, 8, 11, 12, 16, 22, 23, 32, 45, 46, 64, 90, 128, -1};
@@ -116,16 +115,21 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma
     __m128 sizes1_4[MAX_PATTERN];
     union { int i; float f; } absmask;
     absmask.i = 0x7fffffff;
-    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);
+    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2) && iiType == CV_32S;
 #endif
+
+    struct StarFeature
+    {
+        int area;
+        iiMatType* p[8];
+    };
+
     StarFeature f[MAX_PATTERN];
 
     Mat sum, tilted, flatTilted;
     int y, rows = img.rows, cols = img.cols;
     int border, npatterns=0, maxIdx=0;
 
-    CV_Assert( img.type() == CV_8UC1 );
-
     responses.create( img.size(), CV_32F );
     sizes.create( img.size(), CV_16S );
 
@@ -139,7 +143,18 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma
     npatterns += (pairs[npatterns-1][0] >= 0);
     maxIdx = pairs[npatterns-1][0];
 
-    computeIntegralImages( img, sum, tilted, flatTilted );
+    // Create the integral image appropriate for our type & usage
+    if ( img.type() == CV_8U )
+        computeIntegralImages<uchar, iiMatType>( img, sum, tilted, flatTilted, iiType );
+    else if ( img.type() == CV_8S )
+        computeIntegralImages<char, iiMatType>( img, sum, tilted, flatTilted, iiType );
+    else if ( img.type() == CV_16U )
+        computeIntegralImages<ushort, iiMatType>( img, sum, tilted, flatTilted, iiType );
+    else if ( img.type() == CV_16S )
+        computeIntegralImages<short, iiMatType>( img, sum, tilted, flatTilted, iiType );
+    else
+        CV_Error( Error::StsUnsupportedFormat, "" );
+
     int step = (int)(sum.step/sum.elemSize());
 
     for(int i = 0; i <= maxIdx; i++ )
@@ -148,15 +163,15 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma
         int ur_area = (2*ur_size + 1)*(2*ur_size + 1);
         int t_area = t_size*t_size + (t_size + 1)*(t_size + 1);
 
-        f[i].p[0] = sum.ptr<int>() + (ur_size + 1)*step + ur_size + 1;
-        f[i].p[1] = sum.ptr<int>() - ur_size*step + ur_size + 1;
-        f[i].p[2] = sum.ptr<int>() + (ur_size + 1)*step - ur_size;
-        f[i].p[3] = sum.ptr<int>() - ur_size*step - ur_size;
+        f[i].p[0] = sum.ptr<iiMatType>() + (ur_size + 1)*step + ur_size + 1;
+        f[i].p[1] = sum.ptr<iiMatType>() - ur_size*step + ur_size + 1;
+        f[i].p[2] = sum.ptr<iiMatType>() + (ur_size + 1)*step - ur_size;
+        f[i].p[3] = sum.ptr<iiMatType>() - ur_size*step - ur_size;
 
-        f[i].p[4] = tilted.ptr<int>() + (t_size + 1)*step + 1;
-        f[i].p[5] = flatTilted.ptr<int>() - t_size;
-        f[i].p[6] = flatTilted.ptr<int>() + t_size + 1;
-        f[i].p[7] = tilted.ptr<int>() - t_size*step + 1;
+        f[i].p[4] = tilted.ptr<iiMatType>() + (t_size + 1)*step + 1;
+        f[i].p[5] = flatTilted.ptr<iiMatType>() - t_size;
+        f[i].p[6] = flatTilted.ptr<iiMatType>() + t_size + 1;
+        f[i].p[7] = tilted.ptr<iiMatType>() - t_size*step + 1;
 
         f[i].area = ur_area + t_area;
         sizes1[i] = sizes0[i];
@@ -227,7 +242,7 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma
 
                 for(int i = 0; i <= maxIdx; i++ )
                 {
-                    const int** p = (const int**)&f[i].p[0];
+                    const iiMatType** p = (const iiMatType**)&f[i].p[0];
                     __m128i r0 = _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(p[0]+ofs)),
                                                _mm_loadu_si128((const __m128i*)(p[1]+ofs)));
                     __m128i r1 = _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(p[3]+ofs)),
@@ -269,9 +284,9 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma
 
             for(int i = 0; i <= maxIdx; i++ )
             {
-                const int** p = (const int**)&f[i].p[0];
-                vals[i] = p[0][ofs] - p[1][ofs] - p[2][ofs] + p[3][ofs] +
-                    p[4][ofs] - p[5][ofs] - p[6][ofs] + p[7][ofs];
+                const iiMatType** p = (const iiMatType**)&f[i].p[0];
+                vals[i] = (int)(p[0][ofs] - p[1][ofs] - p[2][ofs] + p[3][ofs] +
+                    p[4][ofs] - p[5][ofs] - p[6][ofs] + p[7][ofs]);
             }
             for(int i = 0; i < npatterns; i++ )
             {
@@ -429,7 +444,7 @@ StarDetector::StarDetector(int _maxSize, int _responseThreshold,
 void StarDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask ) const
 {
     Mat image = _image.getMat(), mask = _mask.getMat(), grayImage = image;
-    if( image.type() != CV_8U ) cvtColor( image, grayImage, COLOR_BGR2GRAY );
+    if( image.channels() > 1 ) cvtColor( image, grayImage, COLOR_BGR2GRAY );
 
     (*this)(grayImage, keypoints);
     KeyPointsFilter::runByPixelsMask( keypoints, mask );
@@ -438,7 +453,15 @@ void StarDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoin
 void StarDetector::operator()(const Mat& img, std::vector<KeyPoint>& keypoints) const
 {
     Mat responses, sizes;
-    int border = StarDetectorComputeResponses( img, responses, sizes, maxSize );
+    int border;
+
+    // Use 32-bit integers if we won't overflow in the integral image
+    if ((img.depth() == CV_8U || img.depth() == CV_8S) &&
+        (img.rows * img.cols) < 8388608 ) // 8388608 = 2 ^ (32 - 8(bit depth) - 1(sign bit))
+        border = StarDetectorComputeResponses<int>( img, responses, sizes, maxSize, CV_32S );
+    else
+        border = StarDetectorComputeResponses<double>( img, responses, sizes, maxSize, CV_64F );
+
     keypoints.clear();
     if( border >= 0 )
         StarDetectorSuppressNonmax( responses, sizes, keypoints, border,
diff --git a/modules/flann/include/opencv2/flann/defines.h b/modules/flann/include/opencv2/flann/defines.h
index 13833b3c0b..f0264f74e3 100644
--- a/modules/flann/include/opencv2/flann/defines.h
+++ b/modules/flann/include/opencv2/flann/defines.h
@@ -107,6 +107,7 @@ enum flann_centers_init_t
     FLANN_CENTERS_RANDOM = 0,
     FLANN_CENTERS_GONZALES = 1,
     FLANN_CENTERS_KMEANSPP = 2,
+    FLANN_CENTERS_GROUPWISE = 3,
 
     // deprecated constants, should use the FLANN_CENTERS_* ones instead
     CENTERS_RANDOM = 0,
diff --git a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
index 710382053c..88af4e706e 100644
--- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
+++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
@@ -257,6 +257,84 @@ private:
     }
 
 
+    /**
+     * Chooses the initial centers in a way inspired by Gonzales (by Pierre-Emmanuel Viel):
+     * the first center is drawn with rand_int. For each further center the points list is scanned;
+     * a point is only evaluated if it is further from the already chosen centers (by a factor
+     * kSpeedUpFactor) than the current candidate. Its potential, i.e. the sum over all points of
+     * the distance to the nearest center if it were added, tells whether it is a good center of a
+     * local aggregation; the evaluated point with the lowest potential replaces the candidate.
+     *
+     * Used with KMeansIndex that computes centers coordinates by averaging positions of clusters
+     * points, this doesn't make a real difference with previous methods. But used with
+     * HierarchicalClusteringIndex, which picks centers among existing points instead of computing
+     * the barycenters, there is a real improvement.
+     *
+     * Params:
+     *     k = number of centers; dsindices / indices_length = dataset indices of the points
+     *     centers, centers_length = output: chosen dataset indices and how many were written
+     */
+    void GroupWiseCenterChooser(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
+    {
+        const float kSpeedUpFactor = 1.3f;
+
+        int n = indices_length;
+        // Per point: distance to its nearest already chosen center (released at the end)
+        DistanceType* closestDistSq = new DistanceType[n];
+
+        // Choose one random center and initialise closestDistSq with the distances to it
+        int index = rand_int(n);
+        assert(index >=0 && index < n);
+        centers[0] = dsindices[index];
+
+        for (int i = 0; i < n; i++) {
+            closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
+        }
+
+
+        // Choose each remaining center
+        int centerCount;
+        for (centerCount = 1; centerCount < k; centerCount++) {
+
+            // Scan every point for the best candidate; bestNewPot < 0 means none evaluated yet
+            double bestNewPot = -1;
+            int bestNewIndex = 0;
+            DistanceType furthest = 0;
+            for (index = 0; index < n; index++) {
+
+                // Only evaluate points further from the chosen centers than the current candidate
+                if( closestDistSq[index] > kSpeedUpFactor * (float)furthest ) {
+
+                    // Compute the potential obtained if this point were added as a center
+                    double newPot = 0;
+                    for (int i = 0; i < n; i++) {
+                        newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols)
+                                            , closestDistSq[i] );
+                    }
+
+                    // Keep the lowest potential seen so far (ties go to the later point)
+                    if ((bestNewPot < 0)||(newPot <= bestNewPot)) {
+                        bestNewPot = newPot;
+                        bestNewIndex = index;
+                        furthest = closestDistSq[index];
+                    }
+                }
+            }
+
+            // Add the selected center and refresh each point's distance to its nearest center
+            centers[centerCount] = dsindices[bestNewIndex];
+            for (int i = 0; i < n; i++) {
+                closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols)
+                                             , closestDistSq[i] );
+            }
+        }
+
+        centers_length = centerCount;
+
+        delete[] closestDistSq;
+    }
+
+
 public:
 
 
@@ -290,6 +368,9 @@ public:
         else if (centers_init_==FLANN_CENTERS_KMEANSPP) {
             chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp;
         }
+        else if (centers_init_==FLANN_CENTERS_GROUPWISE) {
+            chooseCenters = &HierarchicalClusteringIndex::GroupWiseCenterChooser;
+        }
         else {
             throw FLANNException("Unknown algorithm for choosing initial centers.");
         }
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 287a188807..ce70ea5c6c 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -2716,6 +2716,8 @@ struct mRGBA2RGBA
 
 #ifdef HAVE_OPENCL
 
+#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
+
 static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 {
     bool ok = false;
@@ -2729,6 +2731,17 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
     if (depth != CV_8U && depth != CV_16U && depth != CV_32F)
         return false;
 
+    cv::String opts = format("-D depth=%d -D scn=%d ", depth, scn);
+
+    ocl::Device dev = ocl::Device::getDefault();
+    int pxPerWIy = 1;
+    if (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU))
+    {
+        pxPerWIy = 4;
+    }
+    globalsize[1] = DIVUP(globalsize[1], pxPerWIy);
+    opts +=  format("-D PIX_PER_WI_Y=%d ", pxPerWIy);
+
     switch (code)
     {
     case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
@@ -2738,7 +2751,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3;
         bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR);
         k.create("RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=%d -D bidx=0 -D %s", depth, scn, dcn,
+                 opts + format("-D dcn=%d -D bidx=0 -D %s", dcn,
                         reverse ? "REVERSE" : "ORDER"));
         break;
     }
@@ -2752,7 +2765,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB ||
             code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5;
         k.create("RGB5x52RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=2 -D dcn=%d -D bidx=%d -D greenbits=%d", depth, dcn, bidx, greenbits));
+                 opts + format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, greenbits));
         break;
     }
     case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
@@ -2765,7 +2778,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
             code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5;
         dcn = 2;
         k.create("RGB2RGB5x5", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=2 -D bidx=%d -D greenbits=%d", depth, scn, bidx, greenbits));
+                 opts + format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, greenbits));
         break;
     }
     case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
@@ -2774,7 +2787,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         dcn = 1;
         int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5;
         k.create("BGR5x52Gray", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=2 -D dcn=1 -D bidx=0 -D greenbits=%d", depth, greenbits));
+                 opts + format("-D dcn=1 -D bidx=0 -D greenbits=%d", greenbits));
         break;
     }
     case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
@@ -2783,7 +2796,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         dcn = 2;
         int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5;
         k.create("Gray2BGR5x5", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=1 -D dcn=2 -D bidx=0 -D greenbits=%d", depth, greenbits));
+                 opts + format("-D dcn=2 -D bidx=0 -D greenbits=%d", greenbits));
         break;
     }
     case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
@@ -2793,8 +2806,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
         dcn = 1;
         k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d",
-                        depth, scn, bidx, stripeSize));
+                 opts + format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d",
+                               bidx, stripeSize));
         globalsize[0] = (src.cols + stripeSize-1)/stripeSize;
         break;
     }
@@ -2804,7 +2817,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         CV_Assert(scn == 1);
         dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
         k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D bidx=0 -D scn=1 -D dcn=%d", depth, dcn));
+                 opts + format("-D bidx=0 -D dcn=%d", dcn));
         break;
     }
     case COLOR_BGR2YUV:
@@ -2814,7 +2827,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         bidx = code == COLOR_RGB2YUV ? 0 : 2;
         dcn = 3;
         k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
+                 opts + format("-D dcn=3 -D bidx=%d", bidx));
         break;
     }
     case COLOR_YUV2BGR:
@@ -2824,7 +2837,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         CV_Assert(dcn == 3 || dcn == 4);
         bidx = code == COLOR_YUV2RGB ? 0 : 2;
         k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx));
+                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
         break;
     }
     case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12:
@@ -2837,7 +2850,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 
         dstSz = Size(sz.width, sz.height * 2 / 3);
         k.create("YUV2RGB_NV12", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=0 -D scn=1 -D dcn=%d -D bidx=%d", dcn, bidx));
+                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
         break;
     }
     case COLOR_BGR2YCrCb:
@@ -2847,7 +2860,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
         dcn = 3;
         k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
+                 opts + format("-D dcn=3 -D bidx=%d", bidx));
         break;
     }
     case COLOR_YCrCb2BGR:
@@ -2858,7 +2871,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
         bidx = code == COLOR_YCrCb2BGR ? 0 : 2;
         k.create("YCrCb2RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=%d -D bidx=%d", depth, scn, dcn, bidx));
+                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
         break;
     }
     case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
@@ -2904,7 +2917,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         dst = _dst.getUMat();
 
         k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
+                 opts + format("-D dcn=3 -D bidx=%d", bidx));
         if (k.empty())
             return false;
         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
@@ -2955,7 +2968,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         dst = _dst.getUMat();
 
         k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx));
+                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
         if (k.empty())
             return false;
         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
@@ -3010,8 +3023,9 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
             _dst.create(dstSz, CV_8UC3);
             dst = _dst.getUMat();
 
-            k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc, format("-D depth=%d -D hrange=%d -D bidx=%d -D dcn=3 -D scn=%d",
-                                                                      depth, hrange, bidx, scn));
+            k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc,
+                     opts + format("-D hrange=%d -D bidx=%d -D dcn=3",
+                                   hrange, bidx));
             if (k.empty())
                 return false;
 
@@ -3023,7 +3037,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         }
         else
             k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
-                     format("-D depth=%d -D hscale=%ff -D bidx=%d -D scn=%d -D dcn=3", depth, hrange*(1.f/360.f), bidx, scn));
+                     opts + format("-D hscale=%ff -D bidx=%d -D dcn=3",
+                                   hrange*(1.f/360.f), bidx));
         break;
     }
     case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
@@ -3041,8 +3056,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 
         String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB";
         k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d -D hrange=%d -D hscale=%ff",
-                        depth, dcn, bidx, hrange, 6.f/hrange));
+                 opts + format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
+                               dcn, bidx, hrange, 6.f/hrange));
         break;
     }
     case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:
@@ -3051,7 +3066,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         dcn = 4;
 
         k.create(code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D dcn=4 -D scn=4 -D bidx=3", depth));
+                 opts + "-D dcn=4 -D bidx=3");
         break;
     }
     case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
@@ -3063,8 +3078,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         dcn = 3;
 
         k.create("BGR2Lab", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D dcn=3 -D scn=%d -D bidx=%d%s",
-                        depth, scn, bidx, srgb ? " -D SRGB" : ""));
+                 opts + format("-D dcn=3 -D bidx=%d%s",
+                               bidx, srgb ? " -D SRGB" : ""));
         if (k.empty())
             return false;
 
@@ -3165,8 +3180,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB;
 
         k.create("Lab2BGR", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d%s",
-                        depth, dcn, bidx, srgb ? " -D SRGB" : ""));
+                 opts + format("-D dcn=%d -D bidx=%d%s",
+                               dcn, bidx, srgb ? " -D SRGB" : ""));
         if (k.empty())
             return false;
 
diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp
index 04dc7e2824..8e8484d6b7 100644
--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@@ -1175,6 +1175,48 @@ calcHist_8u( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
     }
 }
 
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+// Parallel body for a 1-D 8-bit histogram: each row stripe is binned by IPP, then merged atomically.
+class IPPCalcHistInvoker :
+    public ParallelLoopBody
+{
+public:
+    IPPCalcHistInvoker(const Mat & _src, Mat & _hist, AutoBuffer<Ipp32s> & _levels, Ipp32s _histSize, Ipp32s _low, Ipp32s _high, bool * _ok) :
+        ParallelLoopBody(), src(&_src), hist(&_hist), levels(&_levels), histSize(_histSize), low(_low), high(_high), ok(_ok)
+    {
+        *ok = true;
+    }
+    // Bins rows [range.start, range.end) into a private histogram, then adds it into *hist.
+    virtual void operator() (const Range & range) const
+    {
+        Mat phist(hist->size(), hist->type(), Scalar::all(0));
+
+        IppStatus status = ippiHistogramEven_8u_C1R(
+            src->data + src->step * range.start, (int)src->step, ippiSize(src->cols, range.end - range.start),
+            (Ipp32s *)phist.data, (Ipp32s *)*levels, histSize, low, high);
+        // A negative IPP status is an error: report it through *ok and leave *hist untouched.
+        if (status < 0)
+        {
+            *ok = false;
+            return;
+        }
+        // histSize is the number of levels (bins + 1); hist and phist hold only histSize - 1 bins.
+        for (int i = 0; i < histSize - 1; ++i)
+            CV_XADD((int *)(hist->data + i * hist->step), *(int *)(phist.data + i * phist.step));
+    }
+
+private:
+    const Mat * src;
+    Mat * hist;
+    AutoBuffer<Ipp32s> * levels;
+    Ipp32s histSize, low, high; // histSize is passed to IPP as nLevels (number of bins + 1)
+    bool * ok;
+
+    const IPPCalcHistInvoker & operator = (const IPPCalcHistInvoker & );
+};
+
+#endif
+
 }
 
 void cv::calcHist( const Mat* images, int nimages, const int* channels,
@@ -1190,6 +1232,32 @@ void cv::calcHist( const Mat* images, int nimages, const int* channels,
     Mat hist = _hist.getMat(), ihist = hist;
     ihist.flags = (ihist.flags & ~CV_MAT_TYPE_MASK)|CV_32S;
 
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+    if (nimages == 1 && images[0].type() == CV_8UC1 && dims == 1 && channels &&
+            channels[0] == 0 && mask.empty() && images[0].dims <= 2 &&
+            !accumulate && uniform)
+    {
+        ihist.setTo(Scalar::all(0));
+        AutoBuffer<Ipp32s> levels(histSize[0] + 1);
+
+        bool ok = true;
+        const Mat & src = images[0];
+        int nstripes = std::min<int>(8, src.total() / (1 << 16));
+#ifdef HAVE_CONCURRENCY
+        nstripes = 1;
+#endif
+        IPPCalcHistInvoker invoker(src, ihist, levels, histSize[0] + 1, (Ipp32s)ranges[0][0], (Ipp32s)ranges[0][1], &ok);
+        Range range(0, src.rows);
+        parallel_for_(range, invoker, nstripes);
+
+        if (ok)
+        {
+            ihist.convertTo(hist, CV_32F);
+            return;
+        }
+    }
+#endif
+
     if( !accumulate || histdata != hist.data )
         hist = Scalar(0.);
     else
@@ -1477,7 +1545,7 @@ void cv::calcHist( InputArrayOfArrays images, const std::vector<int>& channels,
     CV_OCL_RUN(images.total() == 1 && channels.size() == 1 && images.channels(0) == 1 &&
                channels[0] == 0 && images.isUMatVector() && mask.empty() && !accumulate &&
                histSize.size() == 1 && histSize[0] == BINS && ranges.size() == 2 &&
-               ranges[0] == 0 && ranges[1] == 256,
+               ranges[0] == 0 && ranges[1] == BINS,
                ocl_calcHist(images, hist))
 
     int i, dims = (int)histSize.size(), rsz = (int)ranges.size(), csz = (int)channels.size();
diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp
index a8cfc96132..f3fb6962da 100644
--- a/modules/imgproc/src/moments.cpp
+++ b/modules/imgproc/src/moments.cpp
@@ -466,6 +466,61 @@ cv::Moments cv::moments( InputArray _src, bool binary )
         if( cn > 1 )
             CV_Error( CV_StsBadArg, "Invalid image type (must be single-channel)" );
 
+#if (IPP_VERSION_X100 >= 801)
+        if (!binary)
+        {
+            IppiSize roi = {mat.cols, mat.rows};
+            IppiMomentState_64f *moment;
+            // ippiMomentInitAlloc_64f and ippiMomentFree_64f are deprecated in IPP 8.1, but there is no other
+            // way to initialize IppiMomentState_64f. Once GetStateSize and Init functions become available,
+            // this code has to be changed to use them.
+            if (0 <= ippiMomentInitAlloc_64f(&moment, ippAlgHintAccurate))
+            {
+                IppStatus sts = (IppStatus)(-1);
+                if (depth == CV_8U)
+                    sts = ippiMoments64f_8u_C1R((const Ipp8u *)mat.data, (int)mat.step, roi, moment);
+                else if( depth == CV_16U )
+                    sts = ippiMoments64f_16u_C1R((const Ipp16u *)mat.data, (int)mat.step, roi, moment);
+                else if( depth == CV_32F )
+                    sts = ippiMoments64f_32f_C1R((const Ipp32f *)mat.data, (int)mat.step, roi, moment);
+                if (0 <= sts)
+                {
+                    IppiPoint point = {0, 0};
+                    ippiGetSpatialMoment_64f(moment, 0, 0, 0, point, &m.m00);
+                    ippiGetSpatialMoment_64f(moment, 1, 0, 0, point, &m.m10);
+                    ippiGetSpatialMoment_64f(moment, 0, 1, 0, point, &m.m01);
+
+                    ippiGetSpatialMoment_64f(moment, 2, 0, 0, point, &m.m20);
+                    ippiGetSpatialMoment_64f(moment, 1, 1, 0, point, &m.m11);
+                    ippiGetSpatialMoment_64f(moment, 0, 2, 0, point, &m.m02);
+
+                    ippiGetSpatialMoment_64f(moment, 3, 0, 0, point, &m.m30);
+                    ippiGetSpatialMoment_64f(moment, 2, 1, 0, point, &m.m21);
+                    ippiGetSpatialMoment_64f(moment, 1, 2, 0, point, &m.m12);
+                    ippiGetSpatialMoment_64f(moment, 0, 3, 0, point, &m.m03);
+                    ippiGetCentralMoment_64f(moment, 2, 0, 0, &m.mu20);
+                    ippiGetCentralMoment_64f(moment, 1, 1, 0, &m.mu11);
+                    ippiGetCentralMoment_64f(moment, 0, 2, 0, &m.mu02);
+                    ippiGetCentralMoment_64f(moment, 3, 0, 0, &m.mu30);
+                    ippiGetCentralMoment_64f(moment, 2, 1, 0, &m.mu21);
+                    ippiGetCentralMoment_64f(moment, 1, 2, 0, &m.mu12);
+                    ippiGetCentralMoment_64f(moment, 0, 3, 0, &m.mu03);
+                    ippiGetNormalizedCentralMoment_64f(moment, 2, 0, 0, &m.nu20);
+                    ippiGetNormalizedCentralMoment_64f(moment, 1, 1, 0, &m.nu11);
+                    ippiGetNormalizedCentralMoment_64f(moment, 0, 2, 0, &m.nu02);
+                    ippiGetNormalizedCentralMoment_64f(moment, 3, 0, 0, &m.nu30);
+                    ippiGetNormalizedCentralMoment_64f(moment, 2, 1, 0, &m.nu21);
+                    ippiGetNormalizedCentralMoment_64f(moment, 1, 2, 0, &m.nu12);
+                    ippiGetNormalizedCentralMoment_64f(moment, 0, 3, 0, &m.nu03);
+
+                    ippiMomentFree_64f(moment);
+                    return m;
+                }
+                ippiMomentFree_64f(moment);
+            }
+        }
+#endif
+
         if( binary || depth == CV_8U )
             func = momentsInTile<uchar, int, int>;
         else if( depth == CV_16U )
diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl
index 115bfbd7ae..0034395458 100644
--- a/modules/imgproc/src/opencl/cvtcolor.cl
+++ b/modules/imgproc/src/opencl/cvtcolor.cl
@@ -99,64 +99,81 @@ enum
 #define hrange 0
 #endif
 
+#if bidx == 0
+#define R_COMP z
+#define G_COMP y
+#define B_COMP x
+#elif bidx == 2
+#define R_COMP x
+#define G_COMP y
+#define B_COMP z
+#elif bidx == 3
+// The only kernel that uses bidx == 3 doesn't use these macros.
+// But we still need to make the compiler happy.
+#define R_COMP w
+#define G_COMP w
+#define B_COMP w
+#endif
+
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
+
+#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
+
 ///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
 
 __kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset,
                        __global uchar* dstptr, int dststep, int dstoffset,
                        int rows, int cols)
 {
-#if 1
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
-        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
-#ifdef DEPTH_5
-        dst[0] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f;
-#else
-        dst[0] = (DATA_TYPE)CV_DESCALE((src[bidx] * B2Y + src[1] * G2Y + src[(bidx^2)] * R2Y), yuv_shift);
-#endif
-    }
-#else
-    const int x_min = get_global_id(0)*STRIPE_SIZE;
-    const int x_max = min(x_min + STRIPE_SIZE, cols);
-    const int y = get_global_id(1);
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
 
-    if( y < rows )
+    if (x < cols)
     {
-        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr +
-                                        mad24(y, srcstep, srcoffset)) + x_min*scn;
-        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset));
-        int x;
-        for( x = x_min; x < x_max; x++, src += scn )
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+                DATA_TYPE_4 src_pix = vload4(0, src);
 #ifdef DEPTH_5
-        dst[x] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f;
+                dst[0] = src_pix.B_COMP * 0.114f + src_pix.G_COMP * 0.587f + src_pix.R_COMP * 0.299f;
 #else
-        dst[x] = (DATA_TYPE)(mad24(src[bidx], B2Y, mad24(src[1], G2Y,
-                        mad24(src[(bidx^2)], R2Y, 1 << (yuv_shift-1)))) >> yuv_shift);
+                dst[0] = (DATA_TYPE)CV_DESCALE((src_pix.B_COMP * B2Y + src_pix.G_COMP * G2Y + src_pix.R_COMP * R2Y), yuv_shift);
 #endif
+            }
+            ++y;
+        }
     }
-#endif
 }
 
 __kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
                        __global uchar* dstptr, int dststep, int dstoffset,
                        int rows, int cols)
 {
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
 
-    if (y < rows && x < cols)
+    if (x < cols)
     {
-        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
-        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
-        DATA_TYPE val = src[0];
-        dst[0] = dst[1] = dst[2] = val;
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+                DATA_TYPE val = src[0];
+                dst[0] = dst[1] = dst[2] = val;
 #if dcn == 4
-        dst[3] = MAX_NUM;
+                dst[3] = MAX_NUM;
 #endif
+            }
+            ++y;
+        }
     }
 }
 
@@ -170,30 +187,39 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset,
                       int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
 
-    if (y < rows && x < cols)
+    if (x < cols)
     {
-        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
-        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
-        DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2];
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+                DATA_TYPE_4 src_pix = vload4(0, src);
+                DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP;
 
 #ifdef DEPTH_5
-        __constant float * coeffs = c_RGB2YUVCoeffs_f;
-        const DATA_TYPE Y  = b * coeffs[0] + g * coeffs[1] + r * coeffs[2];
-        const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX;
-        const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX;
+                __constant float * coeffs = c_RGB2YUVCoeffs_f;
+                const DATA_TYPE Y  = b * coeffs[0] + g * coeffs[1] + r * coeffs[2];
+                const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX;
+                const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX;
 #else
-        __constant int * coeffs = c_RGB2YUVCoeffs_i;
-        const int delta = HALF_MAX * (1 << yuv_shift);
-        const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift);
-        const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift);
-        const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift);
+                __constant int * coeffs = c_RGB2YUVCoeffs_i;
+                const int delta = HALF_MAX * (1 << yuv_shift);
+                const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift);
+                const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift);
+                const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift);
 #endif
 
-        dst[0] = SAT_CAST( Y );
-        dst[1] = SAT_CAST( U );
-        dst[2] = SAT_CAST( V );
+                dst[0] = SAT_CAST( Y );
+                dst[1] = SAT_CAST( U );
+                dst[2] = SAT_CAST( V );
+            }
+            ++y;
+        }
     }
 }
 
@@ -205,32 +231,41 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
                       int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
 
-    if (y < rows && x < cols)
+    if (x < cols)
     {
-        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
-        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
-        DATA_TYPE Y = src[0], U = src[1], V = src[2];
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+                DATA_TYPE_4 src_pix = vload4(0, src);
+                DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z;
 
 #ifdef DEPTH_5
-        __constant float * coeffs = c_YUV2RGBCoeffs_f;
-        const float r = Y + (V - HALF_MAX) * coeffs[3];
-        const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1];
-        const float b = Y + (U - HALF_MAX) * coeffs[0];
+                __constant float * coeffs = c_YUV2RGBCoeffs_f;
+                const float r = Y + (V - HALF_MAX) * coeffs[3];
+                const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1];
+                const float b = Y + (U - HALF_MAX) * coeffs[0];
 #else
-        __constant int * coeffs = c_YUV2RGBCoeffs_i;
-        const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift);
-        const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
-        const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift);
+                __constant int * coeffs = c_YUV2RGBCoeffs_i;
+                const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift);
+                const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
+                const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift);
 #endif
 
-        dst[bidx] = SAT_CAST( b );
-        dst[1] = SAT_CAST( g );
-        dst[bidx^2] = SAT_CAST( r );
+                dst[bidx] = SAT_CAST( b );
+                dst[1] = SAT_CAST( g );
+                dst[bidx^2] = SAT_CAST( r );
 #if dcn == 4
-        dst[3] = MAX_NUM;
+                dst[3] = MAX_NUM;
 #endif
+            }
+            ++y;
+        }
     }
 }
 
@@ -246,58 +281,66 @@ __kernel void YUV2RGB_NV12(__global const uchar* srcptr, int srcstep, int srcoff
                             int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y; // first block row of this work-item; the loop below advances y PIX_PER_WI_Y times
 
-    if (y < rows / 2 && x < cols / 2 )
+    if (x < cols / 2) // x indexes pairs of columns; the row bound is checked per iteration below
     {
-        __global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset);
-        __global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset);
-        __global uchar*       dst1 = dstptr + mad24(y << 1, dststep, x * (dcn<<1) + dstoffset);
-        __global uchar*       dst2 = dstptr + mad24((y << 1) + 1, dststep, x * (dcn<<1) + dstoffset);
-
-        int Y1 = ysrc[0];
-        int Y2 = ysrc[1];
-        int Y3 = ysrc[srcstep];
-        int Y4 = ysrc[srcstep + 1];
-
-        int U  = usrc[0] - 128;
-        int V  = usrc[1] - 128;
-
-        int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V;
-        int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
-        int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U;
-
-        Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY;
-        dst1[2 - bidx]     = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT);
-        dst1[1]        = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT);
-        dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT);
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // each iteration converts one 2x2 pixel block
+        {
+            if (y < rows / 2 ) // skip block rows past the bottom edge
+            {
+                __global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset);
+                __global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset); // chroma is read from row (rows + y): U at [0], V at [1]
+                __global uchar*       dst1 = dstptr + mad24(y << 1, dststep, x * (dcn<<1) + dstoffset);
+                __global uchar*       dst2 = dstptr + mad24((y << 1) + 1, dststep, x * (dcn<<1) + dstoffset); // second output row of the 2x2 block
+
+                int Y1 = ysrc[0];
+                int Y2 = ysrc[1];
+                int Y3 = ysrc[srcstep]; // luma one source row below Y1
+                int Y4 = ysrc[srcstep + 1];
+
+                int U  = usrc[0] - 128;
+                int V  = usrc[1] - 128;
+
+                int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V; // chroma terms are shared by all four pixels; the 1 << (SHIFT - 1) term rounds the later >> SHIFT
+                int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
+                int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U;
+
+                Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY;
+                dst1[2 - bidx]     = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT);
+                dst1[1]        = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT);
+                dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT);
 #if dcn == 4
-        dst1[3]        = 255;
+                dst1[3]        = 255;
 #endif
 
-        Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY;
-        dst1[dcn + 2 - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT);
-        dst1[dcn + 1]        = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT);
-        dst1[dcn + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT);
+                Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY;
+                dst1[dcn + 2 - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT); // second pixel starts dcn bytes after the first
+                dst1[dcn + 1]        = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT);
+                dst1[dcn + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT);
 #if dcn == 4
-        dst1[7]        = 255;
+                dst1[7]        = 255;
 #endif
 
-        Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY;
-        dst2[2 - bidx]     = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT);
-        dst2[1]        = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT);
-        dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT);
+                Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY;
+                dst2[2 - bidx]     = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT);
+                dst2[1]        = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT);
+                dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT);
 #if dcn == 4
-        dst2[3]        = 255;
+                dst2[3]        = 255;
 #endif
 
-        Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY;
-        dst2[dcn + 2 - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT);
-        dst2[dcn + 1]        = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT);
-        dst2[dcn + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT);
+                Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY;
+                dst2[dcn + 2 - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT);
+                dst2[dcn + 1]        = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT);
+                dst2[dcn + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT);
 #if dcn == 4
-        dst2[7]        = 255;
+                dst2[7]        = 255;
 #endif
+            }
+            ++y; // advance to the next block row even when this one was skipped
+        }
     }
 }
 
@@ -311,30 +354,39 @@ __kernel void RGB2YCrCb(__global const uchar* srcptr, int srcstep, int srcoffset
                         int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows y .. y + PIX_PER_WI_Y - 1
 
-    if (y < rows && x < cols)
+    if (x < cols) // column guard only; the row bound is checked per iteration below
     {
-        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
-        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
-        DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2];
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column x) per iteration
+        {
+            if (y < rows) // skip rows past the bottom edge
+            {
+                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+                DATA_TYPE_4 src_pix = vload4(0, src); // reads 4 elements starting at src; NOTE(review): if scn is 3 the 4th element lies beyond this pixel - confirm the read is always in bounds
+                DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP; // B_COMP/G_COMP/R_COMP are defined outside this hunk; presumably they mirror the old src[bidx]/src[1]/src[bidx^2] - confirm
 
 #ifdef DEPTH_5
-        __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
-        DATA_TYPE Y = b * coeffs[2] + g * coeffs[1] + r * coeffs[0];
-        DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX;
-        DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX;
+                __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
+                DATA_TYPE Y = b * coeffs[2] + g * coeffs[1] + r * coeffs[0];
+                DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX;
+                DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX;
 #else
-        __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
-        int delta = HALF_MAX * (1 << yuv_shift);
-        int Y =  CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift);
-        int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift);
-        int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift);
+                __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
+                int delta = HALF_MAX * (1 << yuv_shift); // HALF_MAX pre-scaled by 2^yuv_shift because it is added inside the CV_DESCALE argument
+                int Y =  CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift);
+                int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift);
+                int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift);
 #endif
 
-        dst[0] = SAT_CAST( Y );
-        dst[1] = SAT_CAST( Cr );
-        dst[2] = SAT_CAST( Cb );
+                dst[0] = SAT_CAST( Y );
+                dst[1] = SAT_CAST( Cr );
+                dst[2] = SAT_CAST( Cb );
+            }
+            ++y; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -346,35 +398,44 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
                         int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows y .. y + PIX_PER_WI_Y - 1
 
-    if (y < rows && x < cols)
+    if (x < cols) // column guard only; the row bound is checked per iteration below
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-        __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_idx);
-        __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_idx);
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column x) per iteration
+        {
+            if (y < rows) // skip rows past the bottom edge
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+                __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_idx);
+                __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_idx);
 
-        DATA_TYPE y = srcptr[0], cr = srcptr[1], cb = srcptr[2];
+                DATA_TYPE_4 src_pix = vload4(0, srcptr); // reads 4 elements starting at srcptr; NOTE(review): if scn is 3 the 4th element lies beyond this pixel - confirm the read is always in bounds
+                DATA_TYPE yp = src_pix.x, cr = src_pix.y, cb = src_pix.z; // luma is named yp so it does not shadow the row index y that the loop increments
 
 #ifdef DEPTH_5
-        __constant float * coeff = c_YCrCb2RGBCoeffs_f;
-        float r = y + coeff[0] * (cr - HALF_MAX);
-        float g = y + coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX);
-        float b = y + coeff[3] * (cb - HALF_MAX);
+                __constant float * coeff = c_YCrCb2RGBCoeffs_f;
+                float r = yp + coeff[0] * (cr - HALF_MAX);
+                float g = yp + coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX);
+                float b = yp + coeff[3] * (cb - HALF_MAX);
 #else
-        __constant int * coeff = c_YCrCb2RGBCoeffs_i;
-        int r = y + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift);
-        int g = y + CV_DESCALE(coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX), yuv_shift);
-        int b = y + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift);
+                __constant int * coeff = c_YCrCb2RGBCoeffs_i;
+                int r = yp + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift);
+                int g = yp + CV_DESCALE(coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX), yuv_shift);
+                int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift);
 #endif
 
-        dstptr[(bidx^2)] = SAT_CAST(r);
-        dstptr[1] = SAT_CAST(g);
-        dstptr[bidx] = SAT_CAST(b);
+                dstptr[(bidx^2)] = SAT_CAST(r);
+                dstptr[1] = SAT_CAST(g);
+                dstptr[bidx] = SAT_CAST(b);
 #if dcn == 4
-        dstptr[3] = MAX_NUM;
+                dstptr[3] = MAX_NUM;
 #endif
+            }
+            ++y; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -385,30 +446,39 @@ __kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offse
                       int rows, int cols, __constant COEFF_TYPE * coeffs)
 {
     int dx = get_global_id(0);
-    int dy = get_global_id(1);
+    int dy = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows dy .. dy + PIX_PER_WI_Y - 1
 
-    if (dy < rows && dx < cols)
+    if (dx < cols) // column guard only; the row bound is checked per iteration below
     {
-        int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
-        int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column dx) per iteration
+        {
+            if (dy < rows) // skip rows past the bottom edge
+            {
+                int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
+                int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
 
-        __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
-        __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
+                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
+                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
 
-        DATA_TYPE r = src[0], g = src[1], b = src[2];
+                DATA_TYPE_4 src_pix = vload4(0, src); // reads 4 elements starting at src; NOTE(review): if scn is 3 the 4th element lies beyond this pixel - confirm the read is always in bounds
+                DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z; // fixed element order 0,1,2, same as the src[0..2] reads this replaces
 
 #ifdef DEPTH_5
-        float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2];
-        float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5];
-        float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8];
+                float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2]; // coeffs is used as a row-major 3x3 matrix applied to (r, g, b)
+                float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5];
+                float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8];
 #else
-        int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift);
-        int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift);
-        int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift);
+                int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift);
+                int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift);
+                int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift);
 #endif
-        dst[0] = SAT_CAST(x);
-        dst[1] = SAT_CAST(y);
-        dst[2] = SAT_CAST(z);
+                dst[0] = SAT_CAST(x);
+                dst[1] = SAT_CAST(y);
+                dst[2] = SAT_CAST(z);
+            }
+            ++dy; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -417,33 +487,42 @@ __kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offse
                       int rows, int cols, __constant COEFF_TYPE * coeffs)
 {
     int dx = get_global_id(0);
-    int dy = get_global_id(1);
+    int dy = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows dy .. dy + PIX_PER_WI_Y - 1
 
-    if (dy < rows && dx < cols)
+    if (dx < cols) // column guard only; the row bound is checked per iteration below
     {
-        int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
-        int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column dx) per iteration
+        {
+            if (dy < rows) // skip rows past the bottom edge
+            {
+                int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
+                int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
 
-        __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
-        __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
+                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
+                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
 
-        DATA_TYPE x = src[0], y = src[1], z = src[2];
+                DATA_TYPE_4 src_pix = vload4(0, src); // reads 4 elements starting at src; NOTE(review): if scn is 3 the 4th element lies beyond this pixel - confirm the read is always in bounds
+                DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z; // pixel components; the work-item coordinates are dx/dy, so nothing is shadowed here
 
 #ifdef DEPTH_5
-        float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2];
-        float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5];
-        float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8];
+                float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2]; // coeffs is used as a row-major 3x3 matrix applied to (x, y, z)
+                float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5];
+                float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8];
 #else
-        int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift);
-        int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift);
-        int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift);
+                int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift);
+                int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift);
+                int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift);
 #endif
-        dst[0] = SAT_CAST(b);
-        dst[1] = SAT_CAST(g);
-        dst[2] = SAT_CAST(r);
+                dst[0] = SAT_CAST(b); // first row of coeffs lands in dst[0], third row in dst[2]
+                dst[1] = SAT_CAST(g);
+                dst[2] = SAT_CAST(r);
 #if dcn == 4
-        dst[3] = MAX_NUM;
+                dst[3] = MAX_NUM;
 #endif
+            }
+            ++dy; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -454,33 +533,42 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
                   int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows y .. y + PIX_PER_WI_Y - 1
 
-    if (y < rows && x < cols)
+    if (x < cols) // column guard only; the row bound is checked per iteration below
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column x) per iteration
+        {
+            if (y < rows) // skip rows past the bottom edge
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
 
-        __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
-        __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
+                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
+                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
+                DATA_TYPE_4 src_pix = vload4(0, src); // reads 4 elements starting at src; NOTE(review): if scn is 3 the 4th element lies beyond this pixel - confirm the read is always in bounds
 
 #ifdef REVERSE
-        dst[0] = src[2];
-        dst[1] = src[1];
-        dst[2] = src[0];
+                dst[0] = src_pix.z; // REVERSE swaps the first and third channels
+                dst[1] = src_pix.y;
+                dst[2] = src_pix.x;
 #else
-        dst[0] = src[0];
-        dst[1] = src[1];
-        dst[2] = src[2];
+                dst[0] = src_pix.x;
+                dst[1] = src_pix.y;
+                dst[2] = src_pix.z;
 #endif
 
 #if dcn == 4
 #if scn == 3
-        dst[3] = MAX_NUM;
+                dst[3] = MAX_NUM;
 #else
-        dst[3] = src[3];
+                dst[3] = src_pix.w; // 4th source channel is already in src_pix; no second global read of src[3]
 #endif
 #endif
+            }
+            ++y; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -491,31 +579,39 @@ __kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset
                          int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows y .. y + PIX_PER_WI_Y - 1
 
-    if (y < rows && x < cols)
+    if (x < cols) // column guard only; the row bound is checked per iteration below
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-        ushort t = *((__global const ushort*)(src + src_idx));
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column x) per iteration
+        {
+            if (y < rows) // skip rows past the bottom edge
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+                ushort t = *((__global const ushort*)(src + src_idx)); // the packed source pixel is read as one 16-bit word
 
 #if greenbits == 6
-        dst[dst_idx + bidx] = (uchar)(t << 3);
-        dst[dst_idx + 1] = (uchar)((t >> 3) & ~3);
-        dst[dst_idx + (bidx^2)] = (uchar)((t >> 8) & ~7);
+                dst[dst_idx + bidx] = (uchar)(t << 3); // low 5 bits moved to the top of the byte
+                dst[dst_idx + 1] = (uchar)((t >> 3) & ~3); // 6-bit field: the low 2 bits of the byte are cleared
+                dst[dst_idx + (bidx^2)] = (uchar)((t >> 8) & ~7); // 5-bit field: the low 3 bits of the byte are cleared
 #else
-        dst[dst_idx + bidx] = (uchar)(t << 3);
-        dst[dst_idx + 1] = (uchar)((t >> 2) & ~7);
-        dst[dst_idx + (bidx^2)] = (uchar)((t >> 7) & ~7);
+                dst[dst_idx + bidx] = (uchar)(t << 3);
+                dst[dst_idx + 1] = (uchar)((t >> 2) & ~7);
+                dst[dst_idx + (bidx^2)] = (uchar)((t >> 7) & ~7);
 #endif
 
 #if dcn == 4
 #if greenbits == 6
-        dst[dst_idx + 3] = 255;
+                dst[dst_idx + 3] = 255;
 #else
-        dst[dst_idx + 3] = t & 0x8000 ? 255 : 0;
+                dst[dst_idx + 3] = t & 0x8000 ? 255 : 0; // top bit of the word decides the 4th output channel
 #endif
 #endif
+            }
+            ++y; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -524,21 +620,30 @@ __kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset
                          int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows y .. y + PIX_PER_WI_Y - 1
 
-    if (y < rows && x < cols)
+    if (x < cols) // column guard only; the row bound is checked per iteration below
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column x) per iteration
+        {
+            if (y < rows) // skip rows past the bottom edge
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+                uchar4 src_pix = vload4(0, src + src_idx); // reads 4 bytes starting at the pixel; NOTE(review): if scn is 3 the 4th byte lies beyond this pixel - confirm the read is always in bounds
 
 #if greenbits == 6
-            *((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~3) << 3)|((src[src_idx + (bidx^2)]&~7) << 8));
+                *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8)); // 5 + 6 + 5 bits packed into one 16-bit word
 #elif scn == 3
-            *((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~7) << 2)|((src[src_idx + (bidx^2)]&~7) << 7));
+                *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7)); // 5 + 5 + 5 bits; the top bit stays 0
 #else
-            *((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~7) << 2)|
-                ((src[src_idx + (bidx^2)]&~7) << 7)|(src[src_idx + 3] ? 0x8000 : 0));
+                *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
+                    ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0)); // a non-zero 4th channel sets the top bit
 #endif
+            }
+            ++y; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -549,23 +654,31 @@ __kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offse
                           int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows y .. y + PIX_PER_WI_Y - 1
 
-    if (y < rows && x < cols)
+    if (x < cols) // column guard only; the row bound is checked per iteration below
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-        int t = *((__global const ushort*)(src + src_idx));
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column x) per iteration
+        {
+            if (y < rows) // skip rows past the bottom edge
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x); // destination advances one byte per pixel
+                int t = *((__global const ushort*)(src + src_idx)); // the packed source pixel is read as one 16-bit word
 
 #if greenbits == 6
-        dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
-                                         ((t >> 3) & 0xfc)*G2Y +
-                                         ((t >> 8) & 0xf8)*R2Y, yuv_shift);
+                dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + // each field is expanded to an 8-bit value, then weighted
+                                                 ((t >> 3) & 0xfc)*G2Y +
+                                                 ((t >> 8) & 0xf8)*R2Y, yuv_shift);
 #else
-        dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
-                                         ((t >> 2) & 0xf8)*G2Y +
-                                         ((t >> 7) & 0xf8)*R2Y, yuv_shift);
+                dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
+                                                 ((t >> 2) & 0xf8)*G2Y +
+                                                 ((t >> 7) & 0xf8)*R2Y, yuv_shift);
 #endif
+            }
+            ++y; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -574,20 +687,28 @@ __kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offse
                           int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows y .. y + PIX_PER_WI_Y - 1
 
-    if (y < rows && x < cols)
+    if (x < cols) // column guard only; the row bound is checked per iteration below
     {
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-        int t = src[src_idx];
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column x) per iteration
+        {
+            if (y < rows) // skip rows past the bottom edge
+            {
+                int src_idx = mad24(y, src_step, src_offset + x); // source advances one byte per pixel
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+                int t = src[src_idx];
 
 #if greenbits == 6
-        *((__global ushort*)(dst + dst_idx)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
+                *((__global ushort*)(dst + dst_idx)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8)); // the same gray value fills the 5-, 6- and 5-bit fields
 #else
-        t >>= 3;
-        *((__global ushort*)(dst + dst_idx)) = (ushort)(t|(t << 5)|(t << 10));
+                t >>= 3; // keep the top 5 bits of the gray value
+                *((__global ushort*)(dst + dst_idx)) = (ushort)(t|(t << 5)|(t << 10)); // replicate them into three 5-bit fields
 #endif
+            }
+            ++y; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -608,36 +729,45 @@ __kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
                       __constant int * sdiv_table, __constant int * hdiv_table)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows y .. y + PIX_PER_WI_Y - 1
 
-    if (y < rows && x < cols)
+    if (x < cols) // column guard only; the row bound is checked per iteration below
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
-        int b = src[src_idx + bidx], g = src[src_idx + 1], r = src[src_idx + (bidx^2)];
-        int h, s, v = b;
-        int vmin = b, diff;
-        int vr, vg;
-
-        v = max( v, g );
-        v = max( v, r );
-        vmin = min( vmin, g );
-        vmin = min( vmin, r );
-
-        diff = v - vmin;
-        vr = v == r ? -1 : 0;
-        vg = v == g ? -1 : 0;
-
-        s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
-        h = (vr & (g - b)) +
-            (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
-        h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
-        h += h < 0 ? hrange : 0;
-
-        dst[dst_idx] = convert_uchar_sat_rte(h);
-        dst[dst_idx + 1] = (uchar)s;
-        dst[dst_idx + 2] = (uchar)v;
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column x) per iteration
+        {
+            if (y < rows) // skip rows past the bottom edge
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+                uchar4 src_pix = vload4(0, src + src_idx); // reads 4 bytes starting at the pixel; NOTE(review): if scn is 3 the 4th byte lies beyond this pixel - confirm the read is always in bounds
+
+                int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP; // B_COMP/G_COMP/R_COMP are defined outside this hunk; presumably they mirror the old bidx-based indexing - confirm
+                int h, s, v = b;
+                int vmin = b, diff;
+                int vr, vg;
+
+                v = max( v, g ); // v ends up as the largest channel, vmin as the smallest
+                v = max( v, r );
+                vmin = min( vmin, g );
+                vmin = min( vmin, r );
+
+                diff = v - vmin;
+                vr = v == r ? -1 : 0; // all-ones mask when r is the maximum; used for the branch-free select below
+                vg = v == g ? -1 : 0;
+
+                s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift; // table lookup indexed by v, rounded by the 1 << (hsv_shift-1) term
+                h = (vr & (g - b)) +
+                    (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); // exactly one of the three hue numerators survives the masks
+                h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
+                h += h < 0 ? hrange : 0; // a negative hue is shifted up by hrange
+
+                dst[dst_idx] = convert_uchar_sat_rte(h);
+                dst[dst_idx + 1] = (uchar)s;
+                dst[dst_idx + 2] = (uchar)v;
+            }
+            ++y; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -646,51 +776,60 @@ __kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
                       int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows y .. y + PIX_PER_WI_Y - 1
 
-    if (y < rows && x < cols)
+    if (x < cols) // column guard only; the row bound is checked per iteration below
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
-        float h = src[src_idx], s = src[src_idx + 1]*(1/255.f), v = src[src_idx + 2]*(1/255.f);
-        float b, g, r;
-
-        if (s != 0)
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column x) per iteration
         {
-            float tab[4];
-            int sector;
-            h *= hscale;
-            if( h < 0 )
-                do h += 6; while( h < 0 );
-            else if( h >= 6 )
-                do h -= 6; while( h >= 6 );
-            sector = convert_int_sat_rtn(h);
-            h -= sector;
-            if( (unsigned)sector >= 6u )
+            if (y < rows) // skip rows past the bottom edge
             {
-                sector = 0;
-                h = 0.f;
-            }
-
-            tab[0] = v;
-            tab[1] = v*(1.f - s);
-            tab[2] = v*(1.f - s*h);
-            tab[3] = v*(1.f - s*(1.f - h));
-
-            b = tab[sector_data[sector][0]];
-            g = tab[sector_data[sector][1]];
-            r = tab[sector_data[sector][2]];
-        }
-        else
-            b = g = r = v;
-
-        dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
-        dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
-        dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+                uchar4 src_pix = vload4(0, src + src_idx); // reads 4 bytes starting at the pixel; NOTE(review): if scn is 3 the 4th byte lies beyond this pixel - confirm the read is always in bounds
+
+                float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f); // s and v are byte values scaled by 1/255; h is scaled by hscale below
+                float b, g, r;
+
+                if (s != 0)
+                {
+                    float tab[4];
+                    int sector;
+                    h *= hscale;
+                    if( h < 0 )
+                        do h += 6; while( h < 0 );
+                    else if( h >= 6 )
+                        do h -= 6; while( h >= 6 );
+                    sector = convert_int_sat_rtn(h); // rounds toward negative infinity, i.e. the integer part of the wrapped hue
+                    h -= sector; // h keeps only the fractional position inside the sector
+                    if( (unsigned)sector >= 6u ) // defensive clamp for an out-of-range sector
+                    {
+                        sector = 0;
+                        h = 0.f;
+                    }
+
+                    tab[0] = v; // four candidate channel values; sector_data (defined outside this hunk) selects which one goes to b, g and r
+                    tab[1] = v*(1.f - s);
+                    tab[2] = v*(1.f - s*h);
+                    tab[3] = v*(1.f - s*(1.f - h));
+
+                    b = tab[sector_data[sector][0]];
+                    g = tab[sector_data[sector][1]];
+                    r = tab[sector_data[sector][2]];
+                }
+                else
+                    b = g = r = v; // zero saturation: all three channels equal v
+
+                dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f); // scale back to bytes with round-to-nearest-even and saturation
+                dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
+                dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
 #if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
+                dst[dst_idx + 3] = MAX_NUM;
 #endif
+            }
+            ++y; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -701,42 +840,51 @@ __kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset
                       int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y; // first row of this work-item; it covers rows y .. y + PIX_PER_WI_Y - 1
 
-    if (y < rows && x < cols)
+    if (x < cols) // column guard only; the row bound is checked per iteration below
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
-        __global const float * src = (__global const float *)(srcptr + src_idx);
-        __global float * dst = (__global float *)(dstptr + dst_idx);
-
-        float b = src[bidx], g = src[1], r = src[bidx^2];
-        float h, s, v;
-
-        float vmin, diff;
-
-        v = vmin = r;
-        if( v < g ) v = g;
-        if( v < b ) v = b;
-        if( vmin > g ) vmin = g;
-        if( vmin > b ) vmin = b;
-
-        diff = v - vmin;
-        s = diff/(float)(fabs(v) + FLT_EPSILON);
-        diff = (float)(60.f/(diff + FLT_EPSILON));
-        if( v == r )
-            h = (g - b)*diff;
-        else if( v == g )
-            h = (b - r)*diff + 120.f;
-        else
-            h = (r - g)*diff + 240.f;
-
-        if( h < 0 ) h += 360.f;
-
-        dst[0] = h*hscale;
-        dst[1] = s;
-        dst[2] = v;
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) // one pixel (one row of column x) per iteration
+        {
+            if (y < rows) // skip rows past the bottom edge
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+                __global const float * src = (__global const float *)(srcptr + src_idx);
+                __global float * dst = (__global float *)(dstptr + dst_idx);
+                float4 src_pix = vload4(0, src); // reads 4 floats starting at src; NOTE(review): if scn is 3 the 4th float lies beyond this pixel - confirm the read is always in bounds
+
+                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP; // B_COMP/G_COMP/R_COMP are defined outside this hunk; presumably they mirror the old src[bidx]/src[1]/src[bidx^2] - confirm
+                float h, s, v;
+
+                float vmin, diff;
+
+                v = vmin = r; // v becomes the largest channel, vmin the smallest
+                if( v < g ) v = g;
+                if( v < b ) v = b;
+                if( vmin > g ) vmin = g;
+                if( vmin > b ) vmin = b;
+
+                diff = v - vmin;
+                s = diff/(float)(fabs(v) + FLT_EPSILON); // FLT_EPSILON keeps the divisor non-zero when v is 0
+                diff = (float)(60.f/(diff + FLT_EPSILON)); // diff is reused: from here on it is the hue scale factor
+                if( v == r )
+                    h = (g - b)*diff;
+                else if( v == g )
+                    h = (b - r)*diff + 120.f;
+                else
+                    h = (r - g)*diff + 240.f;
+
+                if( h < 0 ) h += 360.f; // a negative hue is shifted up by 360
+
+                dst[0] = h*hscale;
+                dst[1] = s;
+                dst[2] = v;
+            }
+            ++y; // advance to the next row even when this one was skipped
+        }
     }
 }
 
@@ -745,54 +893,63 @@ __kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset
                       int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;  // first row of the PIX_PER_WI_Y-row strip handled by this work-item
 
-    if (y < rows && x < cols)
+    if (x < cols)  // the row bound is checked per iteration inside the loop
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
-        __global const float * src = (__global const float *)(srcptr + src_idx);
-        __global float * dst = (__global float *)(dstptr + dst_idx);
-
-        float h = src[0], s = src[1], v = src[2];
-        float b, g, r;
-
-        if (s != 0)
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
         {
-            float tab[4];
-            int sector;
-            h *= hscale;
-            if(h < 0)
-                do h += 6; while (h < 0);
-            else if (h >= 6)
-                do h -= 6; while (h >= 6);
-            sector = convert_int_sat_rtn(h);
-            h -= sector;
-            if ((unsigned)sector >= 6u)
+            if (y < rows)  // skip rows past the bottom of the image
             {
-                sector = 0;
-                h = 0.f;
-            }
-
-            tab[0] = v;
-            tab[1] = v*(1.f - s);
-            tab[2] = v*(1.f - s*h);
-            tab[3] = v*(1.f - s*(1.f - h));
-
-            b = tab[sector_data[sector][0]];
-            g = tab[sector_data[sector][1]];
-            r = tab[sector_data[sector][2]];
-        }
-        else
-            b = g = r = v;
-
-        dst[bidx] = b;
-        dst[1] = g;
-        dst[bidx^2] = r;
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+                __global const float * src = (__global const float *)(srcptr + src_idx);
+                __global float * dst = (__global float *)(dstptr + dst_idx);
+                float4 src_pix = vload4(0, src);  // NOTE(review): always loads 4 floats; only .x/.y/.z are used - confirm in-bounds for 3-channel input
+
+                float h = src_pix.x, s = src_pix.y, v = src_pix.z;
+                float b, g, r;
+
+                if (s != 0)
+                {
+                    float tab[4];
+                    int sector;
+                    h *= hscale;
+                    if(h < 0)
+                        do h += 6; while (h < 0);
+                    else if (h >= 6)
+                        do h -= 6; while (h >= 6);
+                    sector = convert_int_sat_rtn(h);
+                    h -= sector;  // h becomes the offset from the rounded-down sector index
+                    if ((unsigned)sector >= 6u)  // out-of-range sector: fall back to sector 0 with zero offset
+                    {
+                        sector = 0;
+                        h = 0.f;
+                    }
+
+                    tab[0] = v;
+                    tab[1] = v*(1.f - s);
+                    tab[2] = v*(1.f - s*h);
+                    tab[3] = v*(1.f - s*(1.f - h));
+
+                    b = tab[sector_data[sector][0]];
+                    g = tab[sector_data[sector][1]];
+                    r = tab[sector_data[sector][2]];
+                }
+                else
+                    b = g = r = v;  // zero saturation: all three channels equal v
+
+                dst[bidx] = b;
+                dst[1] = g;
+                dst[bidx^2] = r;
 #if dcn == 4
-        dst[3] = MAX_NUM;
+                dst[3] = MAX_NUM;
 #endif
+            }
+            ++y;  // move to the next row of the strip
+        }
     }
 }
 
@@ -807,44 +964,53 @@ __kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
                       int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;  // first row of the PIX_PER_WI_Y-row strip handled by this work-item
 
-    if (y < rows && x < cols)
+    if (x < cols)  // the row bound is checked per iteration inside the loop
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
-        float b = src[src_idx + bidx]*(1/255.f), g = src[src_idx + 1]*(1/255.f), r = src[src_idx + (bidx^2)]*(1/255.f);
-        float h = 0.f, s = 0.f, l;
-        float vmin, vmax, diff;
-
-        vmax = vmin = r;
-        if (vmax < g) vmax = g;
-        if (vmax < b) vmax = b;
-        if (vmin > g) vmin = g;
-        if (vmin > b) vmin = b;
-
-        diff = vmax - vmin;
-        l = (vmax + vmin)*0.5f;
-
-        if (diff > FLT_EPSILON)
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
         {
-            s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
-            diff = 60.f/diff;
-
-            if( vmax == r )
-                h = (g - b)*diff;
-            else if( vmax == g )
-                h = (b - r)*diff + 120.f;
-            else
-                h = (r - g)*diff + 240.f;
-
-            if( h < 0.f ) h += 360.f;
+            if (y < rows)  // skip rows past the bottom of the image
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+                uchar4 src_pix = vload4(0, src + src_idx);  // NOTE(review): always loads 4 bytes - confirm in-bounds for 3-channel input
+
+                float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);  // scale each channel by 1/255
+                float h = 0.f, s = 0.f, l;
+                float vmin, vmax, diff;
+
+                vmax = vmin = r;
+                if (vmax < g) vmax = g;
+                if (vmax < b) vmax = b;
+                if (vmin > g) vmin = g;
+                if (vmin > b) vmin = b;
+
+                diff = vmax - vmin;
+                l = (vmax + vmin)*0.5f;
+
+                if (diff > FLT_EPSILON)  // otherwise h and s keep their 0 initial values
+                {
+                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
+                    diff = 60.f/diff;  // diff is reused as the hue scale factor from here on
+
+                    if( vmax == r )
+                        h = (g - b)*diff;
+                    else if( vmax == g )
+                        h = (b - r)*diff + 120.f;
+                    else
+                        h = (r - g)*diff + 240.f;
+
+                    if( h < 0.f ) h += 360.f;
+                }
+
+                dst[dst_idx] = convert_uchar_sat_rte(h*hscale);
+                dst[dst_idx + 1] = convert_uchar_sat_rte(l*255.f);
+                dst[dst_idx + 2] = convert_uchar_sat_rte(s*255.f);
+            }
+            ++y;  // move to the next row of the strip
         }
-
-        dst[dst_idx] = convert_uchar_sat_rte(h*hscale);
-        dst[dst_idx + 1] = convert_uchar_sat_rte(l*255.f);
-        dst[dst_idx + 2] = convert_uchar_sat_rte(s*255.f);
     }
 }
 
@@ -853,50 +1019,59 @@ __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
                       int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;  // first row of the PIX_PER_WI_Y-row strip handled by this work-item
 
-    if (y < rows && x < cols)
+    if (x < cols)  // the row bound is checked per iteration inside the loop
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
-        float h = src[src_idx], l = src[src_idx + 1]*(1.f/255.f), s = src[src_idx + 2]*(1.f/255.f);
-        float b, g, r;
-
-        if (s != 0)
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
         {
-            float tab[4];
-
-            float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
-            float p1 = 2*l - p2;
-
-            h *= hscale;
-            if( h < 0 )
-                do h += 6; while( h < 0 );
-            else if( h >= 6 )
-                do h -= 6; while( h >= 6 );
-
-            int sector = convert_int_sat_rtn(h);
-            h -= sector;
-
-            tab[0] = p2;
-            tab[1] = p1;
-            tab[2] = p1 + (p2 - p1)*(1-h);
-            tab[3] = p1 + (p2 - p1)*h;
-
-            b = tab[sector_data[sector][0]];
-            g = tab[sector_data[sector][1]];
-            r = tab[sector_data[sector][2]];
-        }
-        else
-            b = g = r = l;
-
-        dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
-        dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
-        dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
+            if (y < rows)  // skip rows past the bottom of the image
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+                uchar4 src_pix = vload4(0, src + src_idx);  // NOTE(review): always loads 4 bytes - confirm in-bounds for 3-channel input
+
+                float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);  // h stays unscaled here; l and s are scaled by 1/255
+                float b, g, r;
+
+                if (s != 0)
+                {
+                    float tab[4];
+
+                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
+                    float p1 = 2*l - p2;
+
+                    h *= hscale;
+                    if( h < 0 )
+                        do h += 6; while( h < 0 );
+                    else if( h >= 6 )
+                        do h -= 6; while( h >= 6 );
+
+                    int sector = convert_int_sat_rtn(h);  // NOTE(review): no range clamp on sector before it indexes sector_data - confirm 0 <= h < 6 always holds
+                    h -= sector;
+
+                    tab[0] = p2;
+                    tab[1] = p1;
+                    tab[2] = p1 + (p2 - p1)*(1-h);
+                    tab[3] = p1 + (p2 - p1)*h;
+
+                    b = tab[sector_data[sector][0]];
+                    g = tab[sector_data[sector][1]];
+                    r = tab[sector_data[sector][2]];
+                }
+                else
+                    b = g = r = l;  // zero saturation: all three channels equal l
+
+                dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
+                dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
+                dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
 #if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
+                dst[dst_idx + 3] = MAX_NUM;
 #endif
+            }
+            ++y;  // move to the next row of the strip
+        }
     }
 }
 
@@ -907,47 +1082,56 @@ __kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset
                       int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;  // first row of the PIX_PER_WI_Y-row strip handled by this work-item
 
-    if (y < rows && x < cols)
+    if (x < cols)  // the row bound is checked per iteration inside the loop
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
-        __global const float * src = (__global const float *)(srcptr + src_idx);
-        __global float * dst = (__global float *)(dstptr + dst_idx);
-
-        float b = src[bidx], g = src[1], r = src[bidx^2];
-        float h = 0.f, s = 0.f, l;
-        float vmin, vmax, diff;
-
-        vmax = vmin = r;
-        if (vmax < g) vmax = g;
-        if (vmax < b) vmax = b;
-        if (vmin > g) vmin = g;
-        if (vmin > b) vmin = b;
-
-        diff = vmax - vmin;
-        l = (vmax + vmin)*0.5f;
-
-        if (diff > FLT_EPSILON)
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
         {
-            s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
-            diff = 60.f/diff;
-
-            if( vmax == r )
-                h = (g - b)*diff;
-            else if( vmax == g )
-                h = (b - r)*diff + 120.f;
-            else
-                h = (r - g)*diff + 240.f;
-
-            if( h < 0.f ) h += 360.f;
+            if (y < rows)  // skip rows past the bottom of the image
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+                __global const float * src = (__global const float *)(srcptr + src_idx);
+                __global float * dst = (__global float *)(dstptr + dst_idx);
+                float4 src_pix = vload4(0, src);  // NOTE(review): always loads 4 floats - confirm in-bounds for 3-channel input
+
+                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
+                float h = 0.f, s = 0.f, l;
+                float vmin, vmax, diff;
+
+                vmax = vmin = r;
+                if (vmax < g) vmax = g;
+                if (vmax < b) vmax = b;
+                if (vmin > g) vmin = g;
+                if (vmin > b) vmin = b;
+
+                diff = vmax - vmin;
+                l = (vmax + vmin)*0.5f;
+
+                if (diff > FLT_EPSILON)  // otherwise h and s keep their 0 initial values
+                {
+                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
+                    diff = 60.f/diff;  // diff is reused as the hue scale factor from here on
+
+                    if( vmax == r )
+                        h = (g - b)*diff;
+                    else if( vmax == g )
+                        h = (b - r)*diff + 120.f;
+                    else
+                        h = (r - g)*diff + 240.f;
+
+                    if( h < 0.f ) h += 360.f;
+                }
+
+                dst[0] = h*hscale;
+                dst[1] = l;
+                dst[2] = s;
+            }
+            ++y;  // move to the next row of the strip
         }
-
-        dst[0] = h*hscale;
-        dst[1] = l;
-        dst[2] = s;
     }
 }
 
@@ -956,54 +1140,63 @@ __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset
                       int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;  // first row of the PIX_PER_WI_Y-row strip handled by this work-item
 
-    if (y < rows && x < cols)
+    if (x < cols)  // the row bound is checked per iteration inside the loop
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
-        __global const float * src = (__global const float *)(srcptr + src_idx);
-        __global float * dst = (__global float *)(dstptr + dst_idx);
-
-        float h = src[0], l = src[1], s = src[2];
-        float b, g, r;
-
-        if (s != 0)
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
         {
-            float tab[4];
-            int sector;
-
-            float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
-            float p1 = 2*l - p2;
-
-            h *= hscale;
-            if( h < 0 )
-                do h += 6; while( h < 0 );
-            else if( h >= 6 )
-                do h -= 6; while( h >= 6 );
-
-            sector = convert_int_sat_rtn(h);
-            h -= sector;
-
-            tab[0] = p2;
-            tab[1] = p1;
-            tab[2] = p1 + (p2 - p1)*(1-h);
-            tab[3] = p1 + (p2 - p1)*h;
-
-            b = tab[sector_data[sector][0]];
-            g = tab[sector_data[sector][1]];
-            r = tab[sector_data[sector][2]];
-        }
-        else
-            b = g = r = l;
-
-        dst[bidx] = b;
-        dst[1] = g;
-        dst[bidx^2] = r;
+            if (y < rows)  // skip rows past the bottom of the image
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+                __global const float * src = (__global const float *)(srcptr + src_idx);
+                __global float * dst = (__global float *)(dstptr + dst_idx);
+                float4 src_pix = vload4(0, src);  // NOTE(review): always loads 4 floats; only .x/.y/.z are used - confirm in-bounds for 3-channel input
+
+                float h = src_pix.x, l = src_pix.y, s = src_pix.z;
+                float b, g, r;
+
+                if (s != 0)
+                {
+                    float tab[4];
+                    int sector;
+
+                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
+                    float p1 = 2*l - p2;
+
+                    h *= hscale;
+                    if( h < 0 )
+                        do h += 6; while( h < 0 );
+                    else if( h >= 6 )
+                        do h -= 6; while( h >= 6 );
+
+                    sector = convert_int_sat_rtn(h);  // NOTE(review): no range clamp on sector before it indexes sector_data - confirm 0 <= h < 6 always holds
+                    h -= sector;
+
+                    tab[0] = p2;
+                    tab[1] = p1;
+                    tab[2] = p1 + (p2 - p1)*(1-h);
+                    tab[3] = p1 + (p2 - p1)*h;
+
+                    b = tab[sector_data[sector][0]];
+                    g = tab[sector_data[sector][1]];
+                    r = tab[sector_data[sector][2]];
+                }
+                else
+                    b = g = r = l;  // zero saturation: all three channels equal l
+
+                dst[bidx] = b;
+                dst[1] = g;
+                dst[bidx^2] = r;
 #if dcn == 4
-        dst[3] = MAX_NUM;
+                dst[3] = MAX_NUM;
 #endif
+            }
+            ++y;  // move to the next row of the strip
+        }
     }
 }
 
@@ -1018,21 +1211,29 @@ __kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset
                          int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;  // first row of the PIX_PER_WI_Y-row strip handled by this work-item
 
-    if (y < rows && x < cols)
+    if (x < cols)  // the row bound is checked per iteration inside the loop
     {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)  // skip rows past the bottom of the image
+            {
+                int src_idx = mad24(y, src_step, src_offset + (x << 2));  // x << 2: byte offset of a 4-byte pixel; x itself is no longer modified
+                int dst_idx = mad24(y, dst_step, dst_offset + (x << 2));
+                uchar4 src_pix = vload4(0, src + src_idx);  // one vector load replaces four scalar reads
 
-        uchar v0 = src[src_idx], v1 = src[src_idx + 1];
-        uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3];
+                uchar v0 = src_pix.x, v1 = src_pix.y;
+                uchar v2 = src_pix.z, v3 = src_pix.w;
 
-        dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM;
-        dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM;
-        dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM;
-        dst[dst_idx + 3] = v3;
+                dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM;  // multiply by the fourth component; HALF_MAX presumably rounds the division - TODO confirm
+                dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM;
+                dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM;
+                dst[dst_idx + 3] = v3;  // fourth component is copied through unchanged
+            }
+            ++y;  // move to the next row of the strip
+        }
     }
 }
 
@@ -1041,22 +1242,30 @@ __kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset
                          int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;  // first row of the PIX_PER_WI_Y-row strip handled by this work-item
 
-    if (y < rows && x < cols)
+    if (x < cols)  // the row bound is checked per iteration inside the loop
     {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        uchar v0 = src[src_idx], v1 = src[src_idx + 1];
-        uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3];
-        uchar v3_half = v3 / 2;
-
-        dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3;
-        dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3;
-        dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3;
-        dst[dst_idx + 3] = v3;
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)  // skip rows past the bottom of the image
+            {
+                int src_idx = mad24(y, src_step, src_offset + (x << 2));  // x << 2: byte offset of a 4-byte pixel; x itself is no longer modified
+                int dst_idx = mad24(y, dst_step, dst_offset + (x << 2));
+                uchar4 src_pix = vload4(0, src + src_idx);  // one vector load replaces four scalar reads
+
+                uchar v0 = src_pix.x, v1 = src_pix.y;
+                uchar v2 = src_pix.z, v3 = src_pix.w;
+                uchar v3_half = v3 / 2;  // added to the numerator before dividing by v3
+
+                dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3;  // v3 == 0 yields 0 instead of dividing by zero
+                dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3;
+                dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3;
+                dst[dst_idx + 3] = v3;  // fourth component is copied through unchanged
+            }
+            ++y;  // move to the next row of the strip
+        }
     }
 }
 
@@ -1086,32 +1295,41 @@ __kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
                       __constant int * coeffs, int Lscale, int Lshift)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;  // first row of the PIX_PER_WI_Y-row strip handled by this work-item
 
-    if (y < rows && x < cols)
+    if (x < cols)  // the row bound is checked per iteration inside the loop
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)  // skip rows past the bottom of the image
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
 
-        src += src_idx;
-        dst += dst_idx;
+                __global const uchar* src_ptr = src + src_idx;  // per-row pointer; src itself must stay unmodified across loop iterations
+                __global uchar* dst_ptr = dst + dst_idx;  // per-row pointer; dst itself must stay unmodified across loop iterations
+                uchar4 src_pix = vload4(0, src_ptr);  // NOTE(review): always loads 4 bytes; only .x/.y/.z are used - confirm in-bounds for 3-channel input
 
-        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
-            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
-            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+                int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+                    C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+                    C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
 
-        int R = gammaTab[src[0]], G = gammaTab[src[1]], B = gammaTab[src[2]];
-        int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
-        int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
-        int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
+                int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];  // table lookup per channel
+                int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
+                int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
+                int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
 
-        int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
-        int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
-        int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
+                int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
+                int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
+                int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
 
-        dst[0] = SAT_CAST(L);
-        dst[1] = SAT_CAST(a);
-        dst[2] = SAT_CAST(b);
+                dst_ptr[0] = SAT_CAST(L);
+                dst_ptr[1] = SAT_CAST(a);
+                dst_ptr[2] = SAT_CAST(b);
+            }
+            ++y;  // move to the next row of the strip
+        }
     }
 }
 
@@ -1125,45 +1343,54 @@ __kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offse
                       __constant float * coeffs, float _1_3, float _a)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;  // first row of the PIX_PER_WI_Y-row strip handled by this work-item
 
-    if (y < rows && x < cols)
+    if (x < cols)  // the row bound is checked per iteration inside the loop
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)  // skip rows past the bottom of the image
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
 
-        __global const float * src = (__global const float *)(srcptr + src_idx);
-        __global float * dst = (__global float *)(dstptr + dst_idx);
+                __global const float * src = (__global const float *)(srcptr + src_idx);
+                __global float * dst = (__global float *)(dstptr + dst_idx);
+                float4 src_pix = vload4(0, src);  // NOTE(review): always loads 4 floats; only .x/.y/.z are used - confirm in-bounds for 3-channel input
 
-        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
-              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
-              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+                float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+                      C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+                      C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
 
-        float R = clamp(src[0], 0.0f, 1.0f);
-        float G = clamp(src[1], 0.0f, 1.0f);
-        float B = clamp(src[2], 0.0f, 1.0f);
+                float R = clamp(src_pix.x, 0.0f, 1.0f);  // inputs are clamped to [0, 1] before conversion
+                float G = clamp(src_pix.y, 0.0f, 1.0f);
+                float B = clamp(src_pix.z, 0.0f, 1.0f);
 
 #ifdef SRGB
-        R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
-        G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
-        B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
 #endif
 
-        float X = R*C0 + G*C1 + B*C2;
-        float Y = R*C3 + G*C4 + B*C5;
-        float Z = R*C6 + G*C7 + B*C8;
+                float X = R*C0 + G*C1 + B*C2;  // 3x3 matrix product with coeffs, one row per output
+                float Y = R*C3 + G*C4 + B*C5;
+                float Z = R*C6 + G*C7 + B*C8;
 
-        float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a);
-        float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a);
-        float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a);
+                float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a);  // power branch above the 0.008856 threshold, linear branch below
+                float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a);
+                float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a);
 
-        float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
-        float a = 500.f * (FX - FY);
-        float b = 200.f * (FY - FZ);
+                float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
+                float a = 500.f * (FX - FY);
+                float b = 200.f * (FY - FZ);
 
-        dst[0] = L;
-        dst[1] = a;
-        dst[2] = b;
+                dst[0] = L;
+                dst[1] = a;
+                dst[2] = b;
+            }
+            ++y;  // move to the next row of the strip
+        }
     }
 }
 
@@ -1225,33 +1452,42 @@ __kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
                       __constant float * coeffs, float lThresh, float fThresh)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;  // first row of the PIX_PER_WI_Y-row strip handled by this work-item
 
-    if (y < rows && x < cols)
+    if (x < cols)  // the row bound is checked per iteration inside the loop
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)  // skip rows past the bottom of the image
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
 
-        src += src_idx;
-        dst += dst_idx;
+                __global const uchar* src_ptr = src + src_idx;  // per-row pointer; src itself must stay unmodified across loop iterations
+                __global uchar* dst_ptr = dst + dst_idx;  // per-row pointer; dst itself must stay unmodified across loop iterations
+                uchar4 src_pix = vload4(0, src_ptr);  // NOTE(review): always loads 4 bytes; only .x/.y/.z are used - confirm in-bounds for 3-channel input
 
-        float srcbuf[3], dstbuf[3];
-        srcbuf[0] = src[0]*(100.f/255.f);
-        srcbuf[1] = convert_float(src[1] - 128);
-        srcbuf[2] = convert_float(src[2] - 128);
+                float srcbuf[3], dstbuf[3];
+                srcbuf[0] = src_pix.x*(100.f/255.f);  // rescale the first channel by 100/255
+                srcbuf[1] = convert_float(src_pix.y - 128);  // remove the 128 offset from the second channel
+                srcbuf[2] = convert_float(src_pix.z - 128);  // remove the 128 offset from the third channel
 
-        Lab2BGR_f(&srcbuf[0], &dstbuf[0],
+                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
 #ifdef SRGB
-            gammaTab,
+                    gammaTab,
 #endif
-            coeffs, lThresh, fThresh);
+                    coeffs, lThresh, fThresh);
 
-        dst[0] = SAT_CAST(dstbuf[0] * 255.0f);
-        dst[1] = SAT_CAST(dstbuf[1] * 255.0f);
-        dst[2] = SAT_CAST(dstbuf[2] * 255.0f);
+                dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);  // scale the float result by 255 before the saturating cast
+                dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
+                dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
 #if dcn == 4
-        dst[3] = MAX_NUM;
+                dst_ptr[3] = MAX_NUM;
 #endif
+            }
+            ++y;  // move to the next row of the strip
+        }
     }
 }
 
@@ -1265,29 +1501,38 @@ __kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offse
                       __constant float * coeffs, float lThresh, float fThresh)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * PIX_PER_WI_Y;  // first row of the PIX_PER_WI_Y-row strip handled by this work-item
 
-    if (y < rows && x < cols)
+    if (x < cols)  // the row bound is checked per iteration inside the loop
     {
-        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
-        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)  // skip rows past the bottom of the image
+            {
+                int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+                int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
 
-        __global const float * src = (__global const float *)(srcptr + src_idx);
-        __global float * dst = (__global float *)(dstptr + dst_idx);
+                __global const float * src = (__global const float *)(srcptr + src_idx);
+                __global float * dst = (__global float *)(dstptr + dst_idx);
+                float4 src_pix = vload4(0, src);  // NOTE(review): always loads 4 floats; only .x/.y/.z are used - confirm in-bounds for 3-channel input
 
-        float srcbuf[3], dstbuf[3];
-        srcbuf[0] = src[0], srcbuf[1] = src[1], srcbuf[2] = src[2];
+                float srcbuf[3], dstbuf[3];
+                srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;  // copy into a private array so Lab2BGR_f can take pointers
 
-        Lab2BGR_f(&srcbuf[0], &dstbuf[0],
+                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
 #ifdef SRGB
-            gammaTab,
+                    gammaTab,
 #endif
-            coeffs, lThresh, fThresh);
+                    coeffs, lThresh, fThresh);
 
-        dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
+                dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
 #if dcn == 4
-        dst[3] = MAX_NUM;
+                dst[3] = MAX_NUM;
 #endif
+            }
+            ++y;  // move to the next row of the strip
+        }
     }
 }
 
diff --git a/modules/imgproc/src/samplers.cpp b/modules/imgproc/src/samplers.cpp
index d6cc8a56fe..197a7ced65 100644
--- a/modules/imgproc/src/samplers.cpp
+++ b/modules/imgproc/src/samplers.cpp
@@ -172,7 +172,7 @@ void getRectSubPix_Cn_(const _Tp* src, size_t src_step, Size src_size,
                 dst[j+1] = cast_op(s1);
             }
 
-            for( j = 0; j < win_size.width; j++ )
+            for( ; j < win_size.width; j++ )  // continue from the current j instead of restarting at 0
             {
                 _WTp s0 = src[j]*a11 + src[j+cn]*a12 + src[j+src_step]*a21 + src[j+src_step+cn]*a22;
                 dst[j] = cast_op(s0);
diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp
old mode 100644
new mode 100755
index c32813fabd..6802916bab
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -219,6 +219,8 @@ static void integral_##suffix( T* src, size_t srcstep, ST* sum, size_t sumstep,
 DEF_INTEGRAL_FUNC(8u32s, uchar, int, double)
 DEF_INTEGRAL_FUNC(8u32f64f, uchar, float, double)
 DEF_INTEGRAL_FUNC(8u64f64f, uchar, double, double)
+DEF_INTEGRAL_FUNC(16u64f64f, ushort, double, double)  // defines integral_16u64f64f, used by cv::integral for CV_16U input
+DEF_INTEGRAL_FUNC(16s64f64f, short, double, double)  // defines integral_16s64f64f, used by cv::integral for CV_16S input
 DEF_INTEGRAL_FUNC(32f32f64f, float, float, double)
 DEF_INTEGRAL_FUNC(32f64f64f, float, double, double)
 DEF_INTEGRAL_FUNC(64f64f64f, double, double, double)
@@ -411,6 +413,10 @@ void cv::integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, Output
         func = (IntegralFunc)integral_8u32f32f;
     else if( depth == CV_8U && sdepth == CV_64F && sqdepth == CV_64F )
         func = (IntegralFunc)integral_8u64f64f;
+    else if( depth == CV_16U && sdepth == CV_64F && sqdepth == CV_64F )  // CV_16U source, CV_64F sum and squared sum
+        func = (IntegralFunc)integral_16u64f64f;
+    else if( depth == CV_16S && sdepth == CV_64F && sqdepth == CV_64F )  // CV_16S source, CV_64F sum and squared sum
+        func = (IntegralFunc)integral_16s64f64f;
     else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_64F )
         func = (IntegralFunc)integral_32f32f64f;
     else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_32F )
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index ca132dd0f8..d3fb92ebb3 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -341,10 +341,93 @@ static bool ocl_matchTemplate( InputArray _img, InputArray _templ, OutputArray _
 
 #endif
 
+#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
+// Common pointer type through which the IPP CrossCorrNorm / SqrDistanceNorm C1R primitives are invoked.
+typedef IppStatus (CV_STDCALL * ippimatchTemplate)(const void*, int, IppiSize, const void*, int, IppiSize, Ipp32f* , int , IppEnum , Ipp8u*);
+
+// IPP fast path for un-normalized cross-correlation; returns false when the caller must fall back.
+static bool ipp_crossCorr(const Mat& src, const Mat& tpl, Mat& dst)
+{
+    // Only single-channel input is handled, and the template must share the image type.
+    if (src.channels() != 1 || tpl.type() != src.type())
+        return false;
+    // dst.data is handed to IPP as a 32f plane; with ippiROIValid the output is expected to
+    // span (src - tpl + 1) per dimension, so dst must already be allocated with that layout.
+    if (tpl.cols > src.cols || tpl.rows > src.rows || dst.data == 0 || dst.type() != CV_32FC1 ||
+        dst.cols != src.cols - tpl.cols + 1 || dst.rows != src.rows - tpl.rows + 1)
+        return false;
+
+    IppiSize srcRoiSize = {src.cols,src.rows};
+    IppiSize tplRoiSize = {tpl.cols,tpl.rows};
+
+    int depth = src.depth();
+    ippimatchTemplate ippFunc =
+            depth==CV_8U ? (ippimatchTemplate)ippiCrossCorrNorm_8u32f_C1R:
+            depth==CV_32F? (ippimatchTemplate)ippiCrossCorrNorm_32f_C1R: 0;
+    if (ippFunc==0)
+        return false;
+
+    IppEnum funCfg = (IppEnum)(ippAlgAuto | ippiNormNone | ippiROIValid);
+    int bufSize = 0;
+    IppStatus status = ippiCrossCorrNormGetBufferSize(srcRoiSize, tplRoiSize, funCfg, &bufSize);
+    if ( status < 0 )
+        return false;
+
+    Ipp8u* pBuffer = ippsMalloc_8u( bufSize );
+    if (pBuffer == 0)  // no work buffer: let the generic implementation run instead
+        return false;
+    status = ippFunc(src.data, (int)src.step, srcRoiSize, tpl.data, (int)tpl.step, tplRoiSize, (Ipp32f*)dst.data, (int)dst.step, funCfg, pBuffer);
+    ippsFree( pBuffer );
+    return status >= 0;
+}
+
+// IPP fast path for the un-normalized squared-difference map; returns false when the caller must fall back.
+static bool ipp_sqrDistance(const Mat& src, const Mat& tpl, Mat& dst)
+{
+    // Only single-channel input is handled, and the template must share the image type.
+    if (src.channels() != 1 || tpl.type() != src.type())
+        return false;
+    // dst.data is handed to IPP as a 32f plane; with ippiROIValid the output is expected to
+    // span (src - tpl + 1) per dimension, so dst must already be allocated with that layout.
+    if (tpl.cols > src.cols || tpl.rows > src.rows || dst.data == 0 || dst.type() != CV_32FC1 ||
+        dst.cols != src.cols - tpl.cols + 1 || dst.rows != src.rows - tpl.rows + 1)
+        return false;
+
+    IppiSize srcRoiSize = {src.cols,src.rows};
+    IppiSize tplRoiSize = {tpl.cols,tpl.rows};
+
+    int depth = src.depth();
+    ippimatchTemplate ippFunc =
+            depth==CV_8U ? (ippimatchTemplate)ippiSqrDistanceNorm_8u32f_C1R:
+            depth==CV_32F? (ippimatchTemplate)ippiSqrDistanceNorm_32f_C1R: 0;
+    if (ippFunc==0)
+        return false;
+
+    IppEnum funCfg = (IppEnum)(ippAlgAuto | ippiNormNone | ippiROIValid);
+    int bufSize = 0;
+    IppStatus status = ippiSqrDistanceNormGetBufferSize(srcRoiSize, tplRoiSize, funCfg, &bufSize);
+    if ( status < 0 )
+        return false;
+
+    Ipp8u* pBuffer = ippsMalloc_8u( bufSize );
+    if (pBuffer == 0)  // no work buffer: let the generic implementation run instead
+        return false;
+    status = ippFunc(src.data, (int)src.step, srcRoiSize, tpl.data, (int)tpl.step, tplRoiSize, (Ipp32f*)dst.data, (int)dst.step, funCfg, pBuffer);
+    ippsFree( pBuffer );
+    return status >= 0;
+}
+
+#endif
+
 void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                 Size corrsize, int ctype,
                 Point anchor, double delta, int borderType )
 {
+#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
+    if (ipp_crossCorr(img, _templ, corr))
+        return;
+#endif
+
     const double blockScale = 4.5;
     const int minBlockSize = 256;
     std::vector<uchar> buf;
@@ -560,6 +643,11 @@ void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result,
         return;
 #endif
 
+#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
+    if (method == CV_TM_SQDIFF && ipp_sqrDistance(img, templ, result))
+        return;
+#endif
+
     int cn = img.channels();
     crossCorr( img, templ, result, result.size(), result.type(), Point(0,0), 0, 0);
 
diff --git a/modules/nonfree/src/sift.cpp b/modules/nonfree/src/sift.cpp
index 259e934ede..2112971e94 100644
--- a/modules/nonfree/src/sift.cpp
+++ b/modules/nonfree/src/sift.cpp
@@ -111,21 +111,6 @@ namespace cv
 
 /******************************* Defs and macros *****************************/
 
-// default number of sampled intervals per octave
-static const int SIFT_INTVLS = 3;
-
-// default sigma for initial gaussian smoothing
-static const float SIFT_SIGMA = 1.6f;
-
-// default threshold on keypoint contrast |D(x)|
-static const float SIFT_CONTR_THR = 0.04f;
-
-// default threshold on keypoint ratio of principle curvatures
-static const float SIFT_CURV_THR = 10.f;
-
-// double image size before pyramid construction?
-static const bool SIFT_IMG_DBL = true;
-
 // default width of descriptor histogram array
 static const int SIFT_DESCR_WIDTH = 4;
 
diff --git a/modules/video/src/simpleflow.cpp b/modules/video/src/simpleflow.cpp
index 66f4c41bdc..20fc6b5431 100644
--- a/modules/video/src/simpleflow.cpp
+++ b/modules/video/src/simpleflow.cpp
@@ -66,21 +66,6 @@ inline static float dist(const Vec2f& p1, const Vec2f& p2) {
          (p1[1] - p2[1]) * (p1[1] - p2[1]);
 }
 
-inline static float dist(const Point2f& p1, const Point2f& p2) {
-  return (p1.x - p2.x) * (p1.x - p2.x) +
-         (p1.y - p2.y) * (p1.y - p2.y);
-}
-
-inline static float dist(float x1, float y1, float x2, float y2) {
-  return (x1 - x2) * (x1 - x2) +
-         (y1 - y2) * (y1 - y2);
-}
-
-inline static int dist(int x1, int y1, int x2, int y2) {
-  return (x1 - x2) * (x1 - x2) +
-         (y1 - y2) * (y1 - y2);
-}
-
 template<class T>
 inline static T min(T t1, T t2, T t3) {
   return (t1 <= t2 && t1 <= t3) ? t1 : min(t2, t3);
diff --git a/samples/cpp/CMakeLists.txt b/samples/cpp/CMakeLists.txt
index 6ef95a8d39..5b92b5e502 100644
--- a/samples/cpp/CMakeLists.txt
+++ b/samples/cpp/CMakeLists.txt
@@ -93,6 +93,10 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
 
   ocv_list_filterout(cpp_samples "viz")
 
+  if(NOT HAVE_IPP_A)
+    ocv_list_filterout(cpp_samples "/ippasync/")
+  endif()
+
   foreach(sample_filename ${cpp_samples})
     get_filename_component(sample ${sample_filename} NAME_WE)
     OPENCV_DEFINE_CPP_EXAMPLE(${sample}  ${sample_filename})