diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 29501a0715..7ac3672cf7 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -1491,6 +1491,9 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
         return false;
 
+    if( (oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE) && (depth1 >= CV_32F || depth2 >= CV_32F || ddepth >= CV_32F) )
+        return false;
+
     int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
     int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;
 
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 21d5bdaca7..d6abaa4adb 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1541,7 +1541,7 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
         kercn = ocl::predictOptimalVectorWidth(_src, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
     bool doubleSupport = d.doubleFPConfig() > 0;
 
-    if (!doubleSupport && depth == CV_64F)
+    if (depth == CV_32F || depth == CV_64F)
         return false;
 
     char cvt[2][50];
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 6900b51803..8bd2f457d9 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -432,7 +432,7 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
 
         IppStatus status = (IppStatus)-1;
         IppiSize roisize = { cols, rows };
-        int mstep = (int)mask.step, dstep = (int)step;
+        int mstep = (int)mask.step[0], dstep = (int)step[0];
 
         if (isContinuous() && mask.isContinuous())
         {
@@ -616,7 +616,7 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
 {
     CV_Assert(flipCode >= -1 && flipCode <= 1);
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
-            flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);;
+            flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);
 
     if (cn > 4)
         return false;
@@ -631,7 +631,7 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
 
     ocl::Device dev = ocl::Device::getDefault();
     int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1;
-    kercn = std::max(kercn, cn);
+    kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn;
 
     ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
         format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
@@ -762,7 +762,7 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
         flipHoriz( dst.data, dst.step, dst.data, dst.step, dst.size(), esz );
 }
 
-#ifdef HAVE_OPENCL
+/*#ifdef HAVE_OPENCL
 
 static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
 {
@@ -790,7 +790,7 @@ static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
     return k.run(2, globalsize, NULL, false);
 }
 
-#endif
+#endif*/
 
 void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
 {
@@ -800,8 +800,8 @@ void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
     Size ssize = _src.size();
     _dst.create(ssize.height*ny, ssize.width*nx, _src.type());
 
-    CV_OCL_RUN(_dst.isUMat(),
-               ocl_repeat(_src, ny, nx, _dst))
+    /*CV_OCL_RUN(_dst.isUMat(),
+               ocl_repeat(_src, ny, nx, _dst))*/
 
     Mat src = _src.getMat(), dst = _dst.getMat();
     Size dsize = dst.size();
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index ba6df7261a..398abcaaa6 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -3336,7 +3336,7 @@ static inline void reduceSumC_8u16u16s32f_64f(const cv::Mat& srcmat, cv::Mat& ds
             stype == CV_32FC3 ? (ippiSumHint)ippiSum_32f_C3R :
             stype == CV_32FC4 ? (ippiSumHint)ippiSum_32f_C4R : 0;
         func =
-        sdepth == CV_8U ? (cv::ReduceFunc)cv::reduceC_<uchar, double,   cv::OpAdd<double> > :
+            sdepth == CV_8U ? (cv::ReduceFunc)cv::reduceC_<uchar, double,   cv::OpAdd<double> > :
             sdepth == CV_16U ? (cv::ReduceFunc)cv::reduceC_<ushort, double,   cv::OpAdd<double> > :
             sdepth == CV_16S ? (cv::ReduceFunc)cv::reduceC_<short, double,   cv::OpAdd<double> > :
             sdepth == CV_32F ? (cv::ReduceFunc)cv::reduceC_<float, double,   cv::OpAdd<double> > : 0;
@@ -3459,6 +3459,9 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
     if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
         return false;
 
+    if ((op == CV_REDUCE_SUM && sdepth == CV_32F) || op == CV_REDUCE_MIN || op == CV_REDUCE_MAX)
+        return false;
+
     if (op == CV_REDUCE_AVG)
     {
         if (sdepth < CV_32S && ddepth < CV_32S)
diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl
index 664673e5a2..1d84567ef9 100644
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -209,7 +209,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 
 #if kercn == 1
 #ifdef NEED_MINVAL
-#if NEED_MINLOC
+#ifdef NEED_MINLOC
             if (minval > temp)
             {
                 minval = temp;
@@ -326,7 +326,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
             int lid2 = lsize + lid;
 
 #ifdef NEED_MINVAL
-#ifdef NEED_MAXLOC
+#ifdef NEED_MINLOC
             if (localmem_min[lid] >= localmem_min[lid2])
             {
                 if (localmem_min[lid] == localmem_min[lid2])
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 888fd7cacc..a0fde76509 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -2283,7 +2283,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
 
                 setIppErrorStatus();
             }
-            typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
+            /*typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
             ippiMaskNormFuncC3 ippFuncC3 =
                 normType == NORM_INF ?
                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
@@ -2318,7 +2318,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                 }
                 setIppErrorStatus();
-            }
+            }*/
         }
         else
         {
@@ -2724,7 +2724,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
                 0) :
                 normType == NORM_L1 ?
                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
-                type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
+                //type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
                 0) :
@@ -2741,7 +2741,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                 setIppErrorStatus();
             }
-            typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
+            /*typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
             ippiMaskNormDiffFuncC3 ippFuncC3 =
                 normType == NORM_INF ?
                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
@@ -2776,7 +2776,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                 }
                 setIppErrorStatus();
-            }
+            }*/
         }
         else
         {
diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp
index a7a09cabb7..4940d80f7c 100644
--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -829,7 +829,7 @@ OCL_TEST_P(Pow, Mat)
 {
     static const double pows[] = { -4, -1, -2.5, 0, 1, 2, 3.7, 4 };
 
-    for (int j = 0; j < test_loop_times; j++)
+    for (int j = 0; j < 1/*test_loop_times*/; j++)
         for (int k = 0, size = sizeof(pows) / sizeof(double); k < size; ++k)
         {
             SCOPED_TRACE(pows[k]);
@@ -1203,7 +1203,7 @@ OCL_TEST_P(MinMaxIdx_Mask, Mat)
 
 static bool relativeError(double actual, double expected, double eps)
 {
-    return std::abs(actual - expected) / actual < eps;
+    return std::abs(actual - expected) < eps*(1 + std::abs(actual));
 }
 
 typedef ArithmTestBase Norm;
@@ -1230,7 +1230,7 @@ OCL_TEST_P(Norm, NORM_INF_1arg_mask)
         OCL_OFF(const double cpuRes = cv::norm(src1_roi, NORM_INF, mask_roi));
         OCL_ON(const double gpuRes = cv::norm(usrc1_roi, NORM_INF, umask_roi));
 
-        EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+        EXPECT_NEAR(cpuRes, gpuRes, 0.2);
     }
 }
 
@@ -1302,7 +1302,7 @@ OCL_TEST_P(Norm, NORM_INF_2args)
             OCL_OFF(const double cpuRes = cv::norm(src1_roi, src2_roi, type));
             OCL_ON(const double gpuRes = cv::norm(usrc1_roi, usrc2_roi, type));
 
-            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+            EXPECT_NEAR(cpuRes, gpuRes, 0.2);
         }
 }
 
diff --git a/modules/cudabgsegm/CMakeLists.txt b/modules/cudabgsegm/CMakeLists.txt
index 3a882824b1..41517b6c69 100644
--- a/modules/cudabgsegm/CMakeLists.txt
+++ b/modules/cudabgsegm/CMakeLists.txt
@@ -6,4 +6,4 @@ set(the_description "CUDA-accelerated Background Segmentation")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
 
-ocv_define_module(cudabgsegm opencv_video OPTIONAL opencv_legacy opencv_imgproc opencv_cudaarithm opencv_cudafilters opencv_cudaimgproc)
+ocv_define_module(cudabgsegm opencv_video OPTIONAL opencv_imgproc opencv_cudaarithm opencv_cudafilters opencv_cudaimgproc)
diff --git a/modules/cudabgsegm/perf/perf_bgsegm.cpp b/modules/cudabgsegm/perf/perf_bgsegm.cpp
index 02fc9a8ee9..6e1ab46788 100644
--- a/modules/cudabgsegm/perf/perf_bgsegm.cpp
+++ b/modules/cudabgsegm/perf/perf_bgsegm.cpp
@@ -42,10 +42,6 @@
 
 #include "perf_precomp.hpp"
 
-#ifdef HAVE_OPENCV_CUDALEGACY
-#  include "opencv2/cudalegacy.hpp"
-#endif
-
 #ifdef HAVE_OPENCV_CUDAIMGPROC
 #  include "opencv2/cudaimgproc.hpp"
 #endif
@@ -72,18 +68,6 @@ using namespace perf;
 
 #if BUILD_WITH_VIDEO_INPUT_SUPPORT
 
-#ifdef HAVE_OPENCV_CUDALEGACY
-
-namespace cv
-{
-    template<> void DefaultDeleter<CvBGStatModel>::operator ()(CvBGStatModel* obj) const
-    {
-        cvReleaseBGStatModel(&obj);
-    }
-}
-
-#endif
-
 DEF_PARAM_TEST_1(Video, string);
 
 PERF_TEST_P(Video, FGDStatModel,
@@ -150,48 +134,7 @@ PERF_TEST_P(Video, FGDStatModel,
     }
     else
     {
-#ifdef HAVE_OPENCV_CUDALEGACY
-        IplImage ipl_frame = frame;
-        cv::Ptr<CvBGStatModel> model(cvCreateFGDStatModel(&ipl_frame));
-
-        int i = 0;
-
-        // collect performance data
-        for (; i < numIters; ++i)
-        {
-            cap >> frame;
-            ASSERT_FALSE(frame.empty());
-
-            ipl_frame = frame;
-
-            startTimer();
-            if(!next())
-                break;
-
-            cvUpdateBGStatModel(&ipl_frame, model);
-
-            stopTimer();
-        }
-
-        // process last frame in sequence to get data for sanity test
-        for (; i < numIters; ++i)
-        {
-            cap >> frame;
-            ASSERT_FALSE(frame.empty());
-
-            ipl_frame = frame;
-
-            cvUpdateBGStatModel(&ipl_frame, model);
-        }
-
-        const cv::Mat background = cv::cvarrToMat(model->background);
-        const cv::Mat foreground = cv::cvarrToMat(model->foreground);
-
-        CPU_SANITY_CHECK(background);
-        CPU_SANITY_CHECK(foreground);
-#else
         FAIL_NO_CPU();
-#endif
     }
 }
 
diff --git a/modules/cudabgsegm/test/test_bgsegm.cpp b/modules/cudabgsegm/test/test_bgsegm.cpp
index 34f3dcc9ab..89fd69474c 100644
--- a/modules/cudabgsegm/test/test_bgsegm.cpp
+++ b/modules/cudabgsegm/test/test_bgsegm.cpp
@@ -42,10 +42,6 @@
 
 #include "test_precomp.hpp"
 
-#ifdef HAVE_OPENCV_CUDALEGACY
-#  include "opencv2/cudalegacy.hpp"
-#endif
-
 #ifdef HAVE_CUDA
 
 using namespace cvtest;
@@ -63,80 +59,6 @@ using namespace cvtest;
 #  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
 #endif
 
-//////////////////////////////////////////////////////
-// FGDStatModel
-
-#if BUILD_WITH_VIDEO_INPUT_SUPPORT && defined(HAVE_OPENCV_CUDALEGACY)
-
-namespace cv
-{
-    template<> void DefaultDeleter<CvBGStatModel>::operator ()(CvBGStatModel* obj) const
-    {
-        cvReleaseBGStatModel(&obj);
-    }
-}
-
-PARAM_TEST_CASE(FGDStatModel, cv::cuda::DeviceInfo, std::string)
-{
-    cv::cuda::DeviceInfo devInfo;
-    std::string inputFile;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        cv::cuda::setDevice(devInfo.deviceID());
-
-        inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "video/" + GET_PARAM(1);
-    }
-};
-
-CUDA_TEST_P(FGDStatModel, Update)
-{
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    IplImage ipl_frame = frame;
-    cv::Ptr<CvBGStatModel> model(cvCreateFGDStatModel(&ipl_frame));
-
-    cv::cuda::GpuMat d_frame(frame);
-    cv::Ptr<cv::cuda::BackgroundSubtractorFGD> d_fgd = cv::cuda::createBackgroundSubtractorFGD();
-    cv::cuda::GpuMat d_foreground, d_background;
-    std::vector< std::vector<cv::Point> > foreground_regions;
-    d_fgd->apply(d_frame, d_foreground);
-
-    for (int i = 0; i < 5; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        ipl_frame = frame;
-        int gold_count = cvUpdateBGStatModel(&ipl_frame, model);
-
-        d_frame.upload(frame);
-        d_fgd->apply(d_frame, d_foreground);
-        d_fgd->getBackgroundImage(d_background);
-        d_fgd->getForegroundRegions(foreground_regions);
-        int count = (int) foreground_regions.size();
-
-        cv::Mat gold_background = cv::cvarrToMat(model->background);
-        cv::Mat gold_foreground = cv::cvarrToMat(model->foreground);
-
-        ASSERT_MAT_NEAR(gold_background, d_background, 1.0);
-        ASSERT_MAT_NEAR(gold_foreground, d_foreground, 0.0);
-        ASSERT_EQ(gold_count, count);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(CUDA_BgSegm, FGDStatModel, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"))));
-
-#endif
-
 //////////////////////////////////////////////////////
 // MOG
 
diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp
index 9326fa1932..ff730ee941 100644
--- a/modules/imgproc/src/demosaicing.cpp
+++ b/modules/imgproc/src/demosaicing.cpp
@@ -66,6 +66,11 @@ public:
         return 0;
     }
 
+    int bayer2RGBA(const T*, int, T*, int, int) const
+    {
+        return 0;
+    }
+
     int bayer2RGB_EA(const T*, int, T*, int, int) const
     {
         return 0;
@@ -218,6 +223,11 @@ public:
         return (int)(bayer - (bayer_end - width));
     }
 
+    int bayer2RGBA(const uchar*, int, uchar*, int, int) const
+    {
+        return 0;
+    }
+
     int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
     {
         if (!use_simd)
@@ -323,6 +333,165 @@ public:
 
     bool use_simd;
 };
+#elif CV_NEON
+class SIMDBayerInterpolator_8u
+{
+public:
+    SIMDBayerInterpolator_8u()
+    {
+    }
+
+    int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
+                   int width, int bcoeff, int gcoeff, int rcoeff) const
+    {
+        /*
+         B G B G | B G B G | B G B G | B G B G
+         G R G R | G R G R | G R G R | G R G R
+         B G B G | B G B G | B G B G | B G B G
+         */
+
+        uint16x8_t masklo = vdupq_n_u16(255);
+        const uchar* bayer_end = bayer + width;
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
+        {
+            uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
+            uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
+            uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
+
+            uint16x8_t b1_ = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
+            uint16x8_t b1 = vextq_u16(b1_, b1_, 1);
+            uint16x8_t b0 = vaddq_u16(b1_, b1);
+            // b0 = b0 b2 b4 ...
+            // b1 = b1 b3 b5 ...
+
+            uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
+            uint16x8_t g1 = vandq_u16(r1, masklo);
+            g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
+            g1 = vshlq_n_u16(vextq_u16(g1, g1, 1), 2);
+            // g0 = b0 b2 b4 ...
+            // g1 = b1 b3 b5 ...
+
+            r0 = vshrq_n_u16(r1, 8);
+            r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
+            r0 = vshlq_n_u16(r0, 2);
+            // r0 = r0 r2 r4 ...
+            // r1 = r1 r3 r5 ...
+
+            b0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b0), (short)(rcoeff*2)));
+            b1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b1), (short)(rcoeff*4)));
+
+            g0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g0), (short)(gcoeff*2)));
+            g1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g1), (short)(gcoeff*2)));
+
+            r0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r0), (short)(bcoeff*2)));
+            r1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r1), (short)(bcoeff*4)));
+
+            g0 = vaddq_u16(vaddq_u16(g0, b0), r0);
+            g1 = vaddq_u16(vaddq_u16(g1, b1), r1);
+
+            uint8x8x2_t p = vzip_u8(vrshrn_n_u16(g0, 2), vrshrn_n_u16(g1, 2));
+            vst1_u8(dst, p.val[0]);
+            vst1_u8(dst + 8, p.val[1]);
+        }
+
+        return (int)(bayer - (bayer_end - width));
+    }
+
+    int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
+    {
+        /*
+         B G B G | B G B G | B G B G | B G B G
+         G R G R | G R G R | G R G R | G R G R
+         B G B G | B G B G | B G B G | B G B G
+         */
+        uint16x8_t masklo = vdupq_n_u16(255);
+        uint8x16x3_t pix;
+        const uchar* bayer_end = bayer + width;
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
+        {
+            uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
+            uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
+            uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
+
+            uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
+            uint16x8_t nextb1 = vextq_u16(b1, b1, 1);
+            uint16x8_t b0 = vaddq_u16(b1, nextb1);
+            // b0 b1 b2 ...
+            uint8x8x2_t bb = vzip_u8(vrshrn_n_u16(b0, 2), vrshrn_n_u16(nextb1, 1));
+            pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]);
+
+            uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
+            uint16x8_t g1 = vandq_u16(r1, masklo);
+            g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
+            g1 = vextq_u16(g1, g1, 1);
+            // g0 g1 g2 ...
+            uint8x8x2_t gg = vzip_u8(vrshrn_n_u16(g0, 2), vmovn_u16(g1));
+            pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]);
+
+            r0 = vshrq_n_u16(r1, 8);
+            r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
+            // r0 r1 r2 ...
+            uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vrshrn_n_u16(r1, 1));
+            pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]);
+
+            vst3q_u8(dst-1, pix);
+        }
+
+        return (int)(bayer - (bayer_end - width));
+    }
+
+    int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
+    {
+        /*
+         B G B G | B G B G | B G B G | B G B G
+         G R G R | G R G R | G R G R | G R G R
+         B G B G | B G B G | B G B G | B G B G
+         */
+        uint16x8_t masklo = vdupq_n_u16(255);
+        uint8x16x4_t pix;
+        const uchar* bayer_end = bayer + width;
+        pix.val[3] = vdupq_n_u8(255);
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 )
+        {
+            uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
+            uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
+            uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
+
+            uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
+            uint16x8_t nextb1 = vextq_u16(b1, b1, 1);
+            uint16x8_t b0 = vaddq_u16(b1, nextb1);
+            // b0 b1 b2 ...
+            uint8x8x2_t bb = vzip_u8(vrshrn_n_u16(b0, 2), vrshrn_n_u16(nextb1, 1));
+            pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]);
+
+            uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
+            uint16x8_t g1 = vandq_u16(r1, masklo);
+            g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
+            g1 = vextq_u16(g1, g1, 1);
+            // g0 g1 g2 ...
+            uint8x8x2_t gg = vzip_u8(vrshrn_n_u16(g0, 2), vmovn_u16(g1));
+            pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]);
+
+            r0 = vshrq_n_u16(r1, 8);
+            r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
+            // r0 r1 r2 ...
+            uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vrshrn_n_u16(r1, 1));
+            pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]);
+
+            vst4q_u8(dst-1, pix);
+        }
+
+        return (int)(bayer - (bayer_end - width));
+    }
+
+    int bayer2RGB_EA(const uchar*, int, uchar*, int, int) const
+    {
+        return 0;
+    }
+};
 #else
 typedef SIMDBayerStubInterpolator_<uchar> SIMDBayerInterpolator_8u;
 #endif
@@ -559,7 +728,9 @@ public:
             }
 
             // simd optimization only for dcn == 3
-            int delta = dcn == 4 ? 0 : vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue);
+            int delta = dcn == 4 ?
+                vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue) :
+                vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue);
             bayer += delta;
             dst += delta*dcn;
 
diff --git a/modules/imgproc/src/opencl/pyr_down.cl b/modules/imgproc/src/opencl/pyr_down.cl
index 2358775e7a..4db1a8d811 100644
--- a/modules/imgproc/src/opencl/pyr_down.cl
+++ b/modules/imgproc/src/opencl/pyr_down.cl
@@ -89,19 +89,56 @@
 #define MAD(x,y,z) mad((x),(y),(z))
 #endif
 
+#define LOAD_LOCAL(col_gl, col_lcl) \
+    sum0 =     co3* SRC(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));         \
+    sum0 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum0);  \
+    temp = SRC(col_gl, EXTRAPOLATE_(src_y, src_rows));                      \
+    sum0 = MAD(co1, temp, sum0);                                            \
+    sum1 = co3 * temp;                                                      \
+    temp = SRC(col_gl, EXTRAPOLATE_(src_y + 1, src_rows));                  \
+    sum0 = MAD(co2, temp, sum0);                                            \
+    sum1 = MAD(co2, temp, sum1);                                            \
+    temp = SRC(col_gl, EXTRAPOLATE_(src_y + 2, src_rows));                  \
+    sum0 = MAD(co3, temp, sum0);                                            \
+    sum1 = MAD(co1, temp, sum1);                                            \
+    smem[0][col_lcl] = sum0;                                                \
+    sum1 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum1);  \
+    sum1 = MAD(co3, SRC(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum1);  \
+    smem[1][col_lcl] = sum1;
+
+
+#if kercn == 4
+#define LOAD_LOCAL4(col_gl, col_lcl) \
+    sum40 =     co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));           \
+    sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40);   \
+    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y, src_rows));                       \
+    sum40 = MAD(co1, temp4, sum40);                                             \
+    sum41 = co3 * temp4;                                                        \
+    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 1, src_rows));                   \
+    sum40 = MAD(co2, temp4, sum40);                                             \
+    sum41 = MAD(co2, temp4, sum41);                                             \
+    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 2, src_rows));                   \
+    sum40 = MAD(co3, temp4, sum40);                                             \
+    sum41 = MAD(co1, temp4, sum41);                                             \
+    vstore4(sum40, col_lcl, (__local float*) &smem[0][2]);                      \
+    sum41 = MAD(co2, SRC4(col_gl,  EXTRAPOLATE_(src_y + 3, src_rows)), sum41);  \
+    sum41 = MAD(co3, SRC4(col_gl,  EXTRAPOLATE_(src_y + 4, src_rows)), sum41);  \
+    vstore4(sum41, col_lcl, (__local float*) &smem[1][2]);
+#endif
+
 #define noconvert
 
 __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                          __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
 {
     const int x = get_global_id(0)*kercn;
-    const int y = get_group_id(1);
+    const int y = 2*get_global_id(1);
 
-    __local FT smem[LOCAL_SIZE + 4];
+    __local FT smem[2][LOCAL_SIZE + 4];
     __global uchar * dstData = dst + dst_offset;
     __global const uchar * srcData = src + src_offset;
 
-    FT sum;
+    FT sum0, sum1, temp;
     FT co1 = 0.375f;
     FT co2 = 0.25f;
     FT co3 = 0.0625f;
@@ -109,134 +146,68 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
     const int src_y = 2*y;
     int col;
 
-    if (src_y >= 2 && src_y < src_rows - 2)
+    if (src_y >= 2 && src_y < src_rows - 4)
     {
+#define EXTRAPOLATE_(val, maxVal)   val
 #if kercn == 1
         col = EXTRAPOLATE(x, src_cols);
-
-        sum =     co3* SRC(col, src_y - 2);
-        sum = MAD(co2, SRC(col, src_y - 1), sum);
-        sum = MAD(co1, SRC(col, src_y    ), sum);
-        sum = MAD(co2, SRC(col, src_y + 1), sum);
-        sum = MAD(co3, SRC(col, src_y + 2), sum);
-
-        smem[2 + get_local_id(0)] = sum;
+        LOAD_LOCAL(col, 2 + get_local_id(0))
 #else
         if (x < src_cols-4)
         {
-            float4 sum4;
-            sum4 =     co3* SRC4(x, src_y - 2);
-            sum4 = MAD(co2, SRC4(x, src_y - 1), sum4);
-            sum4 = MAD(co1, SRC4(x, src_y    ), sum4);
-            sum4 = MAD(co2, SRC4(x, src_y + 1), sum4);
-            sum4 = MAD(co3, SRC4(x, src_y + 2), sum4);
-
-            vstore4(sum4, get_local_id(0), (__local float*) &smem[2]);
+            float4 sum40, sum41, temp4;
+            LOAD_LOCAL4(x, get_local_id(0))
         }
         else
         {
             for (int i=0; i<4; i++)
             {
                 col = EXTRAPOLATE(x+i, src_cols);
-                sum =     co3* SRC(col, src_y - 2);
-                sum = MAD(co2, SRC(col, src_y - 1), sum);
-                sum = MAD(co1, SRC(col, src_y    ), sum);
-                sum = MAD(co2, SRC(col, src_y + 1), sum);
-                sum = MAD(co3, SRC(col, src_y + 2), sum);
-
-                smem[2 + 4*get_local_id(0)+i] = sum;
+                LOAD_LOCAL(col, 2 + 4 * get_local_id(0) + i)
             }
         }
 #endif
         if (get_local_id(0) < 2)
         {
             col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
-
-            sum =     co3* SRC(col, src_y - 2);
-            sum = MAD(co2, SRC(col, src_y - 1), sum);
-            sum = MAD(co1, SRC(col, src_y    ), sum);
-            sum = MAD(co2, SRC(col, src_y + 1), sum);
-            sum = MAD(co3, SRC(col, src_y + 2), sum);
-
-            smem[get_local_id(0)] = sum;
+            LOAD_LOCAL(col, get_local_id(0))
         }
-
-        if (get_local_id(0) > 1 && get_local_id(0) < 4)
+        else if (get_local_id(0) < 4)
         {
             col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
-
-            sum =     co3* SRC(col, src_y - 2);
-            sum = MAD(co2, SRC(col, src_y - 1), sum);
-            sum = MAD(co1, SRC(col, src_y    ), sum);
-            sum = MAD(co2, SRC(col, src_y + 1), sum);
-            sum = MAD(co3, SRC(col, src_y + 2), sum);
-
-            smem[LOCAL_SIZE + get_local_id(0)] = sum;
+            LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
         }
     }
     else // need extrapolate y
     {
+#define EXTRAPOLATE_(val, maxVal)   EXTRAPOLATE(val, maxVal)
 #if kercn == 1
         col = EXTRAPOLATE(x, src_cols);
-
-        sum =     co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
-        sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
-        sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y    , src_rows)), sum);
-        sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
-        sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
-
-        smem[2 + get_local_id(0)] = sum;
+        LOAD_LOCAL(col, 2 + get_local_id(0))
 #else
         if (x < src_cols-4)
         {
-            float4 sum4;
-            sum4 =     co3* SRC4(x, EXTRAPOLATE(src_y - 2, src_rows));
-            sum4 = MAD(co2, SRC4(x, EXTRAPOLATE(src_y - 1, src_rows)), sum4);
-            sum4 = MAD(co1, SRC4(x, EXTRAPOLATE(src_y    , src_rows)), sum4);
-            sum4 = MAD(co2, SRC4(x, EXTRAPOLATE(src_y + 1, src_rows)), sum4);
-            sum4 = MAD(co3, SRC4(x, EXTRAPOLATE(src_y + 2, src_rows)), sum4);
-
-            vstore4(sum4, get_local_id(0), (__local float*) &smem[2]);
+            float4 sum40, sum41, temp4;
+            LOAD_LOCAL4(x, get_local_id(0))
         }
         else
         {
             for (int i=0; i<4; i++)
             {
                 col = EXTRAPOLATE(x+i, src_cols);
-                sum =     co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
-                sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
-                sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y    , src_rows)), sum);
-                sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
-                sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
-
-                smem[2 + 4*get_local_id(0)+i] = sum;
+                LOAD_LOCAL(col, 2 + 4*get_local_id(0) + i)
             }
         }
 #endif
         if (get_local_id(0) < 2)
         {
             col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
-
-            sum =     co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
-            sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
-            sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y    , src_rows)), sum);
-            sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
-            sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
-
-            smem[get_local_id(0)] = sum;
+            LOAD_LOCAL(col, get_local_id(0))
         }
-
-        if (get_local_id(0) > 1 && get_local_id(0) < 4)
+        else if (get_local_id(0) < 4)
         {
             col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
-
-            sum =     co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
-            sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
-            sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y    , src_rows)), sum);
-            sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
-            sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
-
-            smem[LOCAL_SIZE + get_local_id(0)] = sum;
+            LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
         }
     }
 
@@ -247,50 +218,68 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
     {
         const int tid2 = get_local_id(0) * 2;
 
-        sum = 0.f;
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dst_cols)
+        {
+            for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
+            {
 #if cn == 1
 #if fdepth <= 5
-        sum = sum + dot(vload4(0, (__local float*) (&smem)+tid2), (float4)(co3, co2, co1, co2));
+                FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2));
 #else
-        sum = sum + dot(vload4(0, (__local double*) (&smem)+tid2), (double4)(co3, co2, co1, co2));
+                FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2));
 #endif
 #else
-        sum = MAD(co3, smem[2 + tid2 - 2], sum);
-        sum = MAD(co2, smem[2 + tid2 - 1], sum);
-        sum = MAD(co1, smem[2 + tid2    ], sum);
-        sum = MAD(co2, smem[2 + tid2 + 1], sum);
+                FT sum = co3 * smem[yin - y][2 + tid2 - 2];
+                sum = MAD(co2, smem[yin - y][2 + tid2 - 1], sum);
+                sum = MAD(co1, smem[yin - y][2 + tid2    ], sum);
+                sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum);
 #endif
-        sum = MAD(co3, smem[2 + tid2 + 2], sum);
-
-        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
-        if (dst_x < dst_cols)
-            storepix(convertToT(sum), dstData + y * dst_step + dst_x * PIXSIZE);
+                sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum);
+                storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE);
+            }
+        }
     }
 #else
     int tid4 = get_local_id(0) * 4;
-
-    sum =     co3* smem[2 + tid4 + 2];
-    sum = MAD(co3, smem[2 + tid4 - 2], sum);
-    sum = MAD(co2, smem[2 + tid4 - 1], sum);
-    sum = MAD(co1, smem[2 + tid4    ], sum);
-    sum = MAD(co2, smem[2 + tid4 + 1], sum);
-
     int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2;
+    if (dst_x < dst_cols - 1)
+    {
+        for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
+        {
 
-    if (dst_x < dst_cols)
-        storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE));
-
-    tid4 += 2;
-    dst_x += 1;
+            FT sum =  co3* smem[yin - y][2 + tid4 + 2];
+            sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
+            sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
+            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
+
+            dst_x ++;
+            sum =     co3* smem[yin - y][2 + tid4 + 4];
+            sum = MAD(co3, smem[yin - y][2 + tid4    ], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
+            sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum);
+            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
+            dst_x --;
+        }
 
-    sum =     co3* smem[2 + tid4 + 2];
-    sum = MAD(co3, smem[2 + tid4 - 2], sum);
-    sum = MAD(co2, smem[2 + tid4 - 1], sum);
-    sum = MAD(co1, smem[2 + tid4    ], sum);
-    sum = MAD(co2, smem[2 + tid4 + 1], sum);
+    }
+    else if (dst_x < dst_cols)
+    {
+        for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
+        {
+            FT sum =  co3* smem[yin - y][2 + tid4 + 2];
+            sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
+            sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
 
-    if (dst_x < dst_cols)
-        storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE));
+            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
+        }
+    }
 #endif
+
 }
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index cbbe399301..2714e08f30 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -445,7 +445,7 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in
     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));
 
     size_t localThreads[2]  = { local_size/kercn, 1 };
-    size_t globalThreads[2] = { (src.cols + (kercn-1))/kercn, dst.rows };
+    size_t globalThreads[2] = { (src.cols + (kercn-1))/kercn, (dst.rows + 1) / 2 };
     return k.run(2, globalThreads, localThreads, false);
 }
 
diff --git a/modules/photo/doc/cloning.rst b/modules/photo/doc/cloning.rst
index 384c6b676a..0965d3a72f 100644
--- a/modules/photo/doc/cloning.rst
+++ b/modules/photo/doc/cloning.rst
@@ -7,7 +7,7 @@ seamlessClone
 -------------
 Image editing tasks concern either global changes (color/intensity corrections, filters, deformations) or local changes concerned to a selection.
 Here we are interested in achieving local changes, ones that are restricted to a region manually selected (ROI), in a seamless and effortless manner.
-The extent of the changes ranges from slight distortions to complete replacement by novel content.
+The extent of the changes ranges from slight distortions to complete replacement by novel content [PM03]_.
 
 .. ocv:function:: void seamlessClone( InputArray src, InputArray dst, InputArray mask, Point p, OutputArray blend, int flags)
 
@@ -25,13 +25,9 @@ The extent of the changes ranges from slight distortions to complete replacement
 
             * **NORMAL_CLONE**     The power of the method is fully expressed when inserting objects with complex outlines into a new background
 
-            * **MIXED_CLONE**    The classic method, color-based selection and alpha
-                                 masking might be time consuming and often leaves an undesirable halo. Seamless
-                                 cloning, even averaged with the original image, is not effective. Mixed seamless
-                                 cloning based on a loose selection proves effective.
+            * **MIXED_CLONE**    The classic method, color-based selection and alpha masking might be time consuming and often leaves an undesirable halo. Seamless cloning, even averaged with the original image, is not effective. Mixed seamless cloning based on a loose selection proves effective.
 
-            * **FEATURE_EXCHANGE**     Feature exchange allows the user to replace easily certain
-                                       features of one object by alternative features.
+            * **FEATURE_EXCHANGE**     Feature exchange allows the user to easily replace certain features of one object by alternative features.
 
 
 
@@ -97,3 +93,5 @@ region, giving its contents a flat aspect. Here Canny Edge Detector is used.
 **NOTE:**
 
 The algorithm assumes that the color of the source image is close to that of the destination. This assumption means that when the colors don't match, the source image color gets tinted toward the color of the destination image.
+
+.. [PM03] Patrick Perez, Michel Gangnet, Andrew Blake, "Poisson image editing", ACM Transactions on Graphics (SIGGRAPH), 2003.
diff --git a/modules/photo/doc/decolor.rst b/modules/photo/doc/decolor.rst
index cf7b9b9c4c..69bf0d590b 100644
--- a/modules/photo/doc/decolor.rst
+++ b/modules/photo/doc/decolor.rst
@@ -6,7 +6,7 @@ Decolorization
 decolor
 -------
 
-Transforms a color image to a grayscale image. It is a basic tool in digital printing, stylized black-and-white photograph rendering, and in many single channel image processing applications.
+Transforms a color image to a grayscale image. It is a basic tool in digital printing, stylized black-and-white photograph rendering, and in many single channel image processing applications [CL12]_.
 
 .. ocv:function:: void decolor( InputArray src, OutputArray grayscale, OutputArray color_boost )
 
@@ -17,3 +17,5 @@ Transforms a color image to a grayscale image. It is a basic tool in digital pri
     :param color_boost: Output 8-bit 3-channel image.
 
 This function is to be applied on color images.
+
+.. [CL12] Cewu Lu, Li Xu, Jiaya Jia, "Contrast Preserving Decolorization", IEEE International Conference on Computational Photography (ICCP), 2012.
diff --git a/modules/photo/doc/hdr_imaging.rst b/modules/photo/doc/hdr_imaging.rst
index bcd962f86d..708ca87902 100644
--- a/modules/photo/doc/hdr_imaging.rst
+++ b/modules/photo/doc/hdr_imaging.rst
@@ -356,7 +356,7 @@ Creates MergeRobertson object
 .. ocv:function:: Ptr<MergeRobertson> createMergeRobertson()
 
 References
-==========
+---------------------------
 
 .. [DM03] F. Drago, K. Myszkowski, T. Annen, N. Chiba, "Adaptive Logarithmic Mapping For Displaying High Contrast Scenes", Computer Graphics Forum, 2003, 22, 419 - 426.
 
diff --git a/modules/photo/doc/npr.rst b/modules/photo/doc/npr.rst
index 123c946c2a..c07fd69beb 100644
--- a/modules/photo/doc/npr.rst
+++ b/modules/photo/doc/npr.rst
@@ -6,7 +6,7 @@ Non-Photorealistic Rendering
 edgePreservingFilter
 --------------------
 
-Filtering is the fundamental operation in image and video processing. Edge-preserving smoothing filters are used in many different applications.
+Filtering is the fundamental operation in image and video processing. Edge-preserving smoothing filters are used in many different applications [EM11]_.
 
 .. ocv:function:: void edgePreservingFilter(InputArray src, OutputArray dst, int flags = 1, float sigma_s = 60, float sigma_r = 0.4f)
 
@@ -16,9 +16,9 @@ Filtering is the fundamental operation in image and video processing. Edge-prese
 
     :param flags: Edge preserving filters:
 
-            * **RECURS_FILTER**
+            * **RECURS_FILTER** = 1
 
-            * **NORMCONV_FILTER**
+            * **NORMCONV_FILTER** = 2
 
     :param sigma_s: Range between 0 to 200.
 
@@ -72,3 +72,5 @@ Stylization aims to produce digital imagery with a wide variety of effects not f
     :param sigma_s: Range between 0 to 200.
 
     :param sigma_r: Range between 0 to 1.
+
+.. [EM11] Eduardo S. L. Gastal, Manuel M. Oliveira, "Domain transform for edge-aware image and video processing", ACM Trans. Graph. 30(4): 69, 2011.
diff --git a/modules/photo/src/npr.hpp b/modules/photo/src/npr.hpp
index 744b2bdfbb..2ff1985aca 100644
--- a/modules/photo/src/npr.hpp
+++ b/modules/photo/src/npr.hpp
@@ -173,6 +173,7 @@ void Domain_Filter::compute_Rfilter(Mat &output, Mat &hz, float sigma_h)
 {
     int h = output.rows;
     int w = output.cols;
+    int channel = output.channels();
 
     float a = (float) exp((-1.0 * sqrt(2.0)) / sigma_h);
 
@@ -185,11 +186,15 @@ void Domain_Filter::compute_Rfilter(Mat &output, Mat &hz, float sigma_h)
         for(int j=0;j<w;j++)
             V.at<float>(i,j) = pow(a,hz.at<float>(i,j));
 
-    for(int i=0; i<h; i++)
+   for(int i=0; i<h; i++)
     {
         for(int j =1; j < w; j++)
         {
-           temp.at<float>(i,j) = temp.at<float>(i,j) + (temp.at<float>(i,j-1) - temp.at<float>(i,j)) * V.at<float>(i,j);
+            for(int c = 0; c<channel; c++)
+            {
+                temp.at<float>(i,j*channel+c) = temp.at<float>(i,j*channel+c) +
+                    (temp.at<float>(i,(j-1)*channel+c) - temp.at<float>(i,j*channel+c)) * V.at<float>(i,j);
+            }
         }
     }
 
@@ -197,7 +202,11 @@ void Domain_Filter::compute_Rfilter(Mat &output, Mat &hz, float sigma_h)
     {
         for(int j =w-2; j >= 0; j--)
         {
-           temp.at<float>(i,j) = temp.at<float>(i,j) + (temp.at<float>(i,j+1) - temp.at<float>(i,j)) * V.at<float>(i,j+1);
+            for(int c = 0; c<channel; c++)
+            {
+                temp.at<float>(i,j*channel+c) = temp.at<float>(i,j*channel+c) +
+                    (temp.at<float>(i,(j+1)*channel+c) - temp.at<float>(i,j*channel+c))*V.at<float>(i,j+1);
+            }
         }
     }
 
diff --git a/modules/photo/src/seamless_cloning.cpp b/modules/photo/src/seamless_cloning.cpp
index 6ddadb3202..445c6dae74 100644
--- a/modules/photo/src/seamless_cloning.cpp
+++ b/modules/photo/src/seamless_cloning.cpp
@@ -108,6 +108,7 @@ void cv::seamlessClone(InputArray _src, InputArray _dst, InputArray _mask, Point
 
     Cloning obj;
     obj.normal_clone(dest,cd_mask,dst_mask,blend,flags);
+
 }
 
 void cv::colorChange(InputArray _src, InputArray _mask, OutputArray _dst, float r, float g, float b)
@@ -136,7 +137,6 @@ void cv::colorChange(InputArray _src, InputArray _mask, OutputArray _dst, float
     obj.local_color_change(src,cs_mask,gray,blend,red,green,blue);
 }
 
-
 void cv::illuminationChange(InputArray _src, InputArray _mask, OutputArray _dst, float a, float b)
 {
 
diff --git a/modules/photo/src/seamless_cloning.hpp b/modules/photo/src/seamless_cloning.hpp
index 143d550894..669be9f089 100644
--- a/modules/photo/src/seamless_cloning.hpp
+++ b/modules/photo/src/seamless_cloning.hpp
@@ -455,6 +455,8 @@ void Cloning::normal_clone(Mat &I, Mat &mask, Mat &wmask, Mat &cloned, int num)
 {
     int w = I.size().width;
     int h = I.size().height;
+    int channel = I.channels();
+
 
     initialization(I,mask,wmask);
 
@@ -466,20 +468,33 @@ void Cloning::normal_clone(Mat &I, Mat &mask, Mat &wmask, Mat &cloned, int num)
     }
     else if(num == 2)
     {
+
         for(int i=0;i < h; i++)
-            for(int j=0; j < w; j++)
+        {
+           for(int j=0; j < w; j++)
             {
-                if(abs(sgx.at<float>(i,j) - sgy.at<float>(i,j)) > abs(grx.at<float>(i,j) - gry.at<float>(i,j)))
+                for(int c=0;c<channel;++c)
                 {
-                    srx32.at<float>(i,j) = sgx.at<float>(i,j) * smask.at<float>(i,j);
-                    sry32.at<float>(i,j) = sgy.at<float>(i,j) * smask.at<float>(i,j);
-                }
-                else
-                {
-                    srx32.at<float>(i,j) = grx.at<float>(i,j) * smask.at<float>(i,j);
-                    sry32.at<float>(i,j) = gry.at<float>(i,j) * smask.at<float>(i,j);
+                    if(abs(sgx.at<float>(i,j*channel+c) - sgy.at<float>(i,j*channel+c)) >
+                            abs(grx.at<float>(i,j*channel+c) - gry.at<float>(i,j*channel+c)))
+                    {
+
+                        srx32.at<float>(i,j*channel+c) = sgx.at<float>(i,j*channel+c)
+                            * smask.at<float>(i,j);
+                        sry32.at<float>(i,j*channel+c) = sgy.at<float>(i,j*channel+c)
+                            * smask.at<float>(i,j);
+                    }
+                    else
+                    {
+                        srx32.at<float>(i,j*channel+c) = grx.at<float>(i,j*channel+c)
+                            * smask.at<float>(i,j);
+                        sry32.at<float>(i,j*channel+c) = gry.at<float>(i,j*channel+c)
+                            * smask.at<float>(i,j);
+                    }
                 }
             }
+        }
+
     }
     else if(num == 3)
     {
diff --git a/modules/videoio/CMakeLists.txt b/modules/videoio/CMakeLists.txt
index bba3d33396..96ac5045f5 100644
--- a/modules/videoio/CMakeLists.txt
+++ b/modules/videoio/CMakeLists.txt
@@ -148,7 +148,7 @@ endif(HAVE_INTELPERC)
 
 if(IOS)
   add_definitions(-DHAVE_IOS=1)
-  list(APPEND videoio_srcs src/ios_conversions.mm src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm)
+  list(APPEND videoio_srcs src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm)
   list(APPEND VIDEOIO_LIBRARIES "-framework Accelerate" "-framework AVFoundation" "-framework CoreGraphics" "-framework CoreImage" "-framework CoreMedia" "-framework CoreVideo" "-framework QuartzCore" "-framework AssetsLibrary")
 endif()
 
diff --git a/samples/cpp/tutorial_code/photo/decolorization/decolor.cpp b/samples/cpp/tutorial_code/photo/decolorization/decolor.cpp
new file mode 100644
index 0000000000..067bad1178
--- /dev/null
+++ b/samples/cpp/tutorial_code/photo/decolorization/decolor.cpp
@@ -0,0 +1,40 @@
+/*
+* decolor.cpp
+*
+* Author:
+* Siddharth Kherada <siddharthkherada27[at]gmail[dot]com>
+*
+* This tutorial demonstrates how to use OpenCV Decolorization Module.
+*
+* Input:
+* Color Image
+*
+* Output:
+* 1) Grayscale image
+* 2) Color boost image
+*
+*/
+
+#include "opencv2/photo.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/core.hpp"
+#include <iostream>
+
+using namespace std;
+using namespace cv;
+
+int main(int argc, char *argv[])
+{
+    CV_Assert(argc == 2);
+    Mat I;
+    I = imread(argv[1]);
+
+    Mat gray = Mat(I.size(),CV_8UC1);
+    Mat color_boost = Mat(I.size(),CV_8UC3);
+
+    decolor(I,gray,color_boost);
+    imshow("grayscale",gray);
+    imshow("color_boost",color_boost);
+    waitKey(0);
+}
diff --git a/samples/cpp/tutorial_code/photo/non_photorealistic_rendering/npr_demo.cpp b/samples/cpp/tutorial_code/photo/non_photorealistic_rendering/npr_demo.cpp
new file mode 100644
index 0000000000..5579ca269f
--- /dev/null
+++ b/samples/cpp/tutorial_code/photo/non_photorealistic_rendering/npr_demo.cpp
@@ -0,0 +1,96 @@
+/*
+* npr_demo.cpp
+*
+* Author:
+* Siddharth Kherada <siddharthkherada27[at]gmail[dot]com>
+*
+* This tutorial demonstrates how to use OpenCV Non-Photorealistic Rendering Module.
+* 1) Edge Preserve Smoothing
+*    -> Using Normalized convolution Filter
+*    -> Using Recursive Filter
+* 2) Detail Enhancement
+* 3) Pencil sketch/Color Pencil Drawing
+* 4) Stylization
+*
+*/
+
+#include <signal.h>
+#include "opencv2/photo.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/core.hpp"
+#include <iostream>
+#include <stdlib.h>
+
+using namespace std;
+using namespace cv;
+
+int main(int argc, char* argv[])
+{
+    if(argc < 2)
+    {
+        cout << "usage: " << argv[0] << " <Input image> "  << endl;
+        exit(0);
+    }
+
+    int num,type;
+
+    Mat I = imread(argv[1]);
+
+    if(!I.data)
+    {
+        cout <<  "Image not found" << endl;
+        exit(0);
+    }
+
+    cout << endl;
+    cout << " Edge Preserve Filter" << endl;
+    cout << "----------------------" << endl;
+
+    cout << "Options: " << endl;
+    cout << endl;
+
+    cout << "1) Edge Preserve Smoothing" << endl;
+    cout << "   -> Using Normalized convolution Filter" << endl;
+    cout << "   -> Using Recursive Filter" << endl;
+    cout << "2) Detail Enhancement" << endl;
+    cout << "3) Pencil sketch/Color Pencil Drawing" << endl;
+    cout << "4) Stylization" << endl;
+    cout << endl;
+
+    cout << "Press number 1-4 to choose from above techniques: ";
+
+    cin >> num;
+
+    Mat img;
+
+    if(num == 1)
+    {
+        cout << endl;
+        cout << "Press 1 for Normalized Convolution Filter and 2 for Recursive Filter: ";
+
+        cin >> type;
+
+        edgePreservingFilter(I,img,type);
+        imshow("Edge Preserve Smoothing",img);
+
+    }
+    else if(num == 2)
+    {
+        detailEnhance(I,img);
+        imshow("Detail Enhanced",img);
+    }
+    else if(num == 3)
+    {
+        Mat img1;
+        pencilSketch(I,img1, img, 10 , 0.1f, 0.03f);
+        imshow("Pencil Sketch",img1);
+        imshow("Color Pencil Sketch",img);
+    }
+    else if(num == 4)
+    {
+        stylization(I,img);
+        imshow("Stylization",img);
+    }
+    waitKey(0);
+}
diff --git a/samples/cpp/tutorial_code/photo/seamless_cloning/cloning_demo.cpp b/samples/cpp/tutorial_code/photo/seamless_cloning/cloning_demo.cpp
new file mode 100644
index 0000000000..24d9b7facf
--- /dev/null
+++ b/samples/cpp/tutorial_code/photo/seamless_cloning/cloning_demo.cpp
@@ -0,0 +1,246 @@
+/*
+* cloning_demo.cpp
+*
+* Author:
+* Siddharth Kherada <siddharthkherada27[at]gmail[dot]com>
+*
+* This tutorial demonstrates how to use OpenCV seamless cloning
+* module without GUI.
+*
+* 1- Normal Cloning
+* 2- Mixed Cloning
+* 3- Monochrome Transfer
+* 4- Color Change
+* 5- Illumination change
+* 6- Texture Flattening
+
+* The program takes as input a source and a destination image (for 1-3 methods)
+* and ouputs the cloned image.
+*
+* Download test images from opencv_extra folder @github.
+*
+*/
+
+#include "opencv2/photo.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/core.hpp"
+#include <iostream>
+#include <stdlib.h>
+
+using namespace std;
+using namespace cv;
+
+int main()
+{
+    cout << endl;
+    cout << "Cloning Module" << endl;
+    cout << "---------------" << endl;
+    cout << "Options: " << endl;
+    cout << endl;
+    cout << "1) Normal Cloning " << endl;
+    cout << "2) Mixed Cloning " << endl;
+    cout << "3) Monochrome Transfer " << endl;
+    cout << "4) Local Color Change " << endl;
+    cout << "5) Local Illumination Change " << endl;
+    cout << "6) Texture Flattening " << endl;
+    cout << endl;
+    cout << "Press number 1-6 to choose from above techniques: ";
+    int num = 1;
+    cin >> num;
+    cout << endl;
+
+    if(num == 1)
+    {
+        string folder =  "cloning/Normal_Cloning/";
+        string original_path1 = folder + "source1.png";
+        string original_path2 = folder + "destination1.png";
+        string original_path3 = folder + "mask.png";
+
+        Mat source = imread(original_path1, IMREAD_COLOR);
+        Mat destination = imread(original_path2, IMREAD_COLOR);
+        Mat mask = imread(original_path3, IMREAD_COLOR);
+
+        if(source.empty())
+        {
+            cout << "Could not load source image " << original_path1 << endl;
+            exit(0);
+        }
+        if(destination.empty())
+        {
+            cout << "Could not load destination image " << original_path2 << endl;
+            exit(0);
+        }
+        if(mask.empty())
+        {
+            cout << "Could not load mask image " << original_path3 << endl;
+            exit(0);
+        }
+
+        Mat result;
+        Point p;
+        p.x = 400;
+        p.y = 100;
+
+        seamlessClone(source, destination, mask, p, result, 1);
+
+        imshow("Output",result);
+        imwrite(folder + "cloned.png", result);
+    }
+    else if(num == 2)
+    {
+        string folder = "cloning/Mixed_Cloning/";
+        string original_path1 = folder + "source1.png";
+        string original_path2 = folder + "destination1.png";
+        string original_path3 = folder + "mask.png";
+
+        Mat source = imread(original_path1, IMREAD_COLOR);
+        Mat destination = imread(original_path2, IMREAD_COLOR);
+        Mat mask = imread(original_path3, IMREAD_COLOR);
+
+        if(source.empty())
+        {
+            cout << "Could not load source image " << original_path1 << endl;
+            exit(0);
+        }
+        if(destination.empty())
+        {
+            cout << "Could not load destination image " << original_path2 << endl;
+            exit(0);
+        }
+        if(mask.empty())
+        {
+            cout << "Could not load mask image " << original_path3 << endl;
+            exit(0);
+        }
+
+        Mat result;
+        Point p;
+        p.x = destination.size().width/2;
+        p.y = destination.size().height/2;
+
+        seamlessClone(source, destination, mask, p, result, 2);
+
+        imshow("Output",result);
+        imwrite(folder + "cloned.png", result);
+    }
+    else if(num == 3)
+    {
+        string folder = "cloning/Monochrome_Transfer/";
+        string original_path1 = folder + "source1.png";
+        string original_path2 = folder + "destination1.png";
+        string original_path3 = folder + "mask.png";
+
+        Mat source = imread(original_path1, IMREAD_COLOR);
+        Mat destination = imread(original_path2, IMREAD_COLOR);
+        Mat mask = imread(original_path3, IMREAD_COLOR);
+
+        if(source.empty())
+        {
+            cout << "Could not load source image " << original_path1 << endl;
+            exit(0);
+        }
+        if(destination.empty())
+        {
+            cout << "Could not load destination image " << original_path2 << endl;
+            exit(0);
+        }
+        if(mask.empty())
+        {
+            cout << "Could not load mask image " << original_path3 << endl;
+            exit(0);
+        }
+
+        Mat result;
+        Point p;
+        p.x = destination.size().width/2;
+        p.y = destination.size().height/2;
+
+        seamlessClone(source, destination, mask, p, result, 3);
+
+        imshow("Output",result);
+        imwrite(folder + "cloned.png", result);
+    }
+    else if(num == 4)
+    {
+        string folder = "cloning/Color_Change/";
+        string original_path1 = folder + "source1.png";
+        string original_path2 = folder + "mask.png";
+
+        Mat source = imread(original_path1, IMREAD_COLOR);
+        Mat mask = imread(original_path2, IMREAD_COLOR);
+
+        if(source.empty())
+        {
+            cout << "Could not load source image " << original_path1 << endl;
+            exit(0);
+        }
+        if(mask.empty())
+        {
+            cout << "Could not load mask image " << original_path2 << endl;
+            exit(0);
+        }
+
+        Mat result;
+
+        colorChange(source, mask, result, 1.5, .5, .5);
+
+        imshow("Output",result);
+        imwrite(folder + "cloned.png", result);
+    }
+    else if(num == 5)
+    {
+        string folder = "cloning/Illumination_Change/";
+        string original_path1 = folder + "source1.png";
+        string original_path2 = folder + "mask.png";
+
+        Mat source = imread(original_path1, IMREAD_COLOR);
+        Mat mask = imread(original_path2, IMREAD_COLOR);
+
+        if(source.empty())
+        {
+            cout << "Could not load source image " << original_path1 << endl;
+            exit(0);
+        }
+        if(mask.empty())
+        {
+            cout << "Could not load mask image " << original_path2 << endl;
+            exit(0);
+        }
+
+        Mat result;
+
+        illuminationChange(source, mask, result, 0.2f, 0.4f);
+
+        imshow("Output",result);
+        imwrite(folder + "cloned.png", result);
+    }
+    else if(num == 6)
+    {
+        string folder = "cloning/Texture_Flattening/";
+        string original_path1 = folder + "source1.png";
+        string original_path2 = folder + "mask.png";
+
+        Mat source = imread(original_path1, IMREAD_COLOR);
+        Mat mask = imread(original_path2, IMREAD_COLOR);
+
+        if(source.empty())
+        {
+            cout << "Could not load source image " << original_path1 << endl;
+            exit(0);
+        }
+        if(mask.empty())
+        {
+            cout << "Could not load mask image " << original_path2 << endl;
+            exit(0);
+        }
+
+        Mat result;
+
+        textureFlattening(source, mask, result, 30, 45, 3);
+
+        imshow("Output",result);
+        imwrite(folder + "cloned.png", result);
+    }
+    waitKey(0);
+}
diff --git a/samples/cpp/tutorial_code/photo/seamless_cloning/cloning_gui.cpp b/samples/cpp/tutorial_code/photo/seamless_cloning/cloning_gui.cpp
new file mode 100644
index 0000000000..2457b12154
--- /dev/null
+++ b/samples/cpp/tutorial_code/photo/seamless_cloning/cloning_gui.cpp
@@ -0,0 +1,546 @@
+/*
+* cloning.cpp
+*
+* Author:
+* Siddharth Kherada <siddharthkherada27[at]gmail[dot]com>
+*
+* This tutorial demonstrates how to use OpenCV seamless cloning
+* module.
+*
+* 1- Normal Cloning
+* 2- Mixed Cloning
+* 3- Monochrome Transfer
+* 4- Color Change
+* 5- Illumination change
+* 6- Texture Flattening
+
+* The program takes as input a source and a destination image (for 1-3 methods)
+* and ouputs the cloned image.
+
+* Step 1:
+* -> In the source image, select the region of interest by left click mouse button. A Polygon ROI will be created by left clicking mouse button.
+* -> To set the Polygon ROI, click the right mouse button or 'd' key.
+* -> To reset the region selected, click the middle mouse button or 'r' key.
+
+* Step 2:
+* -> In the destination image, select the point where you want to place the ROI in the image by left clicking mouse button.
+* -> To get the cloned result, click the right mouse button or 'c' key.
+* -> To quit the program, use 'q' key.
+*
+* Result: The cloned image will be displayed.
+*/
+
+#include <signal.h>
+#include "opencv2/photo.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/core.hpp"
+#include <iostream>
+#include <stdlib.h>
+
+using namespace std;
+using namespace cv;
+
+Mat img0, img1, img2, res, res1, final, final1, blend;
+
+Point point;
+int drag = 0;
+int destx, desty;
+
+int numpts = 100;
+Point* pts = new Point[100];
+Point* pts2 = new Point[100];
+Point* pts_diff = new Point[100];
+
+int var = 0;
+int flag = 0, flag1 = 0, flag4 = 0;
+
+int minx, miny, maxx, maxy, lenx, leny;
+int minxd, minyd, maxxd, maxyd, lenxd, lenyd;
+
+int channel, num, kernel_size;
+
+float alpha,beta;
+
+float red, green, blue;
+
+double low_t, high_t;
+
+void source(int, int, int, int, void*);
+void destination(int, int, int, int, void*);
+void checkfile(char*);
+
+void source(int event, int x, int y, int, void*)
+{
+
+    if (event == EVENT_LBUTTONDOWN && !drag)
+    {
+        if(flag1 == 0)
+        {
+            if(var==0)
+                img1 = img0.clone();
+            point = Point(x, y);
+            circle(img1,point,2,Scalar(0, 0, 255),-1, 8, 0);
+            pts[var] = point;
+            var++;
+            drag  = 1;
+            if(var>1)
+                line(img1,pts[var-2], point, Scalar(0, 0, 255), 2, 8, 0);
+
+            imshow("Source", img1);
+        }
+    }
+
+    if (event == EVENT_LBUTTONUP && drag)
+    {
+        imshow("Source", img1);
+
+        drag = 0;
+    }
+    if (event == EVENT_RBUTTONDOWN)
+    {
+        flag1 = 1;
+        img1 = img0.clone();
+        for(int i = var; i < numpts ; i++)
+            pts[i] = point;
+
+        if(var!=0)
+        {
+            const Point* pts3[1] = {&pts[0]};
+            polylines( img1, pts3, &numpts,1, 1, Scalar(0,0,0), 2, 8, 0);
+        }
+
+        for(int i=0;i<var;i++)
+        {
+            minx = min(minx,pts[i].x);
+            maxx = max(maxx,pts[i].x);
+            miny = min(miny,pts[i].y);
+            maxy = max(maxy,pts[i].y);
+        }
+        lenx = maxx - minx;
+        leny = maxy - miny;
+
+        int mid_pointx = minx + lenx/2;
+        int mid_pointy = miny + leny/2;
+
+        for(int i=0;i<var;i++)
+        {
+            pts_diff[i].x = pts[i].x - mid_pointx;
+            pts_diff[i].y = pts[i].y - mid_pointy;
+        }
+
+        imshow("Source", img1);
+    }
+
+    if (event == EVENT_RBUTTONUP)
+    {
+        flag = var;
+
+        final = Mat::zeros(img0.size(),CV_8UC3);
+        res1 = Mat::zeros(img0.size(),CV_8UC1);
+        const Point* pts4[1] = {&pts[0]};
+
+        fillPoly(res1, pts4,&numpts, 1, Scalar(255, 255, 255), 8, 0);
+        bitwise_and(img0, img0, final,res1);
+
+        imshow("Source", img1);
+
+        if(num == 4)
+        {
+            colorChange(img0,res1,blend,red,green,blue);
+            imshow("Color Change Image", blend);
+            waitKey(0);
+
+        }
+        else if(num == 5)
+        {
+            illuminationChange(img0,res1,blend,alpha,beta);
+            imshow("Illum Change Image", blend);
+            waitKey(0);
+        }
+        else if(num == 6)
+        {
+            textureFlattening(img0,res1,blend,low_t,high_t,kernel_size);
+            imshow("Texture Flattened", blend);
+            waitKey(0);
+        }
+
+    }
+    if (event == EVENT_MBUTTONDOWN)
+    {
+        for(int i = 0; i < numpts ; i++)
+        {
+            pts[i].x=0;
+            pts[i].y=0;
+        }
+        var = 0;
+        flag1 = 0;
+        minx = INT_MAX; miny = INT_MAX; maxx = INT_MIN; maxy = INT_MIN;
+        imshow("Source", img0);
+        if(num == 1 || num == 2 || num == 3)
+            imshow("Destination",img2);
+        drag = 0;
+    }
+}
+
+void destination(int event, int x, int y, int, void*)
+{
+
+    Mat im1;
+    minxd = INT_MAX; minyd = INT_MAX; maxxd = INT_MIN; maxyd = INT_MIN;
+    im1 = img2.clone();
+    if (event == EVENT_LBUTTONDOWN)
+    {
+        flag4 = 1;
+        if(flag1 == 1)
+        {
+            point = Point(x, y);
+
+            for(int i=0;i<var;i++)
+            {
+                pts2[i].x = point.x + pts_diff[i].x;
+                pts2[i].y = point.y + pts_diff[i].y;
+            }
+
+            for(int i=var;i<numpts;i++)
+            {
+                pts2[i].x = point.x + pts_diff[0].x;
+                pts2[i].y = point.y + pts_diff[0].y;
+            }
+
+            const Point* pts5[1] = {&pts2[0]};
+            polylines( im1, pts5, &numpts,1, 1, Scalar(0,0,255), 2, 8, 0);
+
+            destx = x;
+            desty = y;
+
+            imshow("Destination", im1);
+        }
+    }
+    if (event == EVENT_RBUTTONUP)
+    {
+        for(int i=0;i<flag;i++)
+        {
+            minxd = min(minxd,pts2[i].x);
+            maxxd = max(maxxd,pts2[i].x);
+            minyd = min(minyd,pts2[i].y);
+            maxyd = max(maxyd,pts2[i].y);
+        }
+
+        if(maxxd > im1.size().width || maxyd > im1.size().height || minxd < 0 || minyd < 0)
+        {
+            cout << "Index out of range" << endl;
+            exit(0);
+        }
+
+        final1 = Mat::zeros(img2.size(),CV_8UC3);
+        res = Mat::zeros(img2.size(),CV_8UC1);
+        for(int i=miny, k=minyd;i<(miny+leny);i++,k++)
+            for(int j=minx,l=minxd ;j<(minx+lenx);j++,l++)
+            {
+                for(int c=0;c<channel;c++)
+                {
+                    final1.at<uchar>(k,l*channel+c) = final.at<uchar>(i,j*channel+c);
+
+                }
+            }
+
+        const Point* pts6[1] = {&pts2[0]};
+        fillPoly(res, pts6, &numpts, 1, Scalar(255, 255, 255), 8, 0);
+
+        if(num == 1 || num == 2 || num == 3)
+        {
+            seamlessClone(img0,img2,res1,point,blend,num);
+            imshow("Cloned Image", blend);
+            imwrite("cloned.png",blend);
+            waitKey(0);
+        }
+
+        for(int i = 0; i < flag ; i++)
+        {
+            pts2[i].x=0;
+            pts2[i].y=0;
+        }
+
+        minxd = INT_MAX; minyd = INT_MAX; maxxd = INT_MIN; maxyd = INT_MIN;
+    }
+
+    im1.release();
+}
+
+int main()
+{
+    cout << endl;
+    cout << "Cloning Module" << endl;
+    cout << "---------------" << endl;
+    cout << "Step 1:" << endl;
+    cout << " -> In the source image, select the region of interest by left click mouse button. A Polygon ROI will be created by left clicking mouse button." << endl;
+    cout << " -> To set the Polygon ROI, click the right mouse button or use 'd' key" << endl;
+    cout << " -> To reset the region selected, click the middle mouse button or use 'r' key." << endl;
+
+    cout << "Step 2:" << endl;
+    cout << " -> In the destination image, select the point where you want to place the ROI in the image by left clicking mouse button." << endl;
+    cout << " -> To get the cloned result, click the right mouse button or use 'c' key." << endl;
+    cout << " -> To quit the program, use 'q' key." << endl;
+    cout << endl;
+    cout << "Options: " << endl;
+    cout << endl;
+    cout << "1) Normal Cloning " << endl;
+    cout << "2) Mixed Cloning " << endl;
+    cout << "3) Monochrome Transfer " << endl;
+    cout << "4) Local Color Change " << endl;
+    cout << "5) Local Illumination Change " << endl;
+    cout << "6) Texture Flattening " << endl;
+
+    cout << endl;
+
+    cout << "Press number 1-6 to choose from above techniques: ";
+    cin >> num;
+    cout << endl;
+
+    minx = INT_MAX; miny = INT_MAX; maxx = INT_MIN; maxy = INT_MIN;
+
+    minxd = INT_MAX; minyd = INT_MAX; maxxd = INT_MIN; maxyd = INT_MIN;
+
+    int flag3 = 0;
+
+    if(num == 1 || num == 2 || num == 3)
+    {
+
+        string src,dest;
+        cout << "Enter Source Image: ";
+        cin >> src;
+
+        cout << "Enter Destination Image: ";
+        cin >> dest;
+
+        img0 = imread(src);
+
+        img2 = imread(dest);
+
+        if(!img0.data)
+        {
+            cout << "Source Image does not exist" << endl;
+            exit(0);
+        }
+        if(!img2.data)
+        {
+            cout << "Destination Image does not exist" << endl;
+            exit(0);
+        }
+
+        channel = img0.channels();
+
+        res = Mat::zeros(img2.size(),CV_8UC1);
+        res1 = Mat::zeros(img0.size(),CV_8UC1);
+        final = Mat::zeros(img0.size(),CV_8UC3);
+        final1 = Mat::zeros(img2.size(),CV_8UC3);
+        //////////// source image ///////////////////
+
+        namedWindow("Source", 1);
+        setMouseCallback("Source", source, NULL);
+        imshow("Source", img0);
+
+        /////////// destination image ///////////////
+
+        namedWindow("Destination", 1);
+        setMouseCallback("Destination", destination, NULL);
+        imshow("Destination",img2);
+
+    }
+    else if(num == 4)
+    {
+        string src;
+        cout << "Enter Source Image: ";
+        cin >> src;
+
+        cout << "Enter RGB values: " << endl;
+        cout << "Red: ";
+        cin >> red;
+
+        cout << "Green: ";
+        cin >> green;
+
+        cout << "Blue: ";
+        cin >> blue;
+
+        img0 = imread(src);
+
+        if(!img0.data)
+        {
+            cout << "Source Image does not exist" << endl;
+            exit(0);
+        }
+
+        res1 = Mat::zeros(img0.size(),CV_8UC1);
+        final = Mat::zeros(img0.size(),CV_8UC3);
+
+        //////////// source image ///////////////////
+
+        namedWindow("Source", 1);
+        setMouseCallback("Source", source, NULL);
+        imshow("Source", img0);
+
+    }
+    else if(num == 5)
+    {
+        string src;
+        cout << "Enter Source Image: ";
+        cin >> src;
+
+        cout << "alpha: ";
+        cin >> alpha;
+
+        cout << "beta: ";
+        cin >> beta;
+
+        img0 = imread(src);
+
+        if(!img0.data)
+        {
+            cout << "Source Image does not exist" << endl;
+            exit(0);
+        }
+
+        res1 = Mat::zeros(img0.size(),CV_8UC1);
+        final = Mat::zeros(img0.size(),CV_8UC3);
+
+        //////////// source image ///////////////////
+
+        namedWindow("Source", 1);
+        setMouseCallback("Source", source, NULL);
+        imshow("Source", img0);
+
+    }
+    else if(num == 6)
+    {
+        string src;
+        cout << "Enter Source Image: ";
+        cin >> src;
+
+        cout << "low_threshold: ";
+        cin >> low_t;
+
+        cout << "high_threshold: ";
+        cin >> high_t;
+
+        cout << "kernel_size: ";
+        cin >> kernel_size;
+
+        img0 = imread(src);
+
+        if(!img0.data)
+        {
+            cout << "Source Image does not exist" << endl;
+            exit(0);
+        }
+
+        res1 = Mat::zeros(img0.size(),CV_8UC1);
+        final = Mat::zeros(img0.size(),CV_8UC3);
+
+        //////////// source image ///////////////////
+
+        namedWindow("Source", 1);
+        setMouseCallback("Source", source, NULL);
+        imshow("Source", img0);
+    }
+    else
+    {
+        cout << "Wrong Option Choosen" << endl;
+        exit(0);
+    }
+
+    for(;;)
+    {
+        char key = (char) waitKey(0);
+
+        if(key == 'd' && flag3 == 0)
+        {
+            flag1 = 1;
+            flag3 = 1;
+            img1 = img0.clone();
+            for(int i = var; i < numpts ; i++)
+                pts[i] = point;
+
+            if(var!=0)
+            {
+                const Point* pts3[1] = {&pts[0]};
+                polylines( img1, pts3, &numpts,1, 1, Scalar(0,0,0), 2, 8, 0);
+            }
+
+            for(int i=0;i<var;i++)
+            {
+                minx = min(minx,pts[i].x);
+                maxx = max(maxx,pts[i].x);
+                miny = min(miny,pts[i].y);
+                maxy = max(maxy,pts[i].y);
+            }
+            lenx = maxx - minx;
+            leny = maxy - miny;
+
+            int mid_pointx = minx + lenx/2;
+            int mid_pointy = miny + leny/2;
+
+            for(int i=0;i<var;i++)
+            {
+                pts_diff[i].x = pts[i].x - mid_pointx;
+                pts_diff[i].y = pts[i].y - mid_pointy;
+            }
+
+            flag = var;
+
+            final = Mat::zeros(img0.size(),CV_8UC3);
+            res1 = Mat::zeros(img0.size(),CV_8UC1);
+            const Point* pts4[1] = {&pts[0]};
+
+            fillPoly(res1, pts4,&numpts, 1, Scalar(255, 255, 255), 8, 0);
+            bitwise_and(img0, img0, final,res1);
+
+            imshow("Source", img1);
+        }
+        else if(key == 'r')
+        {
+            for(int i = 0; i < numpts ; i++)
+            {
+                pts[i].x=0;
+                pts[i].y=0;
+            }
+            var = 0;
+            flag1 = 0;
+            flag3 = 0;
+            flag4 = 0;
+            minx = INT_MAX; miny = INT_MAX; maxx = INT_MIN; maxy = INT_MIN;
+            imshow("Source", img0);
+            if(num == 1 || num == 2 || num == 3)
+                imshow("Destination",img2);
+            drag = 0;
+        }
+        else if ((num == 1 || num == 2 || num == 3) && key == 'c' && flag1 == 1 && flag4 == 1)
+        {
+            seamlessClone(img0,img2,res1,point,blend,num);
+            imshow("Cloned Image", blend);
+            imwrite("cloned.png",blend);
+        }
+        else if (num == 4 && key == 'c' && flag1 == 1)
+        {
+            colorChange(img0,res1,blend,red,green,blue);
+            imshow("Color Change Image", blend);
+            imwrite("cloned.png",blend);
+        }
+        else if (num == 5 && key == 'c' && flag1 == 1)
+        {
+            illuminationChange(img0,res1,blend,alpha,beta);
+            imshow("Illum Change Image", blend);
+            imwrite("cloned.png",blend);
+        }
+        else if (num == 6 && key == 'c' && flag1 == 1)
+        {
+            textureFlattening(img0,res1,blend,low_t,high_t,kernel_size);
+            imshow("Texture Flattened", blend);
+            imwrite("cloned.png",blend);
+        }
+        else if(key == 'q')
+            exit(0);
+    }
+    waitKey(0);
+}