diff --git a/doc/tutorials/core/interoperability_with_OpenCV_1/interoperability_with_OpenCV_1.markdown b/doc/tutorials/core/interoperability_with_OpenCV_1/interoperability_with_OpenCV_1.markdown
index 42fdb3483e..298c45b3b1 100644
--- a/doc/tutorials/core/interoperability_with_OpenCV_1/interoperability_with_OpenCV_1.markdown
+++ b/doc/tutorials/core/interoperability_with_OpenCV_1/interoperability_with_OpenCV_1.markdown
@@ -69,7 +69,7 @@ CvMat* mI        =  &I.operator CvMat();
 @endcode
 One of the biggest complaints of the C interface is that it leaves all the memory management to you.
 You need to figure out when it is safe to release your unused objects and make sure you do so before
-the program finishes or you could have troublesome memory leeks. To work around this issue in OpenCV
+the program finishes or you could have troublesome memory leaks. To work around this issue in OpenCV
 there is introduced a sort of smart pointer. This will automatically release the object when it's no
 longer in use. To use this declare the pointers as a specialization of the *Ptr* :
 @code{.cpp}
diff --git a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
index e9f446db36..968b3faaca 100644
--- a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
+++ b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
@@ -29,7 +29,7 @@ Execute in webcam:
 
 @code{.bash}
 
-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392
+$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --rgb
 
 @endcode
 
@@ -37,7 +37,7 @@ Execute with image or video file:
 
 @code{.bash}
 
-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --input=[PATH-TO-IMAGE-OR-VIDEO-FILE]
+$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --input=[PATH-TO-IMAGE-OR-VIDEO-FILE] --rgb
 
 @endcode
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 95c9bfb1fe..9dadab57ea 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -786,10 +786,14 @@ template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
 { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
 { return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
-{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
+{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ CV_UNUSED(b); return a; }
 
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index 3fff6651e3..8b76dd8487 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -589,7 +589,7 @@ inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
     return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
 }
 
-#define OPENCV_IMPL_VSX_ROTATE_64(_Tpvec, suffix, rg1, rg2)       \
+#define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2)   \
 template<int imm>                                                 \
 inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
 {                                                                 \
@@ -598,11 +598,13 @@ inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
     return imm ? b : a;                                           \
 }
 
-OPENCV_IMPL_VSX_ROTATE_64(v_int64x2,  right, a, b)
-OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)
+#define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec)    \
+OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left,  b, a)  \
+OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)
 
-OPENCV_IMPL_VSX_ROTATE_64(v_int64x2,  left, b, a)
-OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left, b, a)
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
 
 /* Extract */
 template<int s, typename _Tpvec>
@@ -716,26 +718,33 @@ inline int v_signmask(const v_uint64x2& a)
 inline int v_signmask(const v_float64x2& a)
 { return v_signmask(v_reinterpret_as_s64(a)); }
 
-
 template<typename _Tpvec>
 inline bool v_check_all(const _Tpvec& a)
-{ return vec_all_lt(a.val, _Tpvec().val);}
-inline bool v_check_all(const v_uint8x16 &a)
+{ return vec_all_lt(a.val, _Tpvec().val); }
+inline bool v_check_all(const v_uint8x16& a)
 { return v_check_all(v_reinterpret_as_s8(a)); }
-inline bool v_check_all(const v_uint16x8 &a)
+inline bool v_check_all(const v_uint16x8& a)
 { return v_check_all(v_reinterpret_as_s16(a)); }
-inline bool v_check_all(const v_uint32x4 &a)
+inline bool v_check_all(const v_uint32x4& a)
+{ return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_all(const v_float32x4& a)
 { return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_all(const v_float64x2& a)
+{ return v_check_all(v_reinterpret_as_s64(a)); }
 
 template<typename _Tpvec>
 inline bool v_check_any(const _Tpvec& a)
-{ return vec_any_lt(a.val, _Tpvec().val);}
-inline bool v_check_any(const v_uint8x16 &a)
+{ return vec_any_lt(a.val, _Tpvec().val); }
+inline bool v_check_any(const v_uint8x16& a)
 { return v_check_any(v_reinterpret_as_s8(a)); }
-inline bool v_check_any(const v_uint16x8 &a)
+inline bool v_check_any(const v_uint16x8& a)
 { return v_check_any(v_reinterpret_as_s16(a)); }
-inline bool v_check_any(const v_uint32x4 &a)
+inline bool v_check_any(const v_uint32x4& a)
+{ return v_check_any(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_float32x4& a)
 { return v_check_any(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_float64x2& a)
+{ return v_check_any(v_reinterpret_as_s64(a)); }
 
 ////////// Other math /////////
 
diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp
index e5d1a7e80a..75864ea822 100644
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -194,8 +194,8 @@ Matx<_Tp, n, m> Matx<_Tp, m, n>::inv(int method, bool *p_is_ok /*= NULL*/) const
 {
     Matx<_Tp, n, m> b;
     bool ok;
-    if( method == DECOMP_LU || method == DECOMP_CHOLESKY )
-        ok = cv::internal::Matx_FastInvOp<_Tp, m>()(*this, b, method);
+    if( m == n && (method == DECOMP_LU || method == DECOMP_CHOLESKY) )
+        ok = cv::internal::Matx_FastInvOp<_Tp, m>()(*reinterpret_cast<const Matx<_Tp, m, m>*>(this), reinterpret_cast<Matx<_Tp, m, m>&>(b), method);
     else
     {
         Mat A(*this, false), B(b, false);
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 7a21c9eb56..7579d9cf05 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -837,17 +837,28 @@ template<typename R> struct TheTest
         Data<R> resC = v_rotate_right<s>(a);
         Data<R> resD = v_rotate_right<s>(a, b);
 
+        Data<R> resE = v_rotate_left<s>(a);
+        Data<R> resF = v_rotate_left<s>(a, b);
+
         for (int i = 0; i < R::nlanes; ++i)
         {
             if (i + s >= R::nlanes)
             {
                 EXPECT_EQ((LaneType)0, resC[i]);
                 EXPECT_EQ(dataB[i - R::nlanes + s], resD[i]);
+
+                EXPECT_EQ((LaneType)0, resE[i - R::nlanes + s]);
+                EXPECT_EQ(dataB[i], resF[i - R::nlanes + s]);
             }
             else
+            {
                 EXPECT_EQ(dataA[i + s], resC[i]);
-        }
+                EXPECT_EQ(dataA[i + s], resD[i]);
 
+                EXPECT_EQ(dataA[i], resE[i + s]);
+                EXPECT_EQ(dataA[i], resF[i + s]);
+            }
+        }
         return *this;
     }
 
diff --git a/modules/cudafilters/src/cuda/column_filter.hpp b/modules/cudafilters/src/cuda/column_filter.hpp
index 7dc339ca91..e93fc836fa 100644
--- a/modules/cudafilters/src/cuda/column_filter.hpp
+++ b/modules/cudafilters/src/cuda/column_filter.hpp
@@ -52,10 +52,8 @@ namespace column_filter
 {
     #define MAX_KERNEL_SIZE 32
 
-    __constant__ float c_kernel[MAX_KERNEL_SIZE];
-
     template <int KSIZE, typename T, typename D, typename B>
-    __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
+    __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
     {
         #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
             const int BLOCK_DIM_X = 16;
@@ -135,7 +133,7 @@ namespace column_filter
 
                 #pragma unroll
                 for (int k = 0; k < KSIZE; ++k)
-                    sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
+                    sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * kernel[k];
 
                 dst(y, x) = saturate_cast<D>(sum);
             }
@@ -143,7 +141,7 @@ namespace column_filter
     }
 
     template <int KSIZE, typename T, typename D, template<typename> class B>
-    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
+    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
     {
         int BLOCK_DIM_X;
         int BLOCK_DIM_Y;
@@ -167,7 +165,7 @@ namespace column_filter
 
         B<T> brd(src.rows);
 
-        linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+        linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);
 
         cudaSafeCall( cudaGetLastError() );
 
@@ -181,7 +179,7 @@ namespace filter
     template <typename T, typename D>
     void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
     {
-        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
+        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);
 
         static const caller_t callers[5][33] =
         {
@@ -362,11 +360,6 @@ namespace filter
             }
         };
 
-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
     }
 }
diff --git a/modules/cudafilters/src/cuda/row_filter.hpp b/modules/cudafilters/src/cuda/row_filter.hpp
index 80eab59b44..4a4be36f9a 100644
--- a/modules/cudafilters/src/cuda/row_filter.hpp
+++ b/modules/cudafilters/src/cuda/row_filter.hpp
@@ -52,10 +52,8 @@ namespace row_filter
 {
     #define MAX_KERNEL_SIZE 32
 
-    __constant__ float c_kernel[MAX_KERNEL_SIZE];
-
     template <int KSIZE, typename T, typename D, typename B>
-    __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
+    __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
     {
         #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
             const int BLOCK_DIM_X = 32;
@@ -135,7 +133,7 @@ namespace row_filter
 
                 #pragma unroll
                 for (int k = 0; k < KSIZE; ++k)
-                    sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
+                    sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * kernel[k];
 
                 dst(y, x) = saturate_cast<D>(sum);
             }
@@ -143,7 +141,7 @@ namespace row_filter
     }
 
     template <int KSIZE, typename T, typename D, template<typename> class B>
-    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
+    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
     {
         int BLOCK_DIM_X;
         int BLOCK_DIM_Y;
@@ -167,7 +165,7 @@ namespace row_filter
 
         B<T> brd(src.cols);
 
-        linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+        linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
@@ -180,7 +178,7 @@ namespace filter
     template <typename T, typename D>
     void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
     {
-        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
+        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);
 
         static const caller_t callers[5][33] =
         {
@@ -361,11 +359,6 @@ namespace filter
             }
         };
 
-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
     }
 }
diff --git a/modules/cudaimgproc/src/canny.cpp b/modules/cudaimgproc/src/canny.cpp
index 75e53cf3ec..8c3fd4a2b5 100644
--- a/modules/cudaimgproc/src/canny.cpp
+++ b/modules/cudaimgproc/src/canny.cpp
@@ -58,9 +58,9 @@ namespace canny
 
     void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream);
 
-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream);
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream);
 
-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream);
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream);
 
     void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream);
 }
@@ -127,6 +127,8 @@ namespace
         Ptr<Filter> filterDX_, filterDY_;
 #endif
         int old_apperture_size_;
+
+        int *d_counter;
     };
 
     void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream)
@@ -218,12 +220,17 @@ namespace
 
     void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream)
     {
-        map_.setTo(Scalar::all(0));
+        map_.setTo(Scalar::all(0), stream);
+
         canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_), StreamAccessor::getStream(stream));
 
-        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), StreamAccessor::getStream(stream));
+        cudaSafeCall( cudaMalloc(&d_counter, sizeof(int)) );
+
+        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), d_counter, StreamAccessor::getStream(stream));
+
+        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), d_counter, StreamAccessor::getStream(stream));
 
-        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), StreamAccessor::getStream(stream));
+        cudaSafeCall( cudaFree(d_counter) );
 
         canny::getEdges(map_, edges, StreamAccessor::getStream(stream));
     }
diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu
index e0ba515693..4418b8e5eb 100644
--- a/modules/cudaimgproc/src/cuda/canny.cu
+++ b/modules/cudaimgproc/src/cuda/canny.cu
@@ -47,6 +47,7 @@
 #include "opencv2/core/cuda/transform.hpp"
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/utility.hpp"
+#include "opencv2/core/cuda.hpp"
 
 using namespace cv::cuda;
 using namespace cv::cuda::device;
@@ -102,6 +103,20 @@ namespace canny
         }
     };
 
+    struct SrcTexObject
+    {
+        int xoff;
+        int yoff;
+        cudaTextureObject_t tex_src_object;
+        __host__ SrcTexObject(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : xoff(_xoff), yoff(_yoff), tex_src_object(_tex_src_object) { }
+
+        __device__ __forceinline__ int operator ()(int y, int x) const
+        {
+            return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
+        }
+
+    };
+
     template <class Norm> __global__
     void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
     {
@@ -120,29 +135,94 @@ namespace canny
         mag(y, x) = norm(dxVal, dyVal);
     }
 
+    template <class Norm> __global__
+    void calcMagnitudeKernel(const SrcTexObject src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (y >= mag.rows || x >= mag.cols)
+            return;
+
+        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
+        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
+
+        dx(y, x) = dxVal;
+        dy(y, x) = dyVal;
+
+        mag(y, x) = norm(dxVal, dyVal);
+    }
+
     void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
 
-        bindTexture(&tex_src, srcWhole);
-        SrcTex src(xoff, yoff);
+        bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
 
-        if (L2Grad)
+        if (cc30)
         {
-            L2 norm;
-            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            cudaResourceDesc resDesc;
+            memset(&resDesc, 0, sizeof(resDesc));
+            resDesc.resType = cudaResourceTypePitch2D;
+            resDesc.res.pitch2D.devPtr = srcWhole.ptr();
+            resDesc.res.pitch2D.height = srcWhole.rows;
+            resDesc.res.pitch2D.width = srcWhole.cols;
+            resDesc.res.pitch2D.pitchInBytes = srcWhole.step;
+            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar>();
+
+            cudaTextureDesc texDesc;
+            memset(&texDesc, 0, sizeof(texDesc));
+            texDesc.addressMode[0] = cudaAddressModeClamp;
+            texDesc.addressMode[1] = cudaAddressModeClamp;
+            texDesc.addressMode[2] = cudaAddressModeClamp;
+
+            cudaTextureObject_t tex = 0;
+            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+
+            SrcTexObject src(xoff, yoff, tex);
+
+            if (L2Grad)
+            {
+                L2 norm;
+                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            }
+            else
+            {
+                L1 norm;
+                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            }
+
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
+            else
+                cudaSafeCall( cudaStreamSynchronize(stream) );
+
+            cudaSafeCall( cudaDestroyTextureObject(tex) );
         }
         else
         {
-            L1 norm;
-            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-        }
+            bindTexture(&tex_src, srcWhole);
+            SrcTex src(xoff, yoff);
 
-        cudaSafeCall( cudaGetLastError() );
+            if (L2Grad)
+            {
+                L2 norm;
+                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            }
+            else
+            {
+                L1 norm;
+                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            }
 
-        if (stream == NULL)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
     }
 
     void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
@@ -165,7 +245,6 @@ namespace canny
 namespace canny
 {
     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
-
     __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
     {
         const int CANNY_SHIFT = 15;
@@ -218,18 +297,103 @@ namespace canny
         map(y, x) = edge_type;
     }
 
+    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag)
+    {
+        const int CANNY_SHIFT = 15;
+        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
+            return;
+
+        int dxVal = dx(y, x);
+        int dyVal = dy(y, x);
+
+        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
+        const float m = tex2D<float>(tex_mag, x, y);
+
+        dxVal = ::abs(dxVal);
+        dyVal = ::abs(dyVal);
+
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
+
+        if (m > low_thresh)
+        {
+            const int tg22x = dxVal * TG22;
+            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
+
+            dyVal <<= CANNY_SHIFT;
+
+            if (dyVal < tg22x)
+            {
+                if (m > tex2D<float>(tex_mag, x - 1, y) && m >= tex2D<float>(tex_mag, x + 1, y))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else if(dyVal > tg67x)
+            {
+                if (m > tex2D<float>(tex_mag, x, y - 1) && m >= tex2D<float>(tex_mag, x, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else
+            {
+                if (m > tex2D<float>(tex_mag, x - s, y - 1) && m >= tex2D<float>(tex_mag, x + s, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+        }
+
+        map(y, x) = edge_type;
+    }
+
     void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream)
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
 
-        bindTexture(&tex_mag, mag);
+        if (deviceSupports(FEATURE_SET_COMPUTE_30))
+        {
+            // Use the texture object
+            cudaResourceDesc resDesc;
+            memset(&resDesc, 0, sizeof(resDesc));
+            resDesc.resType = cudaResourceTypePitch2D;
+            resDesc.res.pitch2D.devPtr = mag.ptr();
+            resDesc.res.pitch2D.height = mag.rows;
+            resDesc.res.pitch2D.width = mag.cols;
+            resDesc.res.pitch2D.pitchInBytes = mag.step;
+            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
+
+            cudaTextureDesc texDesc;
+            memset(&texDesc, 0, sizeof(texDesc));
+            texDesc.addressMode[0] = cudaAddressModeClamp;
+            texDesc.addressMode[1] = cudaAddressModeClamp;
+            texDesc.addressMode[2] = cudaAddressModeClamp;
+
+            cudaTextureObject_t tex=0;
+            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh, tex);
+            cudaSafeCall( cudaGetLastError() );
 
-        calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
-        cudaSafeCall( cudaGetLastError() );
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
+            else
+                cudaSafeCall( cudaStreamSynchronize(stream) );
 
-        if (stream == NULL)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaDestroyTextureObject(tex) );
+        }
+        else
+        {
+            // Use the texture reference
+            bindTexture(&tex_mag, mag);
+            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
     }
 }
 
@@ -237,14 +401,12 @@ namespace canny
 
 namespace canny
 {
-    __device__ int counter = 0;
-
     __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols)
     {
         return (y >= 0) && (y < rows) && (x >= 0) && (x < cols);
     }
 
-    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st)
+    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st, int* d_counter)
     {
         __shared__ volatile int smem[18][18];
 
@@ -325,22 +487,19 @@ namespace canny
 
         if (n > 0)
         {
-            const int ind =  ::atomicAdd(&counter, 1);
+            const int ind =  ::atomicAdd(d_counter, 1);
             st[ind] = make_short2(x, y);
         }
     }
 
-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream)
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream)
     {
-        void* counter_ptr;
-        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-
-        cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
+        cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) );
 
         const dim3 block(16, 16);
         const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
 
-        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1);
+        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1, d_counter);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == NULL)
@@ -355,7 +514,7 @@ namespace canny
     __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
     __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
 
-    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count)
+    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, const int count)
     {
         const int stack_size = 512;
 
@@ -429,7 +588,7 @@ namespace canny
         {
             if (threadIdx.x == 0)
             {
-                s_ind = ::atomicAdd(&counter, s_counter);
+                s_ind = ::atomicAdd(d_counter, s_counter);
 
                 if (s_ind + s_counter > map.cols * map.rows)
                     s_counter = 0;
@@ -444,29 +603,26 @@ namespace canny
         }
     }
 
-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream)
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream)
     {
-        void* counter_ptr;
-        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
-
         int count;
-        cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+        cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) );
         cudaSafeCall( cudaStreamSynchronize(stream) );
 
         while (count > 0)
         {
-            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
+            cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) );
 
             const dim3 block(128);
             const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
 
-            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, count);
+            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, d_counter, count);
             cudaSafeCall( cudaGetLastError() );
 
             if (stream == NULL)
                 cudaSafeCall( cudaDeviceSynchronize() );
 
-            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) );
             cudaSafeCall( cudaStreamSynchronize(stream) );
 
             count = min(count, map.cols * map.rows);
diff --git a/modules/cudaimgproc/test/test_canny.cpp b/modules/cudaimgproc/test/test_canny.cpp
index 2b1a8d57e0..a782a87b3b 100644
--- a/modules/cudaimgproc/test/test_canny.cpp
+++ b/modules/cudaimgproc/test/test_canny.cpp
@@ -92,9 +92,66 @@ CUDA_TEST_P(Canny, Accuracy)
     EXPECT_MAT_SIMILAR(edges_gold, edges, 2e-2);
 }
 
+class CannyAsyncParallelLoopBody : public cv::ParallelLoopBody
+{
+public:
+    CannyAsyncParallelLoopBody(const cv::cuda::GpuMat& d_img_, cv::cuda::GpuMat* edges_, double low_thresh_, double high_thresh_, int apperture_size_, bool useL2gradient_)
+        : d_img(d_img_), edges(edges_), low_thresh(low_thresh_), high_thresh(high_thresh_), apperture_size(apperture_size_), useL2gradient(useL2gradient_) {}
+    ~CannyAsyncParallelLoopBody() {};
+    void operator()(const cv::Range& r) const
+    {
+        for (int i = r.start; i < r.end; i++) {
+            cv::cuda::Stream stream;
+            cv::Ptr<cv::cuda::CannyEdgeDetector> canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient);
+            canny->detect(d_img, edges[i], stream);
+            stream.waitForCompletion();
+        }
+    }
+protected:
+    const cv::cuda::GpuMat& d_img;
+    cv::cuda::GpuMat* edges;
+    double low_thresh;
+    double high_thresh;
+    int apperture_size;
+    bool useL2gradient;
+};
+
+#define NUM_STREAMS 64
+
+CUDA_TEST_P(Canny, Async)
+{
+    if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_30))
+    {
+        throw SkipTestException("CUDA device doesn't support texture objects");
+    }
+    else
+    {
+        const cv::Mat img = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(img.empty());
+
+        const cv::cuda::GpuMat d_img_roi = loadMat(img, useRoi);
+
+        double low_thresh = 50.0;
+        double high_thresh = 100.0;
+
+        // Synchronous call
+        cv::Ptr<cv::cuda::CannyEdgeDetector> canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient);
+        cv::cuda::GpuMat edges_gold;
+        canny->detect(d_img_roi, edges_gold);
+
+        // Asynchronous call
+        cv::cuda::GpuMat edges[NUM_STREAMS];
+        cv::parallel_for_(cv::Range(0, NUM_STREAMS), CannyAsyncParallelLoopBody(d_img_roi, edges, low_thresh, high_thresh, apperture_size, useL2gradient));
+
+        // Compare the results of synchronous call and asynchronous call
+        for (int i = 0; i < NUM_STREAMS; i++)
+            EXPECT_MAT_NEAR(edges_gold, edges[i], 0.0);
+    }
+ }
+
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Canny, testing::Combine(
     ALL_DEVICES,
-    testing::Values(AppertureSize(3), AppertureSize(5)),
+    testing::Values(AppertureSize(3), AppertureSize(5), AppertureSize(7)),
     testing::Values(L2gradient(false), L2gradient(true)),
     WHOLE_SUBMAT));
 
diff --git a/modules/imgproc/src/color_hsv.cpp b/modules/imgproc/src/color_hsv.cpp
index d8074b4fa5..d5a41dfcec 100644
--- a/modules/imgproc/src/color_hsv.cpp
+++ b/modules/imgproc/src/color_hsv.cpp
@@ -84,15 +84,97 @@ struct RGB2HSV_f
     typedef float channel_type;
 
     RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
-    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}
+    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {
+        #if CV_SIMD128
+        hasSIMD = hasSIMD128();
+        #endif
+    }
+
+    #if CV_SIMD128
+    inline void process(v_float32x4& v_r, v_float32x4& v_g,
+                        v_float32x4& v_b, float hscale) const
+    {
+        v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b);
+        v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b);
+
+        v_float32x4 v_eps = v_setall_f32(FLT_EPSILON);
+        v_float32x4 v_diff = v_max_rgb - v_min_rgb;
+        v_float32x4 v_s = v_diff / (v_abs(v_max_rgb) + v_eps);
+
+        v_float32x4 v_r_eq_max = v_r == v_max_rgb;
+        v_float32x4 v_g_eq_max = v_g == v_max_rgb;
+        v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b,
+                          v_select(v_g_eq_max, v_b - v_r, v_r - v_g));
+        v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f),
+                            v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f)));
+        v_float32x4 v_rev_diff = v_setall_f32(60.0f) / (v_diff + v_eps);
+        v_r = v_muladd(v_h, v_rev_diff, v_res) * v_setall_f32(hscale);
+
+        v_g = v_s;
+        v_b = v_max_rgb;
+    }
+    #endif
 
     void operator()(const float* src, float* dst, int n) const
     {
-        int i, bidx = blueIdx, scn = srccn;
+        int i = 0, bidx = blueIdx, scn = srccn;
         float hscale = hrange*(1.f/360.f);
         n *= 3;
 
-        for( i = 0; i < n; i += 3, src += scn )
+        #if CV_SIMD128
+        if (hasSIMD)
+        {
+            if (scn == 3) {
+                if (bidx) {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_load_deinterleave(src, v_r, v_g, v_b);
+                        process(v_r, v_g, v_b, hscale);
+                        v_store_interleave(dst + i, v_r, v_g, v_b);
+                    }
+                } else {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_load_deinterleave(src, v_r, v_g, v_b);
+                        process(v_b, v_g, v_r, hscale);
+                        v_store_interleave(dst + i, v_b, v_g, v_r);
+                    }
+                }
+            } else { // scn == 4
+                if (bidx) {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_float32x4 v_a;
+                        v_load_deinterleave(src, v_r, v_g, v_b, v_a);
+                        process(v_r, v_g, v_b, hscale);
+                        v_store_interleave(dst + i, v_r, v_g, v_b);
+                    }
+                } else {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_float32x4 v_a;
+                        v_load_deinterleave(src, v_r, v_g, v_b, v_a);
+                        process(v_b, v_g, v_r, hscale);
+                        v_store_interleave(dst + i, v_b, v_g, v_r);
+                    }
+                }
+            }
+        }
+        #endif
+
+        for( ; i < n; i += 3, src += scn )
         {
             float b = src[bidx], g = src[1], r = src[bidx^2];
             float h, s, v;
@@ -125,6 +207,9 @@ struct RGB2HSV_f
 
     int srccn, blueIdx;
     float hrange;
+    #if CV_SIMD128
+    bool hasSIMD;
+    #endif
 };
 
 
@@ -855,167 +940,111 @@ struct HLS2RGB_f
 
     HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {
-        #if CV_SSE2
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+        #if CV_SIMD128
+        hasSIMD = hasSIMD128();
         #endif
     }
 
-    #if CV_SSE2
-    void process(__m128& v_h0, __m128& v_h1, __m128& v_l0,
-                 __m128& v_l1, __m128& v_s0, __m128& v_s1) const
+    #if CV_SIMD128
+    inline void process(v_float32x4& v_h, v_float32x4& v_l, v_float32x4& v_s) const
     {
-        __m128 v_lel0 = _mm_cmple_ps(v_l0, _mm_set1_ps(0.5f));
-        __m128 v_lel1 = _mm_cmple_ps(v_l1, _mm_set1_ps(0.5f));
-        __m128 v_p20 = _mm_andnot_ps(v_lel0, _mm_sub_ps(_mm_add_ps(v_l0, v_s0), _mm_mul_ps(v_l0, v_s0)));
-        __m128 v_p21 = _mm_andnot_ps(v_lel1, _mm_sub_ps(_mm_add_ps(v_l1, v_s1), _mm_mul_ps(v_l1, v_s1)));
-        v_p20 = _mm_or_ps(v_p20, _mm_and_ps(v_lel0, _mm_mul_ps(v_l0, _mm_add_ps(_mm_set1_ps(1.0f), v_s0))));
-        v_p21 = _mm_or_ps(v_p21, _mm_and_ps(v_lel1, _mm_mul_ps(v_l1, _mm_add_ps(_mm_set1_ps(1.0f), v_s1))));
-
-        __m128 v_p10 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l0), v_p20);
-        __m128 v_p11 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l1), v_p21);
-
-        v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
-        v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
-
-        __m128 v_pre_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h0));
-        __m128 v_pre_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h1));
-
-        v_h0 = _mm_sub_ps(v_h0, v_pre_sector0);
-        v_h1 = _mm_sub_ps(v_h1, v_pre_sector1);
-
-        __m128 v_p2_p10 = _mm_sub_ps(v_p20, v_p10);
-        __m128 v_p2_p11 = _mm_sub_ps(v_p21, v_p11);
-        __m128 v_tab20 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, _mm_sub_ps(_mm_set1_ps(1.0f), v_h0)));
-        __m128 v_tab21 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, _mm_sub_ps(_mm_set1_ps(1.0f), v_h1)));
-        __m128 v_tab30 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, v_h0));
-        __m128 v_tab31 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, v_h1));
-
-        __m128 v_sector0 = _mm_div_ps(v_pre_sector0, _mm_set1_ps(6.0f));
-        __m128 v_sector1 = _mm_div_ps(v_pre_sector1, _mm_set1_ps(6.0f));
-        v_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector0));
-        v_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector1));
-        v_sector0 = _mm_mul_ps(v_sector0, _mm_set1_ps(6.0f));
-        v_sector1 = _mm_mul_ps(v_sector1, _mm_set1_ps(6.0f));
-        v_sector0 = _mm_sub_ps(v_pre_sector0, v_sector0);
-        v_sector1 = _mm_sub_ps(v_pre_sector1, v_sector1);
-
-        v_h0 = _mm_and_ps(v_p10, _mm_cmplt_ps(v_sector0, _mm_set1_ps(2.0f)));
-        v_h1 = _mm_and_ps(v_p11, _mm_cmplt_ps(v_sector1, _mm_set1_ps(2.0f)));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_l0 = _mm_and_ps(v_tab30, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
-        v_l1 = _mm_and_ps(v_tab31, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p10, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p11, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_s0 = _mm_and_ps(v_p20, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
-        v_s1 = _mm_and_ps(v_p21, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
+        v_float32x4 v_one = v_setall_f32(1.0f);
+
+        v_float32x4 v_l_le_half = v_l <= v_setall_f32(0.5f);
+        v_float32x4 v_ls = v_l * v_s;
+        v_float32x4 v_elem0 = v_select(v_l_le_half, v_ls, v_s - v_ls);
+
+        v_float32x4 v_hs_raw = v_h * v_setall_f32(hscale);
+        v_float32x4 v_pre_hs = v_cvt_f32(v_trunc(v_hs_raw));
+        v_float32x4 v_hs = v_hs_raw - v_pre_hs;
+        v_float32x4 v_sector = v_pre_hs - v_setall_f32(6.0f) * v_cvt_f32(v_trunc(v_hs_raw * v_setall_f32(1.0f / 6.0f)));
+        v_float32x4 v_elem1 = v_hs + v_hs;
+
+        v_float32x4 v_tab0 = v_l + v_elem0;
+        v_float32x4 v_tab1 = v_l - v_elem0;
+        v_float32x4 v_tab2 = v_l + v_elem0 - v_elem0 * v_elem1;
+        v_float32x4 v_tab3 = v_l - v_elem0 + v_elem0 * v_elem1;
+
+        v_float32x4 v_two  = v_setall_f32(2.0f);
+        v_float32x4 v_four = v_setall_f32(4.0f);
+
+        v_h = v_select(v_sector <  v_two , v_tab1,
+              v_select(v_sector <= v_two , v_tab3,
+              v_select(v_sector <= v_four, v_tab0, v_tab2)));
+
+        v_l = v_select(v_sector <  v_one , v_tab3,
+              v_select(v_sector <= v_two , v_tab0,
+              v_select(v_sector <  v_four, v_tab2, v_tab1)));
+
+        v_s = v_select(v_sector <  v_one , v_tab0,
+              v_select(v_sector <  v_two , v_tab2,
+              v_select(v_sector <  v_four, v_tab1,
+              v_select(v_sector <= v_four, v_tab3, v_tab0))));
     }
     #endif
 
     void operator()(const float* src, float* dst, int n) const
     {
         int i = 0, bidx = blueIdx, dcn = dstcn;
-        float _hscale = hscale;
         float alpha = ColorChannel<float>::max();
         n *= 3;
 
-        #if CV_SSE2
-        if (haveSIMD)
+        #if CV_SIMD128
+        if (hasSIMD)
         {
-            for( ; i <= n - 24; i += 24, dst += dcn * 8 )
+            if (dcn == 3)
             {
-                __m128 v_h0 = _mm_loadu_ps(src + i +  0);
-                __m128 v_h1 = _mm_loadu_ps(src + i +  4);
-                __m128 v_l0 = _mm_loadu_ps(src + i +  8);
-                __m128 v_l1 = _mm_loadu_ps(src + i + 12);
-                __m128 v_s0 = _mm_loadu_ps(src + i + 16);
-                __m128 v_s1 = _mm_loadu_ps(src + i + 20);
-
-                _mm_deinterleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
-
-                process(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
-
-                if (dcn == 3)
+                if (bidx)
                 {
-                    if (bidx)
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                     {
-                        _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1);
-
-                        _mm_storeu_ps(dst +  0, v_s0);
-                        _mm_storeu_ps(dst +  4, v_s1);
-                        _mm_storeu_ps(dst +  8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_h0);
-                        _mm_storeu_ps(dst + 20, v_h1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_store_interleave(dst, v_s, v_l, v_h);
                     }
-                    else
+                } else {
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                     {
-                        _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
-
-                        _mm_storeu_ps(dst +  0, v_h0);
-                        _mm_storeu_ps(dst +  4, v_h1);
-                        _mm_storeu_ps(dst +  8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_s0);
-                        _mm_storeu_ps(dst + 20, v_s1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_store_interleave(dst, v_h, v_l, v_s);
                     }
                 }
-                else
+            } else { // dcn == 4
+                if (bidx)
                 {
-                    __m128 v_a0 = _mm_set1_ps(alpha);
-                    __m128 v_a1 = _mm_set1_ps(alpha);
-                    if (bidx)
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                     {
-                        _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1, v_a0, v_a1);
-
-                        _mm_storeu_ps(dst +  0, v_s0);
-                        _mm_storeu_ps(dst +  4, v_s1);
-                        _mm_storeu_ps(dst +  8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_h0);
-                        _mm_storeu_ps(dst + 20, v_h1);
-                        _mm_storeu_ps(dst + 24, v_a0);
-                        _mm_storeu_ps(dst + 28, v_a1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_float32x4 v_a = v_setall_f32(alpha);
+                        v_store_interleave(dst, v_s, v_l, v_h, v_a);
                     }
-                    else
+                } else {
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                     {
-                        _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1, v_a0, v_a1);
-
-                        _mm_storeu_ps(dst +  0, v_h0);
-                        _mm_storeu_ps(dst +  4, v_h1);
-                        _mm_storeu_ps(dst +  8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_s0);
-                        _mm_storeu_ps(dst + 20, v_s1);
-                        _mm_storeu_ps(dst + 24, v_a0);
-                        _mm_storeu_ps(dst + 28, v_a1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_float32x4 v_a = v_setall_f32(alpha);
+                        v_store_interleave(dst, v_h, v_l, v_s, v_a);
                     }
                 }
             }
         }
         #endif
+
         for( ; i < n; i += 3, dst += dcn )
         {
             float h = src[i], l = src[i+1], s = src[i+2];
@@ -1033,7 +1062,7 @@ struct HLS2RGB_f
                 float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                 float p1 = 2*l - p2;
 
-                h *= _hscale;
+                h *= hscale;
                 if( h < 0 )
                     do h += 6; while( h < 0 );
                 else if( h >= 6 )
@@ -1063,8 +1092,8 @@ struct HLS2RGB_f
 
     int dstcn, blueIdx;
     float hscale;
-    #if CV_SSE2
-    bool haveSIMD;
+    #if CV_SIMD128
+    bool hasSIMD;
     #endif
 };
 
diff --git a/modules/photo/test/test_hdr.cpp b/modules/photo/test/test_hdr.cpp
index 9e2aeaa94f..c4bf536278 100644
--- a/modules/photo/test/test_hdr.cpp
+++ b/modules/photo/test/test_hdr.cpp
@@ -213,7 +213,7 @@ TEST(Photo_MergeRobertson, regression)
     loadImage(test_path + "merge/robertson.hdr", expected);
     merge->process(images, result, times);
 
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__PPC64__)
     const float eps = 6.f;
 #else
     const float eps = 5.f;
diff --git a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
index 4c14340722..542f1e42fb 100644
--- a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
@@ -48,6 +48,7 @@
 #endif
 
 #include "opencv2/core.hpp"
+#include "opencv2/core/cuda.hpp"
 
 namespace cv {
 namespace detail {
diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index de600302a1..0317831d3a 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -870,7 +870,12 @@ bool CvCapture_FFMPEG::open( const char* _filename )
             int enc_width = enc->width;
             int enc_height = enc->height;
 
-            AVCodec *codec = avcodec_find_decoder(enc->codec_id);
+            AVCodec *codec;
+            if(av_dict_get(dict, "video_codec", NULL, 0) == NULL) {
+                codec = avcodec_find_decoder(enc->codec_id);
+            } else {
+                codec = avcodec_find_decoder_by_name(av_dict_get(dict, "video_codec", NULL, 0)->value);
+            }
             if (!codec ||
 #if LIBAVCODEC_VERSION_INT >= ((53<<16)+(8<<8)+0)
                 avcodec_open2(enc, codec, NULL)
diff --git a/modules/videoio/src/cap_qt.cpp b/modules/videoio/src/cap_qt.cpp
index a75ff2ac16..cb416e6849 100644
--- a/modules/videoio/src/cap_qt.cpp
+++ b/modules/videoio/src/cap_qt.cpp
@@ -634,11 +634,11 @@ static int icvOpenCamera_QT (CvCapture_QT_Cam * capture, const int index)
     OPENCV_ASSERT (capture,            "icvOpenCamera_QT", "'capture' is a NULL-pointer");
     OPENCV_ASSERT (index >=0, "icvOpenCamera_QT", "camera index is negative");
 
-    ComponentDescription	component_description;
-    Component				component = 0;
+    ComponentDescription    component_description;
+    Component               component = 0;
     int                     number_of_inputs = 0;
     Rect                    myRect;
-    ComponentResult			result = noErr;
+    ComponentResult         result = noErr;
 
 
     // travers all components and count video digitizer channels
diff --git a/samples/cpp/tutorial_code/dnn/custom_layers.cpp b/samples/cpp/tutorial_code/dnn/custom_layers.cpp
index 3fc9e61279..217e53659f 100644
--- a/samples/cpp/tutorial_code/dnn/custom_layers.cpp
+++ b/samples/cpp/tutorial_code/dnn/custom_layers.cpp
@@ -16,18 +16,18 @@ public:
     virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
                                  const int requiredOutputs,
                                  std::vector<std::vector<int> > &outputs,
-                                 std::vector<std::vector<int> > &internals) const;
+                                 std::vector<std::vector<int> > &internals) const CV_OVERRIDE;
     //! [MyLayer::getMemoryShapes]
 
     //! [MyLayer::forward]
-    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals);
+    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals) CV_OVERRIDE;
     //! [MyLayer::forward]
 
     //! [MyLayer::finalize]
-    virtual void finalize(const std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs);
+    virtual void finalize(const std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs) CV_OVERRIDE;
     //! [MyLayer::finalize]
 
-    virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals);
+    virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals) CV_OVERRIDE;
 };
 //! [A custom layer interface]
 
@@ -49,7 +49,7 @@ public:
     virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
                                  const int requiredOutputs,
                                  std::vector<std::vector<int> > &outputs,
-                                 std::vector<std::vector<int> > &internals) const
+                                 std::vector<std::vector<int> > &internals) const CV_OVERRIDE
     {
         CV_UNUSED(requiredOutputs); CV_UNUSED(internals);
         std::vector<int> outShape(4);
@@ -62,7 +62,7 @@ public:
     }
 
     // Implementation of this custom layer is based on https://github.com/cdmh/deeplab-public/blob/master/src/caffe/layers/interp_layer.cpp
-    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals)
+    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals) CV_OVERRIDE
     {
         CV_UNUSED(internals);
         cv::Mat& inp = *inputs[0];
@@ -105,7 +105,7 @@ public:
         }
     }
 
-    virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) {}
+    virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) CV_OVERRIDE {}
 
 private:
     int outWidth, outHeight;
@@ -132,7 +132,7 @@ public:
     virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
                                  const int requiredOutputs,
                                  std::vector<std::vector<int> > &outputs,
-                                 std::vector<std::vector<int> > &internals) const
+                                 std::vector<std::vector<int> > &internals) const CV_OVERRIDE
     {
         CV_UNUSED(requiredOutputs); CV_UNUSED(internals);
         std::vector<int> outShape(4);
@@ -146,7 +146,7 @@ public:
 
     // This implementation is based on a reference implementation from
     // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
-    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals)
+    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals) CV_OVERRIDE
     {
         CV_UNUSED(internals);
         cv::Mat& inp = *inputs[0];
@@ -187,7 +187,7 @@ public:
         }
     }
 
-    virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) {}
+    virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) CV_OVERRIDE {}
 
 private:
     static inline int offset(const cv::MatSize& size, int c, int x, int y, int b)
diff --git a/samples/dnn/object_detection.cpp b/samples/dnn/object_detection.cpp
index 5ff537bdbd..1298d7e39e 100644
--- a/samples/dnn/object_detection.cpp
+++ b/samples/dnn/object_detection.cpp
@@ -7,12 +7,13 @@
 
 const char* keys =
     "{ help  h     | | Print help message. }"
-    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
+    "{ device      |  0 | camera device number. }"
+    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
     "{ model m     | | Path to a binary file of model contains trained weights. "
                       "It could be a file with extensions .caffemodel (Caffe), "
-                      ".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet) }"
+                      ".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet).}"
     "{ config c    | | Path to a text file of model contains network configuration. "
-                      "It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet) }"
+                      "It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet).}"
     "{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
     "{ classes     | | Optional path to a text file with names of classes to label detected objects. }"
     "{ mean        | | Preprocess input image by subtracting mean values. Mean values should be in BGR order and delimited by spaces. }"
@@ -91,7 +92,7 @@ int main(int argc, char** argv)
     if (parser.has("input"))
         cap.open(parser.get<String>("input"));
     else
-        cap.open(0);
+        cap.open(parser.get<int>("device"));
 
     // Process frames.
     Mat frame, blob;
diff --git a/samples/dnn/openpose.cpp b/samples/dnn/openpose.cpp
index bc95c60023..da9315426a 100644
--- a/samples/dnn/openpose.cpp
+++ b/samples/dnn/openpose.cpp
@@ -61,12 +61,16 @@ int main(int argc, char **argv)
         "{ p proto          |       | (required) model configuration, e.g. hand/pose.prototxt }"
         "{ m model          |       | (required) model weights, e.g. hand/pose_iter_102000.caffemodel }"
         "{ i image          |       | (required) path to image file (containing a single person, or hand) }"
+        "{ width            |  368  | Preprocess input image by resizing to a specific width. }"
+        "{ height           |  368  | Preprocess input image by resizing to a specific height. }"
         "{ t threshold      |  0.1  | threshold or confidence value for the heatmap }"
     );
 
     String modelTxt = parser.get<string>("proto");
     String modelBin = parser.get<string>("model");
     String imageFile = parser.get<String>("image");
+    int W_in = parser.get<int>("width");
+    int H_in = parser.get<int>("height");
     float thresh = parser.get<float>("threshold");
     if (parser.get<bool>("help") || modelTxt.empty() || modelBin.empty() || imageFile.empty())
     {
@@ -75,10 +79,6 @@ int main(int argc, char **argv)
         return 0;
     }
 
-    // fixed input size for the pretrained network
-    int W_in = 368;
-    int H_in = 368;
-
     // read the network model
     Net net = readNetFromCaffe(modelTxt, modelBin);
 
diff --git a/samples/dnn/segmentation.cpp b/samples/dnn/segmentation.cpp
index a0eb15bc86..252140a275 100644
--- a/samples/dnn/segmentation.cpp
+++ b/samples/dnn/segmentation.cpp
@@ -7,12 +7,13 @@
 
 const char* keys =
     "{ help  h     | | Print help message. }"
-    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
+    "{ device      |  0 | camera device number. }"
+    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
     "{ model m     | | Path to a binary file of model contains trained weights. "
                       "It could be a file with extensions .caffemodel (Caffe), "
-                      ".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet) }"
+                      ".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet). }"
     "{ config c    | | Path to a text file of model contains network configuration. "
-                      "It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet) }"
+                      "It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet). }"
     "{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
     "{ classes     | | Optional path to a text file with names of classes. }"
     "{ colors      | | Optional path to a text file with colors for an every class. "
@@ -111,7 +112,7 @@ int main(int argc, char** argv)
     if (parser.has("input"))
         cap.open(parser.get<String>("input"));
     else
-        cap.open(0);
+        cap.open(parser.get<int>("device"));
     //! [Open a video file or an image file or a camera stream]
 
     // Process frames.