diff --git a/modules/cudacodec/CMakeLists.txt b/modules/cudacodec/CMakeLists.txt
index fd6ed2d1ab..1dd4c92e5c 100644
--- a/modules/cudacodec/CMakeLists.txt
+++ b/modules/cudacodec/CMakeLists.txt
@@ -4,9 +4,9 @@ endif()
 
 set(the_description "CUDA-accelerated Video Encoding/Decoding")
 
-ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
+ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef)
 
-ocv_add_module(cudacodec opencv_highgui)
+ocv_add_module(cudacodec opencv_highgui OPTIONAL opencv_cudev)
 
 ocv_module_include_directories()
 ocv_glob_module_sources()
diff --git a/modules/cudacodec/src/cuda/nv12_to_rgb.cu b/modules/cudacodec/src/cuda/nv12_to_rgb.cu
index a6a8c77ea8..f45a314c60 100644
--- a/modules/cudacodec/src/cuda/nv12_to_rgb.cu
+++ b/modules/cudacodec/src/cuda/nv12_to_rgb.cu
@@ -47,13 +47,26 @@
  * source and converts to output in ARGB format
  */
 
-#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/opencv_modules.hpp"
 
-namespace cv { namespace cuda { namespace device
+#ifndef HAVE_OPENCV_CUDEV
+
+#error "opencv_cudev is required"
+
+#else
+
+#include "opencv2/cudev/common.hpp"
+
+using namespace cv;
+using namespace cv::cudev;
+
+void videoDecPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height);
+
+namespace
 {
     __constant__ float constHueColorSpaceMat[9] = {1.1644f, 0.0f, 1.596f, 1.1644f, -0.3918f, -0.813f, 1.1644f, 2.0172f, 0.0f};
 
-    __device__ void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
+    __device__ static void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
     {
         float luma, chromaCb, chromaCr;
 
@@ -76,7 +89,7 @@ namespace cv { namespace cuda { namespace device
                 (chromaCr * constHueColorSpaceMat[8]);
     }
 
-    __device__ uint RGBA_pack_10bit(float red, float green, float blue, uint alpha)
+    __device__ static uint RGBA_pack_10bit(float red, float green, float blue, uint alpha)
     {
         uint ARGBpixel = 0;
 
@@ -99,9 +112,9 @@ namespace cv { namespace cuda { namespace device
     #define COLOR_COMPONENT_BIT_SIZE 10
     #define COLOR_COMPONENT_MASK     0x3FF
 
-    __global__ void NV12_to_RGB(uchar* srcImage, size_t nSourcePitch,
-                                uint* dstImage, size_t nDestPitch,
-                                uint width, uint height)
+    __global__ void NV12_to_RGB(const uchar* srcImage, size_t nSourcePitch,
+                                  uint* dstImage, size_t nDestPitch,
+                                  uint width, uint height)
     {
         // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
         const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
@@ -171,18 +184,24 @@ namespace cv { namespace cuda { namespace device
         dstImage[y * dstImagePitch + x     ] = RGBA_pack_10bit(red[0], green[0], blue[0], ((uint)0xff << 24));
         dstImage[y * dstImagePitch + x + 1 ] = RGBA_pack_10bit(red[1], green[1], blue[1], ((uint)0xff << 24));
     }
+}
 
-    void NV12_to_RGB(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream)
-    {
-        dim3 block(32, 8);
-        dim3 grid(divUp(interopFrame.cols, 2 * block.x), divUp(interopFrame.rows, block.y));
+void videoDecPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height)
+{
+    // Final Stage: NV12toARGB color space conversion
 
-        NV12_to_RGB<<<grid, block, 0, stream>>>(decodedFrame.data, decodedFrame.step, interopFrame.data, interopFrame.step,
-            interopFrame.cols, interopFrame.rows);
+    _outFrame.create(height, width, CV_8UC4);
+    GpuMat outFrame = _outFrame.getGpuMat();
 
-        cudaSafeCall( cudaGetLastError() );
+    dim3 block(32, 8);
+    dim3 grid(divUp(width, 2 * block.x), divUp(height, block.y));
 
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}}}
+    NV12_to_RGB<<<grid, block>>>(decodedFrame.ptr<uchar>(), decodedFrame.step,
+                                 outFrame.ptr<uint>(), outFrame.step,
+                                 width, height);
+
+    CV_CUDEV_SAFE_CALL( cudaGetLastError() );
+    CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+}
+
+#endif
diff --git a/modules/cudacodec/src/cuda/rgb_to_yv12.cu b/modules/cudacodec/src/cuda/rgb_to_yv12.cu
index 18a85f5621..ed0e0df9ba 100644
--- a/modules/cudacodec/src/cuda/rgb_to_yv12.cu
+++ b/modules/cudacodec/src/cuda/rgb_to_yv12.cu
@@ -40,10 +40,21 @@
 //
 //M*/
 
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/opencv_modules.hpp"
 
-namespace cv { namespace cuda { namespace device
+#ifndef HAVE_OPENCV_CUDEV
+
+#error "opencv_cudev is required"
+
+#else
+
+#include "opencv2/cudev/ptr2d/glob.hpp"
+
+using namespace cv::cudev;
+
+void RGB_to_YV12(const GpuMat& src, GpuMat& dst);
+
+namespace
 {
     __device__ __forceinline__ void rgb_to_y(const uchar b, const uchar g, const uchar r, uchar& y)
     {
@@ -57,7 +68,7 @@ namespace cv { namespace cuda { namespace device
         v = static_cast<uchar>(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100);
     }
 
-    __global__ void Gray_to_YV12(const PtrStepSzb src, PtrStepb dst)
+    __global__ void Gray_to_YV12(const GlobPtrSz<uchar> src, GlobPtr<uchar> dst)
     {
         const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
         const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
@@ -67,9 +78,9 @@ namespace cv { namespace cuda { namespace device
 
         // get pointers to the data
         const size_t planeSize = src.rows * dst.step;
-        PtrStepb y_plane(dst.data, dst.step);
-        PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
-        PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
+        GlobPtr<uchar> y_plane = globPtr(dst.data, dst.step);
+        GlobPtr<uchar> u_plane = globPtr(y_plane.data + planeSize, dst.step / 2);
+        GlobPtr<uchar> v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2);
 
         uchar pix;
         uchar y_val, u_val, v_val;
@@ -94,7 +105,7 @@ namespace cv { namespace cuda { namespace device
     }
 
     template <typename T>
-    __global__ void RGB_to_YV12(const PtrStepSz<T> src, PtrStepb dst)
+    __global__ void RGB_to_YV12(const GlobPtrSz<T> src, GlobPtr<uchar> dst)
     {
         const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
         const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
@@ -104,9 +115,9 @@ namespace cv { namespace cuda { namespace device
 
         // get pointers to the data
         const size_t planeSize = src.rows * dst.step;
-        PtrStepb y_plane(dst.data, dst.step);
-        PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
-        PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
+        GlobPtr<uchar> y_plane = globPtr(dst.data, dst.step);
+        GlobPtr<uchar> u_plane = globPtr(y_plane.data + planeSize, dst.step / 2);
+        GlobPtr<uchar> v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2);
 
         T pix;
         uchar y_val, u_val, v_val;
@@ -129,42 +140,28 @@ namespace cv { namespace cuda { namespace device
         u_plane(y / 2, x / 2) = u_val;
         v_plane(y / 2, x / 2) = v_val;
     }
+}
 
-    void Gray_to_YV12_caller(const PtrStepSzb src, PtrStepb dst, cudaStream_t stream)
-    {
-        dim3 block(32, 8);
-        dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
-
-        Gray_to_YV12<<<grid, block, 0, stream>>>(src, dst);
-        cudaSafeCall( cudaGetLastError() );
+void RGB_to_YV12(const GpuMat& src, GpuMat& dst)
+{
+    const dim3 block(32, 8);
+    const dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
 
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    template <int cn>
-    void RGB_to_YV12_caller(const PtrStepSzb src, PtrStepb dst, cudaStream_t stream)
+    switch (src.channels())
     {
-        typedef typename TypeVec<uchar, cn>::vec_type src_t;
-
-        dim3 block(32, 8);
-        dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
-
-        RGB_to_YV12<<<grid, block, 0, stream>>>(static_cast< PtrStepSz<src_t> >(src), dst);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+    case 1:
+        Gray_to_YV12<<<grid, block>>>(globPtr<uchar>(src), globPtr<uchar>(dst));
+        break;
+    case 3:
+        RGB_to_YV12<<<grid, block>>>(globPtr<uchar3>(src), globPtr<uchar>(dst));
+        break;
+    case 4:
+        RGB_to_YV12<<<grid, block>>>(globPtr<uchar4>(src), globPtr<uchar>(dst));
+        break;
     }
 
-    void RGB_to_YV12(const PtrStepSzb src, int cn, PtrStepSzb dst, cudaStream_t stream)
-    {
-        typedef void (*func_t)(const PtrStepSzb src, PtrStepb dst, cudaStream_t stream);
-
-        static const func_t funcs[] =
-        {
-            0, Gray_to_YV12_caller, 0, RGB_to_YV12_caller<3>, RGB_to_YV12_caller<4>
-        };
+    CV_CUDEV_SAFE_CALL( cudaGetLastError() );
+    CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+}
 
-        funcs[cn](src, dst, stream);
-    }
-}}}
+#endif
diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp
index ede7ef43e5..98f0ba1d94 100644
--- a/modules/cudacodec/src/video_reader.cpp
+++ b/modules/cudacodec/src/video_reader.cpp
@@ -53,10 +53,7 @@ Ptr<VideoReader> cv::cudacodec::createVideoReader(const Ptr<RawVideoSource>&) {
 
 #else // HAVE_NVCUVID
 
-namespace cv { namespace cuda { namespace device
-{
-    void NV12_to_RGB(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream = 0);
-}}}
+void videoDecPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height);
 
 using namespace cv::cudacodec::detail;
 
@@ -125,18 +122,6 @@ namespace
         CUvideoctxlock m_lock;
     };
 
-    void cudaPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height)
-    {
-        using namespace cv::cuda::device;
-
-        // Final Stage: NV12toARGB color space conversion
-
-        _outFrame.create(height, width, CV_8UC4);
-        GpuMat outFrame = _outFrame.getGpuMat();
-
-        NV12_to_RGB(decodedFrame, outFrame);
-    }
-
     bool VideoReaderImpl::nextFrame(OutputArray frame)
     {
         if (videoSource_->hasError() || videoParser_->hasError())
@@ -195,7 +180,7 @@ namespace
 
             // perform post processing on the CUDA surface (performs colors space conversion and post processing)
             // comment this out if we inclue the line of code seen above
-            cudaPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight());
+            videoDecPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight());
 
             // unmap video frame
             // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
diff --git a/modules/cudacodec/src/video_writer.cpp b/modules/cudacodec/src/video_writer.cpp
index 78183fb9de..15f6732e3b 100644
--- a/modules/cudacodec/src/video_writer.cpp
+++ b/modules/cudacodec/src/video_writer.cpp
@@ -62,10 +62,7 @@ Ptr<VideoWriter> cv::cudacodec::createVideoWriter(const Ptr<EncoderCallBack>&, S
 
 #else // !defined HAVE_CUDA || !defined WIN32
 
-namespace cv { namespace cuda { namespace device
-{
-    void RGB_to_YV12(const PtrStepSzb src, int cn, PtrStepSzb dst, cudaStream_t stream = 0);
-}}}
+void RGB_to_YV12(const GpuMat& src, GpuMat& dst);
 
 ///////////////////////////////////////////////////////////////////////////
 // VideoWriterImpl
@@ -642,7 +639,7 @@ namespace
 
         if (inputFormat_ == SF_BGR)
         {
-            device::RGB_to_YV12(frame, frame.channels(), videoFrame_);
+            RGB_to_YV12(frame, videoFrame_);
         }
         else
         {
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
index 7304a8c7f5..7385926638 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
@@ -72,7 +72,7 @@ template <typename T> struct GlobPtrSz : GlobPtr<T>
 };
 
 template <typename T>
-__host__ GlobPtr<T> globPtr(T* data, size_t step)
+__host__ __device__ GlobPtr<T> globPtr(T* data, size_t step)
 {
     GlobPtr<T> p;
     p.data = data;
@@ -81,7 +81,7 @@ __host__ GlobPtr<T> globPtr(T* data, size_t step)
 }
 
 template <typename T>
-__host__ GlobPtrSz<T> globPtr(T* data, size_t step, int rows, int cols)
+__host__ __device__ GlobPtrSz<T> globPtr(T* data, size_t step, int rows, int cols)
 {
     GlobPtrSz<T> p;
     p.data = data;