diff --git a/modules/cudaarithm/src/cuda/minmax.cu b/modules/cudaarithm/src/cuda/minmax.cu
index c92de44ad3..eec861bdc4 100644
--- a/modules/cudaarithm/src/cuda/minmax.cu
+++ b/modules/cudaarithm/src/cuda/minmax.cu
@@ -40,208 +40,77 @@
 //
 //M*/
 
-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
 
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-#include "opencv2/core/cuda/utility.hpp"
+#ifndef HAVE_OPENCV_CUDEV
 
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#error "opencv_cudev is required"
 
-namespace minMax
-{
-    __device__ unsigned int blocks_finished = 0;
-
-    // To avoid shared bank conflicts we convert each value into value of
-    // appropriate type (32 bits minimum)
-    template <typename T> struct MinMaxTypeTraits;
-    template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
-    template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
-    template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
-    template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
-    template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
-    template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
-    template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
-
-    template <int BLOCK_SIZE, typename R>
-    struct GlobalReduce
-    {
-        static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval)
-        {
-        #if __CUDA_ARCH__ >= 200
-            if (tid == 0)
-            {
-                Emulation::glob::atomicMin(minval, mymin);
-                Emulation::glob::atomicMax(maxval, mymax);
-            }
-        #else
-            __shared__ bool is_last;
-
-            if (tid == 0)
-            {
-                minval[bid] = mymin;
-                maxval[bid] = mymax;
-
-                __threadfence();
-
-                unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
-                is_last = (ticket == gridDim.x * gridDim.y - 1);
-            }
-
-            __syncthreads();
-
-            if (is_last)
-            {
-                int idx = ::min(tid, gridDim.x * gridDim.y - 1);
-
-                mymin = minval[idx];
-                mymax = maxval[idx];
-
-                const minimum<R> minOp;
-                const maximum<R> maxOp;
-                device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
-
-                if (tid == 0)
-                {
-                    minval[0] = mymin;
-                    maxval[0] = mymax;
-
-                    blocks_finished = 0;
-                }
-            }
-        #endif
-        }
-    };
-
-    template <int BLOCK_SIZE, typename T, typename R, class Mask>
-    __global__ void kernel(const PtrStepSz<T> src, const Mask mask, R* minval, R* maxval, const int twidth, const int theight)
-    {
-        __shared__ R sminval[BLOCK_SIZE];
-        __shared__ R smaxval[BLOCK_SIZE];
-
-        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
-        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
-
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-        const int bid = blockIdx.y * gridDim.x + blockIdx.x;
-
-        R mymin = numeric_limits<R>::max();
-        R mymax = -numeric_limits<R>::max();
-
-        const minimum<R> minOp;
-        const maximum<R> maxOp;
-
-        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
-        {
-            const T* ptr = src.ptr(y);
+#else
 
-            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
-            {
-                if (mask(y, x))
-                {
-                    const R srcVal = ptr[x];
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
 
-                    mymin = minOp(mymin, srcVal);
-                    mymax = maxOp(mymax, srcVal);
-                }
-            }
-        }
+using namespace cv::cudev;
 
-        device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
-
-        GlobalReduce<BLOCK_SIZE, R>::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval);
-    }
-
-    const int threads_x = 32;
-    const int threads_y = 8;
-
-    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
+namespace
+{
+    template <typename T>
+    void minMaxImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal)
     {
-        block = dim3(threads_x, threads_y);
-
-        grid = dim3(divUp(cols, block.x * block.y),
-                    divUp(rows, block.y * block.x));
+        typedef typename SelectIf<
+                TypesEquals<T, double>::value,
+                double,
+                typename SelectIf<TypesEquals<T, float>::value, float, int>::type
+                >::type work_type;
 
-        grid.x = ::min(grid.x, block.x);
-        grid.y = ::min(grid.y, block.y);
-    }
+        GpuMat_<T> src(_src);
+        GpuMat_<work_type> buf(_buf);
 
-    void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
-    {
-        dim3 block, grid;
-        getLaunchCfg(cols, rows, block, grid);
+        if (mask.empty())
+            gridFindMinMaxVal(src, buf);
+        else
+            gridFindMinMaxVal(src, buf, globPtr<uchar>(mask));
 
-        bufcols = grid.x * grid.y * sizeof(double);
-        bufrows = 2;
-    }
+        work_type data[2];
+        buf.download(cv::Mat(1, 2, buf.type(), data));
 
-    __global__ void setDefaultKernel(int* minval_buf, int* maxval_buf)
-    {
-        *minval_buf = numeric_limits<int>::max();
-        *maxval_buf = numeric_limits<int>::min();
-    }
-    __global__ void setDefaultKernel(float* minval_buf, float* maxval_buf)
-    {
-        *minval_buf = numeric_limits<float>::max();
-        *maxval_buf = -numeric_limits<float>::max();
-    }
-    __global__ void setDefaultKernel(double* minval_buf, double* maxval_buf)
-    {
-        *minval_buf = numeric_limits<double>::max();
-        *maxval_buf = -numeric_limits<double>::max();
-    }
+        if (minVal)
+            *minVal = data[0];
 
-    template <typename R>
-    void setDefault(R* minval_buf, R* maxval_buf)
-    {
-        setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf);
+        if (maxVal)
+            *maxVal = data[1];
     }
+}
 
-    template <typename T>
-    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
+void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
+{
+    typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal);
+    static const func_t funcs[] =
     {
-        typedef typename MinMaxTypeTraits<T>::best_type R;
-
-        dim3 block, grid;
-        getLaunchCfg(src.cols, src.rows, block, grid);
-
-        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
-        const int theight = divUp(divUp(src.rows, grid.y), block.y);
-
-        R* minval_buf = (R*) buf.ptr(0);
-        R* maxval_buf = (R*) buf.ptr(1);
+        minMaxImpl<uchar>,
+        minMaxImpl<schar>,
+        minMaxImpl<ushort>,
+        minMaxImpl<short>,
+        minMaxImpl<int>,
+        minMaxImpl<float>,
+        minMaxImpl<double>
+    };
 
-        setDefault(minval_buf, maxval_buf);
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-        if (mask.data)
-            kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, twidth, theight);
-        else
-            kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, twidth, theight);
+    CV_Assert( src.channels() == 1 );
+    CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
 
-        cudaSafeCall( cudaGetLastError() );
+    const int depth = src.depth();
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+    const int work_type = depth == CV_64F ? CV_64F : depth == CV_32F ? CV_32F : CV_32S;
+    ensureSizeIsEnough(1, 2, work_type, buf);
 
-        R minval_, maxval_;
-        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
-        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
-        *minval = minval_;
-        *maxval = maxval_;
-    }
+    const func_t func = funcs[src.depth()];
 
-    template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-    template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-    template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-    template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-    template void run<int   >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-    template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-    template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+    func(src, mask, buf, minVal, maxVal);
 }
 
-#endif // CUDA_DISABLER
+#endif
diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp
index d4e53b77f5..e3a8d6d7a1 100644
--- a/modules/cudaarithm/src/reductions.cpp
+++ b/modules/cudaarithm/src/reductions.cpp
@@ -186,53 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT
     return retVal;
 }
 
-////////////////////////////////////////////////////////////////////////
-// minMax
-
-namespace minMax
-{
-    void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
-
-    template <typename T>
-    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-}
-
-void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
-{
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
-
-    typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-    static const func_t funcs[] =
-    {
-        ::minMax::run<uchar>,
-        ::minMax::run<schar>,
-        ::minMax::run<ushort>,
-        ::minMax::run<short>,
-        ::minMax::run<int>,
-        ::minMax::run<float>,
-        ::minMax::run<double>
-    };
-
-    CV_Assert( src.channels() == 1 );
-    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::minMax::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-
-    const func_t func = funcs[src.depth()];
-
-    double temp1, temp2;
-    func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
-}
-
 ////////////////////////////////////////////////////////////////////////
 // minMaxLoc
 
diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp
index c220a94214..21a95eace2 100644
--- a/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp
@@ -440,30 +440,24 @@ namespace grid_reduce_detail
     __host__ void minVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
     {
         typedef typename PtrTraits<SrcPtr>::value_type src_type;
-        const int cn = VecTraits<src_type>::cn;
-        typedef typename MakeVec<ResType, cn>::type work_type;
 
-        glob_reduce<MinMaxReductor<minop<work_type>, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
+        glob_reduce<MinMaxReductor<minop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
     }
 
     template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
     __host__ void maxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
     {
         typedef typename PtrTraits<SrcPtr>::value_type src_type;
-        const int cn = VecTraits<src_type>::cn;
-        typedef typename MakeVec<ResType, cn>::type work_type;
 
-        glob_reduce<MinMaxReductor<maxop<work_type>, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
+        glob_reduce<MinMaxReductor<maxop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
     }
 
     template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
     __host__ void minMaxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
     {
         typedef typename PtrTraits<SrcPtr>::value_type src_type;
-        const int cn = VecTraits<src_type>::cn;
-        typedef typename MakeVec<ResType, cn>::type work_type;
 
-        glob_reduce<MinMaxReductor<both, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
+        glob_reduce<MinMaxReductor<both, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
     }
 }