diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
index dc876b744b..25a324a00f 100644
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -201,6 +201,15 @@ CV_EXPORTS void alphaComp(InputArray img1, InputArray img2, OutputArray dst, int
  */
 CV_EXPORTS void calcHist(InputArray src, OutputArray hist, Stream& stream = Stream::Null());
 
+/** @brief Calculates a histogram for a one-channel 8-bit image, confined to the given mask.
+
+@param src Source image with CV_8UC1 type.
+@param hist Destination histogram with one row, 256 columns, and the CV_32SC1 type.
+@param mask A mask image of the same size as src and of type CV_8UC1.
+@param stream Stream for the asynchronous version.
+ */
+CV_EXPORTS void calcHist(InputArray src, InputArray mask, OutputArray hist, Stream& stream = Stream::Null());
+
 /** @brief Equalizes the histogram of a grayscale image.
 
 @param src Source image with CV_8UC1 type.
diff --git a/modules/cudaimgproc/src/cuda/hist.cu b/modules/cudaimgproc/src/cuda/hist.cu
index ba9290c190..be13091f12 100644
--- a/modules/cudaimgproc/src/cuda/hist.cu
+++ b/modules/cudaimgproc/src/cuda/hist.cu
@@ -105,6 +105,72 @@ namespace hist
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
+
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    {
+        __shared__ int shist[256];
+
+        const int y = blockIdx.x * blockDim.y + threadIdx.y;
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        shist[tid] = 0;
+        __syncthreads();
+
+        if (y < rows)
+        {
+            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
+            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+
+            const int cols_4 = cols / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
+            {
+                unsigned int data = rowPtr[x];
+                unsigned int m = maskRowPtr[x];
+
+                if ((m >> 0) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+
+                if ((m >> 8) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+
+                if ((m >> 16) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
+
+                if ((m >> 24) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
+            }
+
+            if (cols % 4 != 0 && threadIdx.x == 0)
+            {
+                for (int x = cols_4 * 4; x < cols; ++x)
+                {
+                    unsigned int data = ((const uchar*)rowPtr)[x];
+                    unsigned int m = ((const uchar*)maskRowPtr)[x];
+
+                    if (m)
+                        Emulation::smem::atomicAdd(&shist[data], 1);
+                }
+            }
+        }
+
+        __syncthreads();
+
+        const int histVal = shist[tid];
+        if (histVal > 0)
+            ::atomicAdd(hist + tid, histVal);
+    }
+
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(src.rows, block.y));
+
+        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
 
 /////////////////////////////////////////////////////////////////////////
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index 59aa83343a..fce5057590 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -69,20 +69,32 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no
 namespace hist
 {
     void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
 }
 
 void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
+{
+    calcHist(_src, cv::cuda::GpuMat(), _hist, stream);
+}
+
+void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, Stream& stream)
 {
     GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
     CV_Assert( src.type() == CV_8UC1 );
+    CV_Assert( mask.empty() || mask.type() == CV_8UC1 );
+    CV_Assert( mask.empty() || mask.size() == src.size() );
 
     _hist.create(1, 256, CV_32SC1);
    GpuMat hist = _hist.getGpuMat();
 
     hist.setTo(Scalar::all(0), stream);
 
-    hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
+    if (mask.empty())
+        hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
+    else
+        hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
diff --git a/modules/cudaimgproc/test/test_histogram.cpp b/modules/cudaimgproc/test/test_histogram.cpp
index 3d3217375e..7fcde85206 100644
--- a/modules/cudaimgproc/test/test_histogram.cpp
+++ b/modules/cudaimgproc/test/test_histogram.cpp
@@ -136,6 +136,49 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHist, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES));
 
+PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
+{
+    cv::cuda::DeviceInfo devInfo;
+
+    cv::Size size;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+CUDA_TEST_P(CalcHistWithMask, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_8UC1);
+    cv::Mat mask = randomMat(size, CV_8UC1);
+    cv::Mat(mask, cv::Rect(0, 0, size.width / 2, size.height / 2)).setTo(0);
+
+    cv::cuda::GpuMat hist;
+    cv::cuda::calcHist(loadMat(src), loadMat(mask), hist);
+
+    cv::Mat hist_gold;
+
+    const int hbins = 256;
+    const float hranges[] = {0.0f, 256.0f};
+    const int histSize[] = {hbins};
+    const float* ranges[] = {hranges};
+    const int channels[] = {0};
+
+    cv::calcHist(&src, 1, channels, mask, hist_gold, 1, histSize, ranges);
+    hist_gold = hist_gold.reshape(1, 1);
+    hist_gold.convertTo(hist_gold, CV_32S);
+
+    EXPECT_MAT_NEAR(hist_gold, hist, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHistWithMask, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES));
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // EqualizeHist
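
A minimal usage sketch of the new overload (not part of the patch; it assumes the patch is applied, OpenCV is built with CUDA, and "input.png" is a placeholder path). Pixels whose mask byte is zero are excluded from the counts, matching the convention cv::calcHist follows on the CPU:

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudaimgproc.hpp>

int main()
{
    // Load a grayscale (CV_8UC1) image; the overload asserts on other types.
    cv::Mat img = cv::imread("input.png", cv::IMREAD_GRAYSCALE);

    // Build a mask that selects only the left half of the image.
    cv::Mat roi = cv::Mat::zeros(img.size(), CV_8UC1);
    roi(cv::Rect(0, 0, img.cols / 2, img.rows)).setTo(255);

    // Upload, compute the masked histogram (1x256, CV_32SC1), download.
    cv::cuda::GpuMat d_img(img), d_mask(roi), d_hist;
    cv::cuda::calcHist(d_img, d_mask, d_hist);

    cv::Mat hist;
    d_hist.download(hist);
    return 0;
}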