From 10e29b268389a51155f2685eb1e48726308b9b32 Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Wed, 19 Apr 2023 16:41:33 +0300
Subject: [PATCH] cuda: fix bug in histogram kernels when source memory is not
 aligned to 4 bytes

---
 modules/cudaimgproc/src/cuda/hist.cu        | 160 +++++++++++---
 modules/cudaimgproc/src/histogram.cpp       |  16 +-
 modules/cudaimgproc/test/test_histogram.cpp |  76 +++++++---
 3 files changed, 156 insertions(+), 96 deletions(-)

diff --git a/modules/cudaimgproc/src/cuda/hist.cu b/modules/cudaimgproc/src/cuda/hist.cu
index 6bc5f15e6..a6d0ce7e9 100644
--- a/modules/cudaimgproc/src/cuda/hist.cu
+++ b/modules/cudaimgproc/src/cuda/hist.cu
@@ -52,38 +52,41 @@ using namespace cv::cuda::device;

 namespace hist
 {
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];

         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();

-        if (y < rows)
-        {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
+        if (y < rows) {
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
+            }

-                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }

-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    Emulation::smem::atomicAdd(&shist[data], 1);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
             }
         }

@@ -94,61 +97,70 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }

-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
-
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }

-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];

         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();

         if (y < rows)
         {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
-            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+            const uchar* rowPtr = &src[y * srcStep];
+            const uchar* maskRowPtr = &mask[y * maskStep];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
+                }
+            }

-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-                unsigned int m = maskRowPtr[x];
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * srcStep]);
+            const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                const unsigned int m = maskRowPtrIntAligned[x];

-                if ((m >> 0) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                if ((m >> 0) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);

-                if ((m >> 8) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+                if ((m >> 8) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);

-                if ((m >> 16) & 0xFFU)
+                if ((m >> 16) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);

-                if ((m >> 24) & 0xFFU)
+                if ((m >> 24) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }

-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    unsigned int m = ((const uchar*)maskRowPtr)[x];
-
-                    if (m)
-                        Emulation::smem::atomicAdd(&shist[data], 1);
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
                 }
             }
         }

@@ -160,12 +172,15 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }

-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));

-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );

         if (stream == 0)
@@ -186,42 +201,44 @@ namespace hist
         }
     }

-    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
-                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    template<bool fourByteAligned>
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
+        const int lowerLevel, const int upperLevel, const int offsetX)
     {
         extern __shared__ int shist[];

         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         if (tid < binCount)
             shist[tid] = 0;
-
         __syncthreads();

         if (y < rows)
         {
-            const uchar* rowPtr = src + y * step;
-            const uint* rowPtr4 = (uint*) rowPtr;
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                const uint data = rowPtr4[x];
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
+            }

-                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
-                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
             }

-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    const uchar data = rowPtr[x];
-                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
             }
         }

@@ -236,7 +253,7 @@ namespace hist
         }
     }

-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +262,10 @@ namespace hist

         const size_t smem_size = binCount * sizeof(int);

-        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        if(offsetX)
+            histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
+        else
+            histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
         cudaSafeCall( cudaGetLastError() );

         if (stream == 0)
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index c252abc45..177bf75b1 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -68,8 +68,8 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no

 namespace hist
 {
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream);
 }

 void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
@@ -91,10 +91,12 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St

     hist.setTo(Scalar::all(0), stream);

+    Point ofs; Size wholeSize;
+    src.locateROI(wholeSize, ofs);
     if (mask.empty())
-        hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
+        hist::histogram256(src, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
     else
-        hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
+        hist::histogram256(src, mask, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
 }

 ////////////////////////////////////////////////////////////////////////
@@ -494,16 +496,18 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int

 namespace hist
 {
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream);
 }

 namespace
 {
     void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
     {
+        Point ofs; Size wholeSize;
+        src.locateROI(wholeSize, ofs);
         hist.create(1, histSize, CV_32S);

         cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );

-        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
+        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, ofs.x, stream);
     }
 }

diff --git a/modules/cudaimgproc/test/test_histogram.cpp b/modules/cudaimgproc/test/test_histogram.cpp
index a92eefde8..0be400ab6 100644
--- a/modules/cudaimgproc/test/test_histogram.cpp
+++ b/modules/cudaimgproc/test/test_histogram.cpp
@@ -49,15 +49,40 @@ namespace opencv_test { namespace {
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HistEven

-PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
+typedef tuple<Size, int> hist_size_to_roi_offset_params_t;
+const hist_size_to_roi_offset_params_t hist_size_to_roi_offset_params[] =
+{
+    // uchar reads only
+    hist_size_to_roi_offset_params_t(Size(1,32), 0),
+    hist_size_to_roi_offset_params_t(Size(2,32), 0),
+    hist_size_to_roi_offset_params_t(Size(2,32), 1),
+    hist_size_to_roi_offset_params_t(Size(3,32), 0),
+    hist_size_to_roi_offset_params_t(Size(3,32), 1),
+    hist_size_to_roi_offset_params_t(Size(3,32), 2),
+    hist_size_to_roi_offset_params_t(Size(4,32), 0),
+    hist_size_to_roi_offset_params_t(Size(4,32), 1),
+    hist_size_to_roi_offset_params_t(Size(4,32), 2),
+    hist_size_to_roi_offset_params_t(Size(4,32), 3),
+    // uchar and int reads
+    hist_size_to_roi_offset_params_t(Size(129,32), 0),
+    hist_size_to_roi_offset_params_t(Size(129,32), 1),
+    hist_size_to_roi_offset_params_t(Size(129,32), 2),
+    hist_size_to_roi_offset_params_t(Size(129,32), 3),
+    // int reads only
+    hist_size_to_roi_offset_params_t(Size(128,32), 0)
+};
+
+PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
+    int roiOffsetX;

     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
+        size = get<0>(GET_PARAM(1));
+        roiOffsetX = get<1>(GET_PARAM(1));

         cv::cuda::setDevice(devInfo.deviceID());
     }
 };

@@ -66,19 +91,21 @@ PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)

 CUDA_TEST_P(HistEven, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_8UC1);
-
+    const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
     int hbins = 30;
     float hranges[] = {50.0f, 200.0f};

     cv::cuda::GpuMat hist;
-    cv::cuda::histEven(loadMat(src), hist, hbins, (int) hranges[0], (int) hranges[1]);
+    cv::cuda::GpuMat srcDevice = loadMat(src);
+    cv::cuda::histEven(srcDevice(roi), hist, hbins, (int)hranges[0], (int)hranges[1]);

     cv::Mat hist_gold;

     int histSize[] = {hbins};
     const float* ranges[] = {hranges};
     int channels[] = {0};

-    cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
+    Mat srcRoi = src(roi);
+    cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);

     hist_gold = hist_gold.t();
     hist_gold.convertTo(hist_gold, CV_32S);

@@ -87,22 +114,24 @@ CUDA_TEST_P(HistEven, Accuracy)
 }

 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HistEven, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES));
+    ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));

 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // CalcHist

-PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
+PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
+    int roiOffsetX;
+
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
+        size = get<0>(GET_PARAM(1));
+        roiOffsetX = get<1>(GET_PARAM(1));

         cv::cuda::setDevice(devInfo.deviceID());
     }
 };

@@ -111,9 +140,10 @@ PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)

 CUDA_TEST_P(CalcHist, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_8UC1);
-
+    const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
     cv::cuda::GpuMat hist;
-    cv::cuda::calcHist(loadMat(src), hist);
+    GpuMat srcDevice = loadMat(src);
+    cv::cuda::calcHist(srcDevice(roi), hist);

     cv::Mat hist_gold;

@@ -123,7 +153,8 @@
     const float* ranges[] = {hranges};
     const int channels[] = {0};

-    cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
+    const Mat srcRoi = src(roi);
+    cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);

     hist_gold = hist_gold.reshape(1, 1);
     hist_gold.convertTo(hist_gold, CV_32S);

@@ -131,19 +162,21 @@
 }

 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHist, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES));
+    ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));

-PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
+PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
+    int roiOffsetX;
+
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
+        size = get<0>(GET_PARAM(1));
+        roiOffsetX = get<1>(GET_PARAM(1));

         cv::cuda::setDevice(devInfo.deviceID());
     }
 };

@@ -152,11 +185,14 @@ PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)

 CUDA_TEST_P(CalcHistWithMask, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_8UC1);
+    const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
     cv::Mat mask = randomMat(size, CV_8UC1);
     cv::Mat(mask, cv::Rect(0, 0, size.width / 2, size.height / 2)).setTo(0);

     cv::cuda::GpuMat hist;
-    cv::cuda::calcHist(loadMat(src), loadMat(mask), hist);
+    GpuMat srcDevice = loadMat(src);
+    GpuMat maskDevice = loadMat(mask);
+    cv::cuda::calcHist(srcDevice(roi), maskDevice(roi), hist);

     cv::Mat hist_gold;

     const int hbins = 256;
     const float hranges[] = {0.0f, 256.0f};

     const int histSize[] = {hbins};
     const float* ranges[] = {hranges};
     const int channels[] = {0};

-    cv::calcHist(&src, 1, channels, mask, hist_gold, 1, histSize, ranges);
+    const Mat srcRoi = src(roi);
+    cv::calcHist(&srcRoi, 1, channels, mask(roi), hist_gold, 1, histSize, ranges);

     hist_gold = hist_gold.reshape(1, 1);
     hist_gold.convertTo(hist_gold, CV_32S);

@@ -174,8 +211,7 @@
 }

 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHistWithMask, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES));
+    ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));

 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // EqualizeHist
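Illustrative usage, outside the patch: a minimal sketch of the case this fix targets, assuming an OpenCV build with the cudaimgproc module. A GpuMat view whose x offset is not a multiple of 4 gives every device row pointer an unaligned start; with this change the kernels handle the unaligned head and tail of each row with byte loads and keep the coalesced 4-byte loads for the aligned middle.

    #include <opencv2/core.hpp>
    #include <opencv2/cudaimgproc.hpp>

    int main()
    {
        // Host image uploaded to the device.
        cv::Mat src(32, 129, CV_8UC1);
        cv::randu(src, cv::Scalar(0), cv::Scalar(256));
        cv::cuda::GpuMat d_src;
        d_src.upload(src);

        // ROI starting at x = 1: each row pointer of the view is offset by one
        // byte from the parent allocation, i.e. not 4-byte aligned.
        cv::cuda::GpuMat d_roi = d_src(cv::Rect(1, 0, src.cols - 1, src.rows));

        cv::cuda::GpuMat d_hist;
        cv::cuda::calcHist(d_roi, d_hist);              // 256-bin histogram
        cv::cuda::histEven(d_roi, d_hist, 30, 50, 200); // even-bin variant
        return 0;
    }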