Merge pull request #3475 from cudawarped:cuda_fix_unaligned_hist

Modified histogram kernels to work with non-aligned data
Alexander Smorkalov, committed by GitHub 2 years ago
commit 4f66f8677b
  1. modules/cudaimgproc/src/cuda/hist.cu (140 changed lines)
  2. modules/cudaimgproc/src/histogram.cpp (16 changed lines)
  3. modules/cudaimgproc/test/test_histogram.cpp (76 changed lines)

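The problem being fixed: when the source GpuMat is a ROI of a larger image, the first byte of each row is generally not 4-byte aligned, so the old kernels' cast of the row pointer to const unsigned int* performed misaligned 32-bit loads. The reworked kernels take the ROI's x-offset, count the unaligned head bytes one at a time, walk the aligned middle of each row as 32-bit words, and finish the tail byte-wise. Below is a minimal standalone sketch of that pattern, not the PR code: offsetX is assumed to be the row start's byte offset from a 4-byte-aligned parent row, countByte and rowHistogram are hypothetical names, and a plain global atomic stands in for the shared-memory histogram the real kernels use.

// Sketch only: head/body/tail split for one image row whose start may be unaligned.
__device__ void countByte(int* hist, unsigned char v)
{
    atomicAdd(&hist[v], 1);
}

__global__ void rowHistogram(const unsigned char* row, int cols, int* hist, int offsetX)
{
    // Bytes to handle one at a time before the pointer becomes 4-byte aligned.
    const int head = (offsetX % 4 == 0) ? 0 : 4 - (offsetX % 4);

    if (threadIdx.x == 0)
        for (int x = 0; x < min(head, cols); ++x)
            countByte(hist, row[x]);

    // Aligned body: each thread reads whole 32-bit words (coalesced).
    const unsigned int* row4 = reinterpret_cast<const unsigned int*>(row + head);
    const int cols4 = (cols > head) ? (cols - head) / 4 : 0;
    for (int x = threadIdx.x; x < cols4; x += blockDim.x)
    {
        const unsigned int data = row4[x];
        countByte(hist, (data >>  0) & 0xFFu);
        countByte(hist, (data >>  8) & 0xFFu);
        countByte(hist, (data >> 16) & 0xFFu);
        countByte(hist, (data >> 24) & 0xFFu);
    }

    // Scalar tail: whatever remains after the last full 32-bit word.
    if (threadIdx.x == 0)
        for (int x = head + cols4 * 4; x < cols; ++x)
            countByte(hist, row[x]);
}

In the PR itself the kernels are additionally templated on fourByteAligned, so the already-aligned case dispatches to a specialization that skips the head/tail handling, and the per-block counts are accumulated in a shared-memory histogram before being flushed to the global one.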
@@ -52,38 +52,41 @@ using namespace cv::cuda::device;
namespace hist
{
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
template<bool fourByteAligned>
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
if (y < rows) {
const uchar* rowPtr = &src[y * step];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++)
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
Emulation::smem::atomicAdd(&shist[data], 1);
}
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++)
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
}
@@ -94,38 +97,50 @@ namespace hist
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
if(offsetX)
histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
else
histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
template<bool fourByteAligned>
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
const uchar* rowPtr = &src[y * srcStep];
const uchar* maskRowPtr = &mask[y * maskStep];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++) {
if (maskRowPtr[x])
Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
}
}
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
unsigned int m = maskRowPtr[x];
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * srcStep]);
const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
const unsigned int m = maskRowPtrIntAligned[x];
if ((m >> 0) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
@@ -140,15 +155,12 @@ namespace hist
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
unsigned int m = ((const uchar*)maskRowPtr)[x];
if (m)
Emulation::smem::atomicAdd(&shist[data], 1);
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++) {
if (maskRowPtr[x])
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
}
}
@@ -160,12 +172,15 @@ namespace hist
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
if(offsetX)
histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
else
histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
@@ -186,42 +201,44 @@ namespace hist
}
}
__global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
template<bool fourByteAligned>
__global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
const int lowerLevel, const int upperLevel, const int offsetX)
{
extern __shared__ int shist[];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
if (tid < binCount)
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const uchar* rowPtr = src + y * step;
const uint* rowPtr4 = (uint*) rowPtr;
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
const uint data = rowPtr4[x];
const uchar* rowPtr = &src[y * step];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++)
histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
}
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
const uchar data = rowPtr[x];
histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
}
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++)
histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
}
}
@@ -236,7 +253,7 @@ namespace hist
}
}
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +262,10 @@ namespace hist
const size_t smem_size = binCount * sizeof(int);
histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
if(offsetX)
histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
else
histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)

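On the host side the kernels now need to know where the ROI starts inside its parent allocation. In the histogram.cpp changes below this comes from GpuMat::locateROI, whose x component is forwarded as offsetX. A small illustration of why that offset exists at all, with hypothetical names and not taken from the PR:

// Sketch: a column ROI of a GpuMat shares the parent's buffer, so its rows
// start offsetX bytes past the (aligned) parent rows. locateROI recovers that offset.
#include <opencv2/core/cuda.hpp>

void roiOffsetExample()
{
    cv::cuda::GpuMat whole(32, 129, CV_8UC1);                // parent rows are pitch-aligned
    cv::cuda::GpuMat roi = whole(cv::Rect(1, 0, 128, 32));   // each row now starts 1 byte in

    cv::Size wholeSize;
    cv::Point ofs;
    roi.locateROI(wholeSize, ofs);   // wholeSize == {129, 32}, ofs == {1, 0}
    // ofs.x is the value calcHist()/histEven() pass down as offsetX.
}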
@@ -68,8 +68,8 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no
namespace hist
{
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream);
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream);
}
void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
@@ -91,10 +91,12 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St
hist.setTo(Scalar::all(0), stream);
Point ofs; Size wholeSize;
src.locateROI(wholeSize, ofs);
if (mask.empty())
hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
hist::histogram256(src, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
else
hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
hist::histogram256(src, mask, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
@@ -494,16 +496,18 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
namespace hist
{
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream);
}
namespace
{
void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
{
Point ofs; Size wholeSize;
src.locateROI(wholeSize, ofs);
hist.create(1, histSize, CV_32S);
cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, ofs.x, stream);
}
}

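The new test parameters below are chosen so that every code path is hit: small widths (1 to 4) with every possible x-offset stress the byte-wise head and tail handling, width 129 mixes byte-wise and 32-bit word reads, and width 128 at offset 0 stays on the fully aligned path. As a worked example, Size(129, 32) with roiOffsetX = 1 gives a 128-column ROI; with alignedOffset = 4 - 1 = 3 the kernel counts a 3-byte head, then (128 - 3) / 4 = 31 aligned words (124 bytes), then a 1-byte tail, and 3 + 124 + 1 = 128 covers the whole row.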
@@ -49,15 +49,40 @@ namespace opencv_test { namespace {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// HistEven
PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
typedef tuple<Size, int> hist_size_to_roi_offset_params_t;
const hist_size_to_roi_offset_params_t hist_size_to_roi_offset_params[] =
{
// uchar reads only
hist_size_to_roi_offset_params_t(Size(1,32), 0),
hist_size_to_roi_offset_params_t(Size(2,32), 0),
hist_size_to_roi_offset_params_t(Size(2,32), 1),
hist_size_to_roi_offset_params_t(Size(3,32), 0),
hist_size_to_roi_offset_params_t(Size(3,32), 1),
hist_size_to_roi_offset_params_t(Size(3,32), 2),
hist_size_to_roi_offset_params_t(Size(4,32), 0),
hist_size_to_roi_offset_params_t(Size(4,32), 1),
hist_size_to_roi_offset_params_t(Size(4,32), 2),
hist_size_to_roi_offset_params_t(Size(4,32), 3),
// uchar and int reads
hist_size_to_roi_offset_params_t(Size(129,32), 0),
hist_size_to_roi_offset_params_t(Size(129,32), 1),
hist_size_to_roi_offset_params_t(Size(129,32), 2),
hist_size_to_roi_offset_params_t(Size(129,32), 3),
// int reads only
hist_size_to_roi_offset_params_t(Size(128,32), 0)
};
PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -66,19 +91,21 @@ PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(HistEven, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
int hbins = 30;
float hranges[] = {50.0f, 200.0f};
cv::cuda::GpuMat hist;
cv::cuda::histEven(loadMat(src), hist, hbins, (int) hranges[0], (int) hranges[1]);
cv::cuda::GpuMat srcDevice = loadMat(src);
cv::cuda::histEven(srcDevice(roi), hist, hbins, (int)hranges[0], (int)hranges[1]);
cv::Mat hist_gold;
int histSize[] = {hbins};
const float* ranges[] = {hranges};
int channels[] = {0};
cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.t();
hist_gold.convertTo(hist_gold, CV_32S);
@@ -87,22 +114,24 @@ CUDA_TEST_P(HistEven, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HistEven, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// CalcHist
PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -111,9 +140,10 @@ PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(CalcHist, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
cv::cuda::GpuMat hist;
cv::cuda::calcHist(loadMat(src), hist);
GpuMat srcDevice = loadMat(src);
cv::cuda::calcHist(srcDevice(roi), hist);
cv::Mat hist_gold;
@@ -123,7 +153,8 @@ CUDA_TEST_P(CalcHist, Accuracy)
const float* ranges[] = {hranges};
const int channels[] = {0};
cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
const Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.reshape(1, 1);
hist_gold.convertTo(hist_gold, CV_32S);
@@ -131,19 +162,21 @@ CUDA_TEST_P(CalcHist, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHist, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -152,11 +185,14 @@ PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(CalcHistWithMask, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
cv::Mat mask = randomMat(size, CV_8UC1);
cv::Mat(mask, cv::Rect(0, 0, size.width / 2, size.height / 2)).setTo(0);
cv::cuda::GpuMat hist;
cv::cuda::calcHist(loadMat(src), loadMat(mask), hist);
GpuMat srcDevice = loadMat(src);
GpuMat maskDevice = loadMat(mask);
cv::cuda::calcHist(srcDevice(roi), maskDevice(roi), hist);
cv::Mat hist_gold;
@@ -166,7 +202,8 @@ CUDA_TEST_P(CalcHistWithMask, Accuracy)
const float* ranges[] = {hranges};
const int channels[] = {0};
cv::calcHist(&src, 1, channels, mask, hist_gold, 1, histSize, ranges);
const Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, mask(roi), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.reshape(1, 1);
hist_gold.convertTo(hist_gold, CV_32S);
@@ -174,8 +211,7 @@ CUDA_TEST_P(CalcHistWithMask, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHistWithMask, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// EqualizeHist
