From 10e29b268389a51155f2685eb1e48726308b9b32 Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Wed, 19 Apr 2023 16:41:33 +0300
Subject: [PATCH] cuda: fix bug in histogram kernels when source memory is not
 aligned to 4 bytes

---
 modules/cudaimgproc/src/cuda/hist.cu        | 160 +++++++++++---
 modules/cudaimgproc/src/histogram.cpp       |  16 +-
 modules/cudaimgproc/test/test_histogram.cpp |  76 +++++++---
 3 files changed, 156 insertions(+), 96 deletions(-)

diff --git a/modules/cudaimgproc/src/cuda/hist.cu b/modules/cudaimgproc/src/cuda/hist.cu
index 6bc5f15e6..a6d0ce7e9 100644
--- a/modules/cudaimgproc/src/cuda/hist.cu
+++ b/modules/cudaimgproc/src/cuda/hist.cu
@@ -52,38 +52,41 @@ using namespace cv::cuda::device;

 namespace hist
 {
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];

         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();

-        if (y < rows)
-        {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
+        if (y < rows) {
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
+            }

-                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }

-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    Emulation::smem::atomicAdd(&shist[data], 1);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
             }
         }

@@ -94,61 +97,70 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }

-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
-
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }

-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];

         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();

         if (y < rows)
         {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
-            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+            const uchar* rowPtr = &src[y * srcStep];
+            const uchar* maskRowPtr = &mask[y * maskStep];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
+                }
+            }

-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-                unsigned int m = maskRowPtr[x];
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * srcStep]);
+            const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                const unsigned int m = maskRowPtrIntAligned[x];

-                if ((m >> 0) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                if ((m >> 0) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);

-                if ((m >> 8) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+                if ((m >> 8) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);

-                if ((m >> 16) & 0xFFU)
+                if ((m >> 16) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);

-                if ((m >> 24) & 0xFFU)
+                if ((m >> 24) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }

-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    unsigned int m = ((const uchar*)maskRowPtr)[x];
-
-                    if (m)
-                        Emulation::smem::atomicAdd(&shist[data], 1);
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
                 }
             }
         }

@@ -160,12 +172,15 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }

-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));

-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );

         if (stream == 0)
@@ -186,42 +201,44 @@ namespace hist
         }
     }

-    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
-                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    template<bool fourByteAligned>
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
+        const int lowerLevel, const int upperLevel, const int offsetX)
     {
         extern __shared__ int shist[];

         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         if (tid < binCount)
             shist[tid] = 0;
-
         __syncthreads();

         if (y < rows)
         {
-            const uchar* rowPtr = src + y * step;
-            const uint* rowPtr4 = (uint*) rowPtr;
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                const uint data = rowPtr4[x];
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
+            }

-                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
-                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
             }

-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    const uchar data = rowPtr[x];
-                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
             }
         }

@@ -236,7 +253,7 @@ namespace hist
         }
     }

-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +262,10 @@ namespace hist

         const size_t smem_size = binCount * sizeof(int);

-        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        if(offsetX)
+            histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
+        else
+            histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
         cudaSafeCall( cudaGetLastError() );

         if (stream == 0)
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index c252abc45..177bf75b1 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -68,8 +68,8 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no

 namespace hist
 {
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream);
 }

 void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
@@ -91,10 +91,12 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St

     hist.setTo(Scalar::all(0), stream);

+    Point ofs; Size wholeSize;
+    src.locateROI(wholeSize, ofs);
     if (mask.empty())
-        hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
+        hist::histogram256(src, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
     else
-        hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
+        hist::histogram256(src, mask, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
 }

 ////////////////////////////////////////////////////////////////////////
@@ -494,16 +496,18 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int

 namespace hist
 {
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream);
 }

 namespace
 {
     void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
     {
+        Point ofs; Size wholeSize;
+        src.locateROI(wholeSize, ofs);
         hist.create(1, histSize, CV_32S);

         cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );

-        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
+        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, ofs.x, stream);
     }
 }

diff --git a/modules/cudaimgproc/test/test_histogram.cpp b/modules/cudaimgproc/test/test_histogram.cpp
index a92eefde8..0be400ab6 100644
--- a/modules/cudaimgproc/test/test_histogram.cpp
+++ b/modules/cudaimgproc/test/test_histogram.cpp
@@ -49,15 +49,40 @@ namespace opencv_test { namespace {
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HistEven

-PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
+typedef tuple<Size, int> hist_size_to_roi_offset_params_t;
+const hist_size_to_roi_offset_params_t hist_size_to_roi_offset_params[] =
+{
+    // uchar reads only
+    hist_size_to_roi_offset_params_t(Size(1,32), 0),
+    hist_size_to_roi_offset_params_t(Size(2,32), 0),
+    hist_size_to_roi_offset_params_t(Size(2,32), 1),
+    hist_size_to_roi_offset_params_t(Size(3,32), 0),
+    hist_size_to_roi_offset_params_t(Size(3,32), 1),
+    hist_size_to_roi_offset_params_t(Size(3,32), 2),
+    hist_size_to_roi_offset_params_t(Size(4,32), 0),
+    hist_size_to_roi_offset_params_t(Size(4,32), 1),
+    hist_size_to_roi_offset_params_t(Size(4,32), 2),
+    hist_size_to_roi_offset_params_t(Size(4,32), 3),
+    // uchar and int reads
+    hist_size_to_roi_offset_params_t(Size(129,32), 0),
+    hist_size_to_roi_offset_params_t(Size(129,32), 1),
+    hist_size_to_roi_offset_params_t(Size(129,32), 2),
+    hist_size_to_roi_offset_params_t(Size(129,32), 3),
+    // int reads only
+    hist_size_to_roi_offset_params_t(Size(128,32), 0)
+};
+
+PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
+    int roiOffsetX;

     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
+        size = get<0>(GET_PARAM(1));
+        roiOffsetX = get<1>(GET_PARAM(1));

         cv::cuda::setDevice(devInfo.deviceID());
     }
 };

@@ -66,19 +91,21 @@ PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)

 CUDA_TEST_P(HistEven, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_8UC1);
-
+    const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
     int hbins = 30;
     float hranges[] = {50.0f, 200.0f};

     cv::cuda::GpuMat hist;
-    cv::cuda::histEven(loadMat(src), hist, hbins, (int) hranges[0], (int) hranges[1]);
+    cv::cuda::GpuMat srcDevice = loadMat(src);
+    cv::cuda::histEven(srcDevice(roi), hist, hbins, (int)hranges[0], (int)hranges[1]);

     cv::Mat hist_gold;

     int histSize[] = {hbins};
     const float* ranges[] = {hranges};
     int channels[] = {0};

-    cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
+    Mat srcRoi = src(roi);
+    cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);

     hist_gold = hist_gold.t();
     hist_gold.convertTo(hist_gold, CV_32S);

@@ -87,22 +114,24 @@ CUDA_TEST_P(HistEven, Accuracy)
 }

 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HistEven, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES));
+    ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));

 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // CalcHist

-PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
+PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
+    int roiOffsetX;
+
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
+        size = get<0>(GET_PARAM(1));
+        roiOffsetX = get<1>(GET_PARAM(1));

         cv::cuda::setDevice(devInfo.deviceID());
     }
 };

@@ -111,9 +140,10 @@ PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)

 CUDA_TEST_P(CalcHist, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_8UC1);
-
+    const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
     cv::cuda::GpuMat hist;
-    cv::cuda::calcHist(loadMat(src), hist);
+    GpuMat srcDevice = loadMat(src);
+    cv::cuda::calcHist(srcDevice(roi), hist);

     cv::Mat hist_gold;

@@ -123,7 +153,8 @@
     const float* ranges[] = {hranges};
     const int channels[] = {0};

-    cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
+    const Mat srcRoi = src(roi);
+    cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);

     hist_gold = hist_gold.reshape(1, 1);
     hist_gold.convertTo(hist_gold, CV_32S);

@@ -131,19 +162,21 @@
 }

 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHist, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES));
+    ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));

-PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
+PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
+    int roiOffsetX;
+
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
+        size = get<0>(GET_PARAM(1));
+        roiOffsetX = get<1>(GET_PARAM(1));

         cv::cuda::setDevice(devInfo.deviceID());
     }
 };

@@ -152,11 +185,14 @@ PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)

 CUDA_TEST_P(CalcHistWithMask, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_8UC1);
+    const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
     cv::Mat mask = randomMat(size, CV_8UC1);
     cv::Mat(mask, cv::Rect(0, 0, size.width / 2, size.height / 2)).setTo(0);

     cv::cuda::GpuMat hist;
-    cv::cuda::calcHist(loadMat(src), loadMat(mask), hist);
+    GpuMat srcDevice = loadMat(src);
+    GpuMat maskDevice = loadMat(mask);
+    cv::cuda::calcHist(srcDevice(roi), maskDevice(roi), hist);

     cv::Mat hist_gold;

     const int hbins = 256;
     const float hranges[] = {0.0f, 256.0f};

     const int histSize[] = {hbins};
     const float* ranges[] = {hranges};
     const int channels[] = {0};

-    cv::calcHist(&src, 1, channels, mask, hist_gold, 1, histSize, ranges);
+    const Mat srcRoi = src(roi);
+    cv::calcHist(&srcRoi, 1, channels, mask(roi), hist_gold, 1, histSize, ranges);

     hist_gold = hist_gold.reshape(1, 1);
     hist_gold.convertTo(hist_gold, CV_32S);

@@ -174,8 +211,7 @@
 }

 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHistWithMask, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES));
+    ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));

 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // EqualizeHist
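Illustrative usage, outside the patch: a minimal sketch of the case this fix targets, assuming an OpenCV build with the cudaimgproc module. A GpuMat view whose x offset is not a multiple of 4 gives every device row pointer an unaligned start; with this change the kernels handle the unaligned head and tail of each row with byte loads and keep the coalesced 4-byte loads for the aligned middle.

    #include <opencv2/core.hpp>
    #include <opencv2/cudaimgproc.hpp>

    int main()
    {
        // Host image uploaded to the device.
        cv::Mat src(32, 129, CV_8UC1);
        cv::randu(src, cv::Scalar(0), cv::Scalar(256));
        cv::cuda::GpuMat d_src;
        d_src.upload(src);

        // ROI starting at x = 1: each row pointer of the view is offset by one
        // byte from the parent allocation, i.e. not 4-byte aligned.
        cv::cuda::GpuMat d_roi = d_src(cv::Rect(1, 0, src.cols - 1, src.rows));

        cv::cuda::GpuMat d_hist;
        cv::cuda::calcHist(d_roi, d_hist);              // 256-bin histogram
        cv::cuda::histEven(d_roi, d_hist, 30, 50, 200); // even-bin variant
        return 0;
    }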