Merge pull request #3475 from cudawarped:cuda_fix_unaligned_hist

Modified histogram kernels to work with non-aligned data
Alexander Smorkalov, committed by GitHub 2 years ago
commit 4f66f8677b
  1. modules/cudaimgproc/src/cuda/hist.cu (140 changed lines)
  2. modules/cudaimgproc/src/histogram.cpp (16 changed lines)
  3. modules/cudaimgproc/test/test_histogram.cpp (76 changed lines)

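The problem being fixed: when the source GpuMat is a ROI of a larger image, the first byte of each row is generally not 4-byte aligned, so the old kernels' cast of the row pointer to const unsigned int* performed misaligned 32-bit loads. The reworked kernels take the ROI's x-offset, count the unaligned head bytes one at a time, walk the aligned middle of each row as 32-bit words, and finish the tail byte-wise. Below is a minimal standalone sketch of that pattern, not the PR code: offsetX is assumed to be the row start's byte offset from a 4-byte-aligned parent row, countByte and rowHistogram are hypothetical names, and a plain global atomic stands in for the shared-memory histogram the real kernels use.

// Sketch only: head/body/tail split for one image row whose start may be unaligned.
__device__ void countByte(int* hist, unsigned char v)
{
    atomicAdd(&hist[v], 1);
}

__global__ void rowHistogram(const unsigned char* row, int cols, int* hist, int offsetX)
{
    // Bytes to handle one at a time before the pointer becomes 4-byte aligned.
    const int head = (offsetX % 4 == 0) ? 0 : 4 - (offsetX % 4);

    if (threadIdx.x == 0)
        for (int x = 0; x < min(head, cols); ++x)
            countByte(hist, row[x]);

    // Aligned body: each thread reads whole 32-bit words (coalesced).
    const unsigned int* row4 = reinterpret_cast<const unsigned int*>(row + head);
    const int cols4 = (cols > head) ? (cols - head) / 4 : 0;
    for (int x = threadIdx.x; x < cols4; x += blockDim.x)
    {
        const unsigned int data = row4[x];
        countByte(hist, (data >>  0) & 0xFFu);
        countByte(hist, (data >>  8) & 0xFFu);
        countByte(hist, (data >> 16) & 0xFFu);
        countByte(hist, (data >> 24) & 0xFFu);
    }

    // Scalar tail: whatever remains after the last full 32-bit word.
    if (threadIdx.x == 0)
        for (int x = head + cols4 * 4; x < cols; ++x)
            countByte(hist, row[x]);
}

In the PR itself the kernels are additionally templated on fourByteAligned, so the already-aligned case dispatches to a specialization that skips the head/tail handling, and the per-block counts are accumulated in a shared-memory histogram before being flushed to the global one.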
@@ -52,38 +52,41 @@ using namespace cv::cuda::device;
namespace hist
{
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
template<bool fourByteAligned>
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
if (y < rows) {
const uchar* rowPtr = &src[y * step];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++)
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
Emulation::smem::atomicAdd(&shist[data], 1);
}
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++)
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
}
@@ -94,38 +97,50 @@ namespace hist
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
if(offsetX)
histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
else
histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
template<bool fourByteAligned>
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
const uchar* rowPtr = &src[y * srcStep];
const uchar* maskRowPtr = &mask[y * maskStep];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++) {
if (maskRowPtr[x])
Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
}
}
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
unsigned int m = maskRowPtr[x];
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * srcStep]);
const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
const unsigned int m = maskRowPtrIntAligned[x];
if ((m >> 0) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
@@ -140,15 +155,12 @@ namespace hist
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
unsigned int m = ((const uchar*)maskRowPtr)[x];
if (m)
Emulation::smem::atomicAdd(&shist[data], 1);
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++) {
if (maskRowPtr[x])
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
}
}
@@ -160,12 +172,15 @@ namespace hist
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
if(offsetX)
histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
else
histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
@@ -186,42 +201,44 @@ namespace hist
}
}
__global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
template<bool fourByteAligned>
__global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
const int lowerLevel, const int upperLevel, const int offsetX)
{
extern __shared__ int shist[];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
if (tid < binCount)
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const uchar* rowPtr = src + y * step;
const uint* rowPtr4 = (uint*) rowPtr;
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
const uint data = rowPtr4[x];
const uchar* rowPtr = &src[y * step];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++)
histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
}
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
const uchar data = rowPtr[x];
histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
}
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++)
histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
}
}
@@ -236,7 +253,7 @@ namespace hist
}
}
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +262,10 @@ namespace hist
const size_t smem_size = binCount * sizeof(int);
histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
if(offsetX)
histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
else
histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)

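On the host side the kernels now need to know where the ROI starts inside its parent allocation. In the histogram.cpp changes below this comes from GpuMat::locateROI, whose x component is forwarded as offsetX. A small illustration of why that offset exists at all, with hypothetical names and not taken from the PR:

// Sketch: a column ROI of a GpuMat shares the parent's buffer, so its rows
// start offsetX bytes past the (aligned) parent rows. locateROI recovers that offset.
#include <opencv2/core/cuda.hpp>

void roiOffsetExample()
{
    cv::cuda::GpuMat whole(32, 129, CV_8UC1);                // parent rows are pitch-aligned
    cv::cuda::GpuMat roi = whole(cv::Rect(1, 0, 128, 32));   // each row now starts 1 byte in

    cv::Size wholeSize;
    cv::Point ofs;
    roi.locateROI(wholeSize, ofs);   // wholeSize == {129, 32}, ofs == {1, 0}
    // ofs.x is the value calcHist()/histEven() pass down as offsetX.
}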
@@ -68,8 +68,8 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no
namespace hist
{
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream);
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream);
}
void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
@@ -91,10 +91,12 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St
hist.setTo(Scalar::all(0), stream);
Point ofs; Size wholeSize;
src.locateROI(wholeSize, ofs);
if (mask.empty())
hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
hist::histogram256(src, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
else
hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
hist::histogram256(src, mask, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
@@ -494,16 +496,18 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
namespace hist
{
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream);
}
namespace
{
void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
{
Point ofs; Size wholeSize;
src.locateROI(wholeSize, ofs);
hist.create(1, histSize, CV_32S);
cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, ofs.x, stream);
}
}

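The new test parameters below are chosen so that every code path is hit: small widths (1 to 4) with every possible x-offset stress the byte-wise head and tail handling, width 129 mixes byte-wise and 32-bit word reads, and width 128 at offset 0 stays on the fully aligned path. As a worked example, Size(129, 32) with roiOffsetX = 1 gives a 128-column ROI; with alignedOffset = 4 - 1 = 3 the kernel counts a 3-byte head, then (128 - 3) / 4 = 31 aligned words (124 bytes), then a 1-byte tail, and 3 + 124 + 1 = 128 covers the whole row.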
@@ -49,15 +49,40 @@ namespace opencv_test { namespace {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// HistEven
PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
typedef tuple<Size, int> hist_size_to_roi_offset_params_t;
const hist_size_to_roi_offset_params_t hist_size_to_roi_offset_params[] =
{
// uchar reads only
hist_size_to_roi_offset_params_t(Size(1,32), 0),
hist_size_to_roi_offset_params_t(Size(2,32), 0),
hist_size_to_roi_offset_params_t(Size(2,32), 1),
hist_size_to_roi_offset_params_t(Size(3,32), 0),
hist_size_to_roi_offset_params_t(Size(3,32), 1),
hist_size_to_roi_offset_params_t(Size(3,32), 2),
hist_size_to_roi_offset_params_t(Size(4,32), 0),
hist_size_to_roi_offset_params_t(Size(4,32), 1),
hist_size_to_roi_offset_params_t(Size(4,32), 2),
hist_size_to_roi_offset_params_t(Size(4,32), 3),
// uchar and int reads
hist_size_to_roi_offset_params_t(Size(129,32), 0),
hist_size_to_roi_offset_params_t(Size(129,32), 1),
hist_size_to_roi_offset_params_t(Size(129,32), 2),
hist_size_to_roi_offset_params_t(Size(129,32), 3),
// int reads only
hist_size_to_roi_offset_params_t(Size(128,32), 0)
};
PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -66,19 +91,21 @@ PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(HistEven, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
int hbins = 30;
float hranges[] = {50.0f, 200.0f};
cv::cuda::GpuMat hist;
cv::cuda::histEven(loadMat(src), hist, hbins, (int) hranges[0], (int) hranges[1]);
cv::cuda::GpuMat srcDevice = loadMat(src);
cv::cuda::histEven(srcDevice(roi), hist, hbins, (int)hranges[0], (int)hranges[1]);
cv::Mat hist_gold;
int histSize[] = {hbins};
const float* ranges[] = {hranges};
int channels[] = {0};
cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.t();
hist_gold.convertTo(hist_gold, CV_32S);
@@ -87,22 +114,24 @@ CUDA_TEST_P(HistEven, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HistEven, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// CalcHist
PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -111,9 +140,10 @@ PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(CalcHist, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
cv::cuda::GpuMat hist;
cv::cuda::calcHist(loadMat(src), hist);
GpuMat srcDevice = loadMat(src);
cv::cuda::calcHist(srcDevice(roi), hist);
cv::Mat hist_gold;
@@ -123,7 +153,8 @@ CUDA_TEST_P(CalcHist, Accuracy)
const float* ranges[] = {hranges};
const int channels[] = {0};
cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
const Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.reshape(1, 1);
hist_gold.convertTo(hist_gold, CV_32S);
@@ -131,19 +162,21 @@ CUDA_TEST_P(CalcHist, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHist, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -152,11 +185,14 @@ PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(CalcHistWithMask, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
cv::Mat mask = randomMat(size, CV_8UC1);
cv::Mat(mask, cv::Rect(0, 0, size.width / 2, size.height / 2)).setTo(0);
cv::cuda::GpuMat hist;
cv::cuda::calcHist(loadMat(src), loadMat(mask), hist);
GpuMat srcDevice = loadMat(src);
GpuMat maskDevice = loadMat(mask);
cv::cuda::calcHist(srcDevice(roi), maskDevice(roi), hist);
cv::Mat hist_gold;
@@ -166,7 +202,8 @@ CUDA_TEST_P(CalcHistWithMask, Accuracy)
const float* ranges[] = {hranges};
const int channels[] = {0};
cv::calcHist(&src, 1, channels, mask, hist_gold, 1, histSize, ranges);
const Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, mask(roi), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.reshape(1, 1);
hist_gold.convertTo(hist_gold, CV_32S);
@@ -174,8 +211,7 @@ CUDA_TEST_P(CalcHistWithMask, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHistWithMask, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// EqualizeHist
