diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 0d7b3f8cef..759768563a 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1087,6 +1087,11 @@ namespace cv
     //! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types.
     //! Output hist[i] will have one row and (levels[i].cols-1) cols and CV_32SC1 type.
     CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null());
+
+    //! Calculates histogram for an 8-bit single-channel (CV_8UC1) image.
+    //! Output hist will have one row, 256 cols and CV_32SC1 type.
+    CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());
+    CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
 
     //////////////////////////////// StereoBM_GPU ////////////////////////////////
 
diff --git a/modules/gpu/src/cuda/hist.cu b/modules/gpu/src/cuda/hist.cu
new file mode 100644
index 0000000000..7bcb2e9315
--- /dev/null
+++ b/modules/gpu/src/cuda/hist.cu
@@ -0,0 +1,193 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "internal_shared.hpp"
+#include "opencv2/gpu/device/saturate_cast.hpp"
+
+using namespace cv::gpu;
+
+using namespace cv::gpu::device;
+
+#define UINT_BITS 32U
+
+#define LOG2_WARP_SIZE 5U
+#define WARP_SIZE (1U << LOG2_WARP_SIZE)
+
+//Warps == subhistograms per threadblock
+#define WARP_COUNT 6
+
+//Threadblock size
+#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * WARP_SIZE)
+#define HISTOGRAM256_BIN_COUNT 256
+
+//Shared memory per threadblock
+#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
+
+#define PARTIAL_HISTOGRAM256_COUNT 240
+
+#define MERGE_THREADBLOCK_SIZE 256
+
+#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
+
+namespace cv { namespace gpu { namespace histograms
+{
+    #if (!USE_SMEM_ATOMICS)
+
+        #define TAG_MASK ( (1U << (UINT_BITS - LOG2_WARP_SIZE)) - 1U )
+
+        __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
+        {
+            uint count;
+            do
+            {
+                count = s_WarpHist[data] & TAG_MASK;
+                count = threadTag | (count + 1);
+                s_WarpHist[data] = count;
+            } while (s_WarpHist[data] != count);
+        }
+
+    #else
+
+        #define TAG_MASK 0xFFFFFFFFU
+
+        __forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
+        {
+            atomicAdd(s_WarpHist + data, 1);
+        }
+
+    #endif
+
+    __forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
+    {
+        uint x = pos_x << 2;
+
+        if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
+        if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
+        if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
+        if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
+    }
+
+    __global__ void histogram256(PtrStep_<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
+    {
+        //Per-warp subhistogram storage
+        __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
+        uint* s_WarpHist = s_Hist + (threadIdx.x >> LOG2_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
+
+        //Clear shared memory storage for current threadblock before processing
+        #pragma unroll
+        for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
+            s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
+
+        //Cycle through the entire data set, update subhistograms for each warp
+        const uint tag = threadIdx.x << (UINT_BITS - LOG2_WARP_SIZE);
+
+        __syncthreads();
+        const uint colsui = d_Data.step / sizeof(uint);
+        for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
+        {
+            uint pos_y = pos / colsui;
+            uint pos_x = pos % colsui;
+            uint data = d_Data.ptr(pos_y)[pos_x];
+            addWord(s_WarpHist, data, tag, pos_x, cols);
+        }
+
+        //Merge per-warp histograms into per-block and write to global memory
+        __syncthreads();
+        for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
+        {
+            uint sum = 0;
+
+            for (uint i = 0; i < WARP_COUNT; i++)
+                sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
+
+            d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
+        }
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // Merge histogram256() output
+    // Run one threadblock per bin; each threadblock adds up the same bin counter
+    // from every partial histogram. Reads are uncoalesced, but mergeHistogram256
+    // takes only a fraction of total processing time.
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
+    {
+        uint sum = 0;
+
+        #pragma unroll
+        for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
+            sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
+
+        __shared__ uint data[MERGE_THREADBLOCK_SIZE];
+        data[threadIdx.x] = sum;
+
+        for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
+        {
+            __syncthreads();
+            if(threadIdx.x < stride)
+                data[threadIdx.x] += data[threadIdx.x + stride];
+        }
+
+        if(threadIdx.x == 0)
+            d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
+    }
+
+    void histogram256_gpu(DevMem2D src, int* hist, uint* buf, cudaStream_t stream)
+    {
+        histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
+            DevMem2D_<uint>(src),
+            buf,
+            src.rows * src.step / sizeof(uint),
+            src.cols);
+
+        cudaSafeCall( cudaGetLastError() );
+
+        mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
+
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}}}
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 69ac7c9079..dd2d2e3664 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -71,6 +71,8 @@ void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_n
 void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*, Stream&) { throw_nogpu(); }
 void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, Stream&) { throw_nogpu(); }
+void cv::gpu::calcHist(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::calcHist(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); }
 void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }
 void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool) { throw_nogpu(); }
@@ -1037,6 +1039,33 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
     hist_callers[src.depth()](src, hist, levels, StreamAccessor::getStream(stream));
 }
 
+namespace cv { namespace gpu { namespace histograms
+{
+    void histogram256_gpu(DevMem2D src, int* hist, unsigned int* buf, cudaStream_t stream);
+
+    const int PARTIAL_HISTOGRAM256_COUNT = 240;
+    const int HISTOGRAM256_BIN_COUNT = 256;
+}}}
+
+void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
+{
+    GpuMat buf;
+    calcHist(src, hist, buf, stream);
+}
+
+void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream)
+{
+    using namespace cv::gpu::histograms;
+
+    CV_Assert(src.type() == CV_8UC1);
+
+    hist.create(1, 256, CV_32SC1);
+
+    ensureSizeIsEnough(1, PARTIAL_HISTOGRAM256_COUNT * HISTOGRAM256_BIN_COUNT, CV_32SC1, buf);
+
+    histogram256_gpu(src, hist.ptr<int>(), buf.ptr<unsigned int>(), StreamAccessor::getStream(stream));
+}
+
 ////////////////////////////////////////////////////////////////////////
 // cornerHarris & minEgenVal
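Editor's note (not part of the patch): the two calcHist() overloads above differ only in buffer handling. The two-argument form allocates the partial-histogram scratch buffer (1 x PARTIAL_HISTOGRAM256_COUNT*HISTOGRAM256_BIN_COUNT, CV_32SC1) on every call; the three-argument form lets the caller keep that buffer alive between calls. A minimal host-side usage sketch follows, assuming an OpenCV build with the gpu module and a CUDA-capable device; the image path is illustrative only.

    #include <opencv2/core/core.hpp>
    #include <opencv2/highgui/highgui.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat img = cv::imread("image.png", 0);    // load as 8-bit grayscale (CV_8UC1); path is an assumption
        if (img.empty()) return 1;

        cv::gpu::GpuMat d_img(img);                  // upload to the device
        cv::gpu::GpuMat d_hist, d_buf;

        cv::gpu::calcHist(d_img, d_hist);            // one-shot form: allocates its own scratch buffer

        cv::gpu::calcHist(d_img, d_hist, d_buf);     // buffered form: d_buf can be reused across calls

        cv::Mat hist;
        d_hist.download(hist);                       // hist is 1x256, CV_32SC1

        return 0;
    }
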
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index 05654f90e5..694c9bfbb1 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -967,7 +967,7 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine(
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // histograms
 
-struct Histograms : testing::TestWithParam<cv::gpu::DeviceInfo>
+struct HistEven : testing::TestWithParam<cv::gpu::DeviceInfo>
 {
     static cv::Mat hsv;
 
@@ -1014,9 +1014,9 @@ struct Histograms : testing::TestWithParam<cv::gpu::DeviceInfo>
     }
 };
 
-cv::Mat Histograms::hsv;
+cv::Mat HistEven::hsv;
 
-TEST_P(Histograms, Accuracy)
+TEST_P(HistEven, Accuracy)
 {
     ASSERT_TRUE(!hsv.empty());
 
@@ -1038,7 +1038,61 @@ TEST_P(Histograms, Accuracy)
     EXPECT_MAT_NEAR(hist_gold, hist, 0.0);
 }
 
-INSTANTIATE_TEST_CASE_P(ImgProc, Histograms, testing::ValuesIn(devices()));
+INSTANTIATE_TEST_CASE_P(ImgProc, HistEven, testing::ValuesIn(devices()));
+
+struct CalcHist : testing::TestWithParam<cv::gpu::DeviceInfo>
+{
+    cv::gpu::DeviceInfo devInfo;
+
+    cv::Size size;
+    cv::Mat src;
+    cv::Mat hist_gold;
+
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+
+        size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
+
+        src = cvtest::randomMat(rng, size, CV_8UC1, 0, 255, false);
+
+        hist_gold.create(1, 256, CV_32SC1);
+        hist_gold.setTo(cv::Scalar::all(0));
+
+        int* hist = hist_gold.ptr<int>();
+        for (int y = 0; y < src.rows; ++y)
+        {
+            const uchar* src_row = src.ptr(y);
+
+            for (int x = 0; x < src.cols; ++x)
+                ++hist[src_row[x]];
+        }
+    }
+};
+
+TEST_P(CalcHist, Accuracy)
+{
+    PRINT_PARAM(devInfo);
+    PRINT_PARAM(size);
+
+    cv::Mat hist;
+
+    ASSERT_NO_THROW(
+        cv::gpu::GpuMat gpuHist;
+
+        cv::gpu::calcHist(cv::gpu::GpuMat(src), gpuHist);
+
+        gpuHist.download(hist);
+    );
+
+    EXPECT_MAT_NEAR(hist_gold, hist, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(ImgProc, CalcHist, testing::ValuesIn(devices()));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // cornerHarris
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index 630708b4ee..eb762c1d7c 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -875,7 +875,7 @@ TEST(pyrDown)
     {
         SUBTEST << "size " << size;
 
-        Mat src; gen(src, 1000, 1000, CV_16SC3, 0, 256);
+        Mat src; gen(src, size, size, CV_16SC3, 0, 256);
         Mat dst(Size(src.cols / 2, src.rows / 2), src.type());
 
         CPU_ON;
@@ -899,7 +899,7 @@ TEST(pyrUp)
     {
         SUBTEST << "size " << size;
 
-        Mat src; gen(src, 1000, 1000, CV_16SC3, 0, 256);
+        Mat src; gen(src, size, size, CV_16SC3, 0, 256);
         Mat dst(Size(src.cols * 2, src.rows * 2), src.type());
 
         CPU_ON;
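Editor's note (not part of the patch): both calcHist() overloads also accept a cv::gpu::Stream, so the upload, the two kernels launched by histogram256_gpu(), and the download can be queued on a single stream. A rough sketch follows, assuming the Stream::enqueueUpload / enqueueDownload / waitForCompletion API of the 2.x gpu module; the helper name calcHistAsync is invented for the example, and truly asynchronous copies would additionally require page-locked host memory (CudaMem).

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void calcHistAsync(const cv::Mat& frame, cv::Mat& hist)
    {
        CV_Assert(frame.type() == CV_8UC1);

        // Device buffers kept across calls so nothing is reallocated per frame.
        static cv::gpu::GpuMat d_frame, d_hist, d_buf;

        hist.create(1, 256, CV_32SC1);                       // destination must be preallocated for enqueueDownload

        cv::gpu::Stream stream;
        stream.enqueueUpload(frame, d_frame);                // queue host-to-device copy
        cv::gpu::calcHist(d_frame, d_hist, d_buf, stream);   // queue both histogram kernels
        stream.enqueueDownload(d_hist, hist);                // queue device-to-host copy
        stream.waitForCompletion();                          // hist is valid from here on
    }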