From 3997514b7ca0cd968d045f6565a2801bfbeb4ec0 Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy
Date: Mon, 13 Dec 2010 12:00:58 +0000
Subject: [PATCH] added tests for gpu::sum; it now supports all data types,
 but only single-channel images

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  9 ++-
 modules/gpu/src/arithm.cpp              | 57 ++++++++++------
 modules/gpu/src/cuda/mathfunc.cu        | 91 ++++++++++++++-----------
 tests/gpu/src/arithm.cpp                | 78 +++++++++++----------
 tests/gpu/src/gputest_main.cpp          |  3 -
 5 files changed, 131 insertions(+), 107 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index dafa5e113a..f0d4dd3ebe 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -421,9 +421,12 @@ namespace cv
         CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode);
 
         //! computes sum of array elements
-        //! supports CV_8UC1, CV_8UC4 types
-        //! disabled until fix crash
-        CV_EXPORTS Scalar sum(const GpuMat& m);
+        //! supports only single channel images
+        CV_EXPORTS Scalar sum(const GpuMat& src);
+
+        //! computes sum of array elements
+        //! supports only single channel images
+        CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
 
         //! finds global minimum and maximum array elements and returns their values
         CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 3dcae2c135..049bfa4334 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -65,6 +65,7 @@ double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
 double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
 void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
 Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
+Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
@@ -480,36 +481,50 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode)
 ////////////////////////////////////////////////////////////////////////
 // sum
 
-Scalar cv::gpu::sum(const GpuMat& src)
+namespace cv { namespace gpu { namespace mathfunc
 {
-    CV_Assert(!"disabled until fix crash");
+    template <typename T>
+    void sum_caller(const DevMem2D src, PtrStep buf, double* sum);
 
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4);
+    template <typename T>
+    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);
 
-    NppiSize sz;
-    sz.width  = src.cols;
-    sz.height = src.rows;
+    namespace sum
+    {
+        void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
+    }
+}}}
 
-    Scalar res;
+Scalar cv::gpu::sum(const GpuMat& src)
+{
+    GpuMat buf;
+    return sum(src, buf);
+}
 
-    int bufsz;
+Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
+{
+    using namespace mathfunc;
+    CV_Assert(src.channels() == 1);
 
-    if (src.type() == CV_8UC1)
-    {
-        nppiReductionGetBufferHostSize_8u_C1R(sz, &bufsz);
-        GpuMat buf(1, bufsz, CV_32S);
+    typedef void (*Caller)(const DevMem2D, PtrStep, double*);
+    static const Caller callers[2][7] =
+        { { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>,
+            sum_multipass_caller<unsigned short>, sum_multipass_caller<short>,
+            sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
+          { sum_caller<unsigned char>, sum_caller<char>,
+            sum_caller<unsigned short>, sum_caller<short>,
+            sum_caller<int>, sum_caller<float>, sum_caller<double> } };
 
-        nppSafeCall( nppiSum_8u_C1R(src.ptr<Npp8u>(), src.step, sz, buf.ptr<Npp32s>(), res.val) );
-    }
-    else
-    {
-        nppiReductionGetBufferHostSize_8u_C4R(sz, &bufsz);
-        GpuMat buf(1, bufsz, CV_32S);
+    Size bufSize;
+    sum::get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height);
+    buf.create(bufSize, CV_8U);
 
-        nppSafeCall( nppiSum_8u_C4R(src.ptr<Npp8u>(), src.step, sz, buf.ptr<Npp32s>(), res.val) );
-    }
+    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
+    if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
 
-    return res;
+    double result;
+    caller(src, buf, &result);
+    return result;
 }
 
 ////////////////////////////////////////////////////////////////////////
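For readers of the new API above: the one-argument overload allocates its device scratch buffer on every call, while the two-argument overload lets the caller keep that buffer alive between calls, avoiding repeated device allocations in hot loops. A minimal usage sketch under that reading (the function and variable names here are illustrative, not part of the patch):

    // Sum many single-channel GpuMats, reusing one scratch buffer.
    #include <vector>
    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    double total_of(const std::vector<cv::gpu::GpuMat>& images)
    {
        cv::gpu::GpuMat buf;  // sized by buf.create() on the first call, then reused
        double total = 0.0;
        for (size_t i = 0; i < images.size(); ++i)
            total += cv::gpu::sum(images[i], buf)[0];  // single-channel inputs only
        return total;
    }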
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index b06bef0586..3c620c07fe 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -1419,6 +1419,15 @@ namespace cv { namespace gpu { namespace mathfunc
 
     namespace sum
     {
+        template <typename T> struct SumType {};
+        template <> struct SumType<unsigned char> { typedef unsigned int R; };
+        template <> struct SumType<char> { typedef int R; };
+        template <> struct SumType<unsigned short> { typedef unsigned int R; };
+        template <> struct SumType<short> { typedef int R; };
+        template <> struct SumType<int> { typedef int R; };
+        template <> struct SumType<float> { typedef float R; };
+        template <> struct SumType<double> { typedef double R; };
+
         __constant__ int ctwidth;
         __constant__ int ctheight;
         __device__ unsigned int blocks_finished = 0;
@@ -1436,12 +1445,11 @@ namespace cv { namespace gpu { namespace mathfunc
         }
 
 
-        template <typename T>
         void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
         {
             dim3 threads, grid;
             estimate_thread_cfg(cols, rows, threads, grid);
-            bufcols = grid.x * grid.y * sizeof(T);
+            bufcols = grid.x * grid.y * sizeof(double);
             bufrows = 1;
         }
 
@@ -1454,17 +1462,17 @@ namespace cv { namespace gpu { namespace mathfunc
             cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
         }
 
-        template <typename T, int nthreads>
-        __global__ void sum_kernel(const DevMem2D_<T> src, T* result)
+        template <typename T, typename R, int nthreads>
+        __global__ void sum_kernel(const DevMem2D_<T> src, R* result)
         {
-            __shared__ T smem[nthreads];
+            __shared__ R smem[nthreads];
 
             const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
             const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
             const int tid = threadIdx.y * blockDim.x + threadIdx.x;
             const int bid = blockIdx.y * gridDim.x + blockIdx.x;
 
-            T sum = 0;
+            R sum = 0;
             for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
             {
                 const T* ptr = src.ptr(y0 + y * blockDim.y);
@@ -1475,7 +1483,7 @@ namespace cv { namespace gpu { namespace mathfunc
             smem[tid] = sum;
             __syncthreads();
 
-            sum_in_smem<nthreads, T>(smem, tid);
+            sum_in_smem<nthreads, R>(smem, tid);
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
             __shared__ bool is_last;
@@ -1496,7 +1504,7 @@ namespace cv { namespace gpu { namespace mathfunc
                 smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
                 __syncthreads();
 
-                sum_in_smem<nthreads, T>(smem, tid);
+                sum_in_smem<nthreads, R>(smem, tid);
 
                 if (tid == 0)
                 {
@@ -1510,14 +1518,16 @@ namespace cv { namespace gpu { namespace mathfunc
         }
 
 
-        template <typename T, int nthreads>
-        __global__ void sum_pass2_kernel(T* result, int size)
+        template <typename R, int nthreads>
+        __global__ void sum_pass2_kernel(R* result, int size)
         {
-            __shared__ T smem[nthreads];
+            __shared__ R smem[nthreads];
             int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
             smem[tid] = tid < size ? result[tid] : 0;
-            sum_in_smem<nthreads, T>(smem, tid);
+            __syncthreads();
+
+            sum_in_smem<nthreads, R>(smem, tid);
 
             if (tid == 0)
                 result[0] = smem[0];
@@ -1527,60 +1537,61 @@ namespace cv { namespace gpu { namespace mathfunc
 
 
         template <typename T>
-        T sum_multipass_caller(const DevMem2D_<T> src, PtrStep buf)
+        void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)
         {
             using namespace sum;
+            typedef typename SumType<T>::R R;
 
             dim3 threads, grid;
             estimate_thread_cfg(src.cols, src.rows, threads, grid);
             set_kernel_consts(src.cols, src.rows, threads, grid);
 
-            T* buf_ = (T*)buf.ptr(0);
+            R* buf_ = (R*)buf.ptr(0);
 
-            sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
-            sum_pass2_kernel<T, threads_x * threads_y><<<1, threads_x * threads_y>>>(
+            sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+            sum_pass2_kernel<R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
                     buf_, grid.x * grid.y);
             cudaSafeCall(cudaThreadSynchronize());
 
-            T sum;
-            cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
-
-            return sum;
+            R result = 0;
+            cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+            sum[0] = result;
         }
 
-        template unsigned char sum_multipass_caller(const DevMem2D_<unsigned char>, PtrStep);
-        template char sum_multipass_caller(const DevMem2D_<char>, PtrStep);
-        template unsigned short sum_multipass_caller(const DevMem2D_<unsigned short>, PtrStep);
-        template short sum_multipass_caller(const DevMem2D_<short>, PtrStep);
-        template int sum_multipass_caller(const DevMem2D_<int>, PtrStep);
-        template float sum_multipass_caller(const DevMem2D_<float>, PtrStep);
+        template void sum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);
+        template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*);
+        template void sum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);
+        template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*);
+        template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*);
+        template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
 
 
         template <typename T>
-        T sum_caller(const DevMem2D_<T> src, PtrStep buf)
+        void sum_caller(const DevMem2D src, PtrStep buf, double* sum)
         {
             using namespace sum;
+            typedef typename SumType<T>::R R;
 
             dim3 threads, grid;
             estimate_thread_cfg(src.cols, src.rows, threads, grid);
             set_kernel_consts(src.cols, src.rows, threads, grid);
 
-            T* buf_ = (T*)buf.ptr(0);
+            R* buf_ = (R*)buf.ptr(0);
 
-            sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
+            sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
             cudaSafeCall(cudaThreadSynchronize());
 
-            T sum;
-            cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
-
-            return sum;
+            R result = 0;
+            cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+            sum[0] = result;
         }
 
-        template unsigned char sum_caller(const DevMem2D_<unsigned char>, PtrStep);
-        template char sum_caller(const DevMem2D_<char>, PtrStep);
-        template unsigned short sum_caller(const DevMem2D_<unsigned short>, PtrStep);
-        template short sum_caller(const DevMem2D_<short>, PtrStep);
-        template int sum_caller(const DevMem2D_<int>, PtrStep);
-        template float sum_caller(const DevMem2D_<float>, PtrStep);
-        template double sum_caller(const DevMem2D_<double>, PtrStep);
+        template void sum_caller<unsigned char>(const DevMem2D, PtrStep, double*);
+        template void sum_caller<char>(const DevMem2D, PtrStep, double*);
+        template void sum_caller<unsigned short>(const DevMem2D, PtrStep, double*);
+        template void sum_caller<short>(const DevMem2D, PtrStep, double*);
+        template void sum_caller<int>(const DevMem2D, PtrStep, double*);
+        template void sum_caller<float>(const DevMem2D, PtrStep, double*);
+        template void sum_caller<double>(const DevMem2D, PtrStep, double*);
 }}}
+
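The kernels above delegate the per-block reduction to sum_in_smem, which is defined elsewhere in mathfunc.cu and is not part of this diff. For orientation, a conventional shared-memory tree reduction with this calling shape looks roughly like the sketch below (an assumption about its structure, not the file's actual code):

    // Sketch: block-wide tree reduction in shared memory. Assumes "size" is a
    // power of two and matches the number of threads cooperating in the block.
    template <int size, typename T>
    __device__ void sum_in_smem(volatile T* data, const unsigned int tid)
    {
        T sum = data[tid];

        // Fold the upper half onto the lower half, halving the active range each
        // step; __syncthreads() orders the steps across warps.
        if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
        if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
        if (size >= 128) { if (tid <  64) { data[tid] = sum = sum + data[tid +  64]; } __syncthreads(); }

        if (tid < 32)  // final folds stay within one warp, which runs in lockstep on this era of hardware
        {
            if (size >= 64) data[tid] = sum = sum + data[tid + 32];
            if (size >= 32) data[tid] = sum = sum + data[tid + 16];
            if (size >= 16) data[tid] = sum = sum + data[tid +  8];
            if (size >=  8) data[tid] = sum = sum + data[tid +  4];
            if (size >=  4) data[tid] = sum = sum + data[tid +  2];
            if (size >=  2) data[tid] = sum = sum + data[tid +  1];
        }
    }

This also explains the two code paths in the patch: on devices with global atomics (the __CUDA_ARCH__ >= 110 branch), the last block to finish, detected via the blocks_finished counter, folds the per-block partial sums in the same launch, while older devices run the separate sum_pass2_kernel instead, which is why the host side dispatches on hasAtomicsSupport.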
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp
index 5b7d5d600c..521120c082 100644
--- a/tests/gpu/src/arithm.cpp
+++ b/tests/gpu/src/arithm.cpp
@@ -458,29 +458,6 @@ struct CV_GpuNppImageFlipTest : public CV_GpuArithmTest
     }
 };
 
-////////////////////////////////////////////////////////////////////////////////
-// sum
-struct CV_GpuNppImageSumTest : public CV_GpuArithmTest
-{
-    CV_GpuNppImageSumTest() : CV_GpuArithmTest( "GPU-NppImageSum", "sum" ) {}
-
-    int test( const Mat& mat1, const Mat& )
-    {
-        if (mat1.type() != CV_8UC1 && mat1.type() != CV_8UC4)
-        {
-            ts->printf(CvTS::LOG, "\tUnsupported type\t");
-            return CvTS::OK;
-        }
-
-        Scalar cpures = cv::sum(mat1);
-
-        GpuMat gpu1(mat1);
-        Scalar gpures = cv::gpu::sum(gpu1);
-
-        return CheckNorm(cpures, gpures);
-    }
-};
-
 ////////////////////////////////////////////////////////////////////////////////
 // LUT
 struct CV_GpuNppImageLUTTest : public CV_GpuArithmTest
@@ -949,27 +926,49 @@ struct CV_GpuCountNonZeroTest: CvTest
     }
 };
 
-////////////////////////////////////////////////////////////////////////////////
-// min/max
-struct CV_GpuImageMinMaxTest : public CV_GpuArithmTest
+//////////////////////////////////////////////////////////////////////////////
+// sum
+
+struct CV_GpuSumTest: CvTest
 {
-    CV_GpuImageMinMaxTest() : CV_GpuArithmTest( "GPU-ImageMinMax", "min/max" ) {}
+    CV_GpuSumTest(): CvTest("GPU-SumTest", "sum") {}
 
-    int test( const Mat& mat1, const Mat& mat2 )
+    void run(int)
     {
-        cv::Mat cpuMinRes, cpuMaxRes;
-        cv::min(mat1, mat2, cpuMinRes);
-        cv::max(mat1, mat2, cpuMaxRes);
+        try
+        {
+            Mat src;
+            Scalar a, b;
+            double max_err = 1e-6;
 
-        GpuMat gpu1(mat1);
-        GpuMat gpu2(mat2);
-        GpuMat gpuMinRes, gpuMaxRes;
-        cv::gpu::min(gpu1, gpu2, gpuMinRes);
-        cv::gpu::max(gpu1, gpu2, gpuMaxRes);
+            int typemax = hasNativeDoubleSupport(getDevice()) ? CV_64F : CV_32F;
+            for (int type = CV_8U; type <= typemax; ++type)
+            {
+                gen(1 + rand() % 1000, 1 + rand() % 1000, type, src);
+                a = sum(src);
+                b = sum(GpuMat(src));
+                if (abs(a[0] - b[0]) > src.size().area() * max_err)
+                {
+                    ts->printf(CvTS::CONSOLE, "cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[0], b[0]);
+                    ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+                    return;
+                }
+            }
+        }
+        catch (const Exception& e)
+        {
+            if (!check_and_treat_gpu_exception(e, ts)) throw;
+            return;
+        }
+    }
+
+    void gen(int cols, int rows, int type, Mat& m)
+    {
+        m.create(rows, cols, type);
+        RNG rng;
+        rng.fill(m, RNG::UNIFORM, Scalar::all(0), Scalar::all(20));
-        return CheckNorm(cpuMinRes, gpuMinRes) == CvTS::OK && CheckNorm(cpuMaxRes, gpuMaxRes) == CvTS::OK ?
-                CvTS::OK : CvTS::FAIL_GENERIC;
     }
 };
@@ -992,7 +991,6 @@ CV_GpuNppImageCompareTest CV_GpuNppImageCompare_test;
 CV_GpuNppImageMeanStdDevTest CV_GpuNppImageMeanStdDev_test;
 CV_GpuNppImageNormTest CV_GpuNppImageNorm_test;
 CV_GpuNppImageFlipTest CV_GpuNppImageFlip_test;
-CV_GpuNppImageSumTest CV_GpuNppImageSum_test;
 CV_GpuNppImageLUTTest CV_GpuNppImageLUT_test;
 CV_GpuNppImageExpTest CV_GpuNppImageExp_test;
 CV_GpuNppImageLogTest CV_GpuNppImageLog_test;
@@ -1003,4 +1001,4 @@ CV_GpuNppImagePolarToCartTest CV_GpuNppImagePolarToCart_test;
 CV_GpuMinMaxTest CV_GpuMinMaxTest_test;
 CV_GpuMinMaxLocTest CV_GpuMinMaxLocTest_test;
 CV_GpuCountNonZeroTest CV_CountNonZero_test;
-CV_GpuImageMinMaxTest CV_GpuImageMinMax_test;
+CV_GpuSumTest CV_GpuSum_test;
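A note on the tolerance in the test above: summing n floating-point values accumulates rounding error that grows roughly with n, so the check scales a per-element bound (max_err = 1e-6) by the image area rather than using a fixed epsilon. The same comparison as a standalone sketch (the function name is illustrative, not part of the patch):

    // Sketch: compare CPU and GPU sums with an area-scaled tolerance.
    #include <cmath>
    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    bool sums_match(const cv::Mat& src, double tol_per_elem = 1e-6)
    {
        cv::Scalar cpu = cv::sum(src);                        // host reference
        cv::Scalar gpu = cv::gpu::sum(cv::gpu::GpuMat(src));  // device result
        // Allow the error budget to grow with the number of summed elements.
        return std::abs(cpu[0] - gpu[0]) <= src.size().area() * tol_per_elem;
    }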
diff --git a/tests/gpu/src/gputest_main.cpp b/tests/gpu/src/gputest_main.cpp
index a634ef0edd..a388fa7da9 100644
--- a/tests/gpu/src/gputest_main.cpp
+++ b/tests/gpu/src/gputest_main.cpp
@@ -46,9 +46,6 @@ CvTS test_system("gpu");
 const char* blacklist[] =
 {
     "GPU-AsyncGpuMatOperator",  // crash
-
-    "GPU-NppImageSum",          // crash, probably npp bug
-
     "GPU-NppImageCanny",        // NPP_TEXTURE_BIND_ERROR
     0
 };