diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index f0d4dd3ebe..26f9d649e4 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -428,6 +428,14 @@ namespace cv
         //! supports only single channel images
         CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
 
+        //! computes squared sum of array elements
+        //! supports only single channel images
+        CV_EXPORTS Scalar sqrSum(const GpuMat& src);
+
+        //! computes squared sum of array elements
+        //! supports only single channel images
+        CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
+
         //! finds global minimum and maximum array elements and returns their values
         CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
 
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 049bfa4334..5f7dd6150d 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -66,6 +66,8 @@ double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return
 void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
 Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
 Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
+Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
+Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
@@ -489,6 +491,12 @@ namespace cv { namespace gpu { namespace mathfunc
     template <typename T>
     void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);
 
+    template <typename T>
+    void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum);
+
+    template <typename T>
+    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);
+
     namespace sum
     {
         void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
@@ -527,6 +535,38 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
     return result;
 }
 
+Scalar cv::gpu::sqrSum(const GpuMat& src)
+{
+    GpuMat buf;
+    return sqrSum(src, buf);
+}
+
+Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
+{
+    using namespace mathfunc;
+    CV_Assert(src.channels() == 1);
+
+    typedef void (*Caller)(const DevMem2D, PtrStep, double*);
+    static const Caller callers[2][7] =
+        { { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>,
+            sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>,
+            sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
+          { sqsum_caller<unsigned char>, sqsum_caller<char>,
+            sqsum_caller<unsigned short>, sqsum_caller<short>,
+            sqsum_caller<int>, sqsum_caller<float>, sqsum_caller<double> } };
+
+    Size bufSize;
+    sum::get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height);
+    buf.create(bufSize, CV_8U);
+
+    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
+    if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
+
+    double result;
+    caller(src, buf, &result);
+    return result;
+}
+
 ////////////////////////////////////////////////////////////////////////
 // minMax
 
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index 3d6e1f606a..c990228541 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -1428,6 +1428,12 @@ namespace cv { namespace gpu { namespace mathfunc
     template <> struct SumType<float> { typedef float R; };
     template <> struct SumType<double> { typedef double R; };
 
+    template <typename R>
+    struct IdentityOp { static __device__ R call(R x) { return x; } };
+
+    template <typename R>
+    struct SqrOp { static __device__ R call(R x) { return x * x; } };
+
     __constant__ int ctwidth;
     __constant__ int ctheight;
     __device__ unsigned int blocks_finished = 0;
@@ -1462,7 +1468,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
     }
 
-    template <typename T, typename R, int nthreads>
+    template <typename T, typename R, typename Op, int nthreads>
     __global__ void sum_kernel(const DevMem2D_<T> src, R* result)
     {
         __shared__ R smem[nthreads];
@@ -1477,7 +1483,7 @@ namespace cv { namespace gpu { namespace mathfunc
         {
             const T* ptr = src.ptr(y0 + y * blockDim.y);
             for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
-                sum += ptr[x0 + x * blockDim.x];
+                sum += Op::call(ptr[x0 + x * blockDim.x]);
         }
 
         smem[tid] = sum;
@@ -1548,9 +1554,8 @@ namespace cv { namespace gpu { namespace mathfunc
 
         R* buf_ = (R*)buf.ptr(0);
 
-        sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
-        sum_pass2_kernel<R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                buf_, grid.x * grid.y);
+        sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        sum_pass2_kernel<R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
 
         R result = 0;
@@ -1566,6 +1571,35 @@ namespace cv { namespace gpu { namespace mathfunc
 
     template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
 
+    template <typename T>
+    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)
+    {
+        using namespace sum;
+        typedef typename SumType<T>::R R;
+
+        dim3 threads, grid;
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
+
+        R* buf_ = (R*)buf.ptr(0);
+
+        sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        sum_pass2_kernel<R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y);
+        cudaSafeCall(cudaThreadSynchronize());
+
+        R result = 0;
+        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+        sum[0] = result;
+    }
+
+    template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
+
+
     template <typename T>
     void sum_caller(const DevMem2D src, PtrStep buf, double* sum)
     {
@@ -1578,7 +1612,7 @@ namespace cv { namespace gpu { namespace mathfunc
 
         R* buf_ = (R*)buf.ptr(0);
 
-        sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
         cudaSafeCall(cudaThreadSynchronize());
 
         R result = 0;
@@ -1593,5 +1627,34 @@ namespace cv { namespace gpu { namespace mathfunc
     template void sum_caller<int>(const DevMem2D, PtrStep, double*);
     template void sum_caller<float>(const DevMem2D, PtrStep, double*);
     template void sum_caller<double>(const DevMem2D, PtrStep, double*);
+
+
+    template <typename T>
+    void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum)
+    {
+        using namespace sum;
+        typedef typename SumType<T>::R R;
+
+        dim3 threads, grid;
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
+
+        R* buf_ = (R*)buf.ptr(0);
+
+        sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        cudaSafeCall(cudaThreadSynchronize());
+
+        R result = 0;
+        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+        sum[0] = result;
+    }
+
+    template void sqsum_caller<unsigned char>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<char>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<unsigned short>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<short>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<int>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<float>(const DevMem2D, PtrStep, double*);
+    template void sqsum_caller<double>(const DevMem2D, PtrStep, double*);
 
 }}}
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp
index 521120c082..abe76a1a37 100644
--- a/tests/gpu/src/arithm.cpp
+++ b/tests/gpu/src/arithm.cpp
@@ -940,7 +940,7 @@ struct CV_GpuSumTest: CvTest
     {
         Mat src;
         Scalar a, b;
-        double max_err = 1e-6;
+        double max_err = 1e-5;
 
         int typemax = hasNativeDoubleSupport(getDevice()) ? CV_64F : CV_32F;
         for (int type = CV_8U; type <= typemax; ++type)
@@ -954,6 +954,19 @@ struct CV_GpuSumTest: CvTest
                 ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
                 return;
             }
+            if (type != CV_8S)
+            {
+                b = sqrSum(GpuMat(src));
+                Mat sqrsrc;
+                multiply(src, src, sqrsrc);
+                a = sum(sqrsrc);
+                if (abs(a[0] - b[0]) > src.size().area() * max_err)
+                {
+                    ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);
+                    ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+                    return;
+                }
+            }
         }
     }
     catch (const Exception& e)
@@ -967,7 +980,7 @@ struct CV_GpuSumTest: CvTest
     {
         m.create(rows, cols, type);
         RNG rng;
-        rng.fill(m, RNG::UNIFORM, Scalar::all(0), Scalar::all(20));
+        rng.fill(m, RNG::UNIFORM, Scalar::all(0), Scalar::all(16));
     }
 };
 