diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 8e9a2151f2..ba64111a96 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -492,10 +492,10 @@ namespace cv { namespace gpu { namespace mathfunc void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); template - void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum); + void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); template - void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum); + void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); namespace sum { @@ -543,9 +543,8 @@ Scalar cv::gpu::sqrSum(const GpuMat& src) Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) { using namespace mathfunc; - CV_Assert(src.channels() == 1); - typedef void (*Caller)(const DevMem2D, PtrStep, double*); + typedef void (*Caller)(const DevMem2D, PtrStep, double*, int); static const Caller callers[2][7] = { { sqsum_multipass_caller, sqsum_multipass_caller, sqsum_multipass_caller, sqsum_multipass_caller, @@ -555,15 +554,15 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) sqsum_caller, sqsum_caller, 0 } }; Size bufSize; - sum::get_buf_size_required(src.cols, src.rows, 1, bufSize.width, bufSize.height); + sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); buf.create(bufSize, CV_8U); - Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; + Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()]; if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type"); - double result; - caller(src, buf, &result); - return result; + double result[4]; + caller(src, buf, result, src.channels()); + return Scalar(result[0], result[1], result[2], result[3]); } //////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index f7bc2cb3af..08e749f8b6 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -1989,7 +1989,7 @@ namespace cv { namespace gpu { namespace mathfunc template - void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum) + void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) { using namespace sum; typedef typename SumType::R R; @@ -1998,27 +1998,54 @@ namespace cv { namespace gpu { namespace mathfunc estimate_thread_cfg(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid); - sum_kernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + switch (cn) + { + case 1: + sum_kernel, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + break; + case 2: + sum_kernel_C2, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel_C2<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + break; + case 3: + sum_kernel_C3, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel_C3<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + break; + case 4: + sum_kernel_C4, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel_C4<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + break; + } cudaSafeCall(cudaThreadSynchronize()); - R result = 0; - cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R), cudaMemcpyDeviceToHost)); - sum[0] = result; + R result[4] = {0, 0, 0, 0}; + cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); + + sum[0] = result[0]; + sum[1] = result[1]; + sum[2] = result[2]; + sum[3] = result[3]; } - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); template - void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum) + void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) { using namespace sum; typedef typename SumType::R R; @@ -2027,20 +2054,42 @@ namespace cv { namespace gpu { namespace mathfunc estimate_thread_cfg(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid); - sum_kernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); + switch (cn) + { + case 1: + sum_kernel, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + case 2: + sum_kernel_C2, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + case 3: + sum_kernel_C3, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + case 4: + sum_kernel_C4, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + } cudaSafeCall(cudaThreadSynchronize()); - R result = 0; - cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R), cudaMemcpyDeviceToHost)); - sum[0] = result; - } + R result[4] = {0, 0, 0, 0}; + cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); - template void sqsum_caller(const DevMem2D, PtrStep, double*); - template void sqsum_caller(const DevMem2D, PtrStep, double*); - template void sqsum_caller(const DevMem2D, PtrStep, double*); - template void sqsum_caller(const DevMem2D, PtrStep, double*); - template void sqsum_caller(const DevMem2D, PtrStep, double*); - template void sqsum_caller(const DevMem2D, PtrStep, double*); + sum[0] = result[0]; + sum[1] = result[1]; + sum[2] = result[2]; + sum[3] = result[3]; + } + + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); }}} + diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp index 69a8f47198..7dc7ba3a4e 100644 --- a/tests/gpu/src/arithm.cpp +++ b/tests/gpu/src/arithm.cpp @@ -983,6 +983,7 @@ struct CV_GpuSumTest: CvTest } if (type != CV_8S) { + gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 1), src); b = sqrSum(GpuMat(src)); Mat sqrsrc; multiply(src, src, sqrsrc); @@ -993,6 +994,36 @@ struct CV_GpuSumTest: CvTest ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT); return; } + gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 2), src); + b = sqrSum(GpuMat(src)); + multiply(src, src, sqrsrc); + a = sum(sqrsrc); + if (abs(a[0] - b[0]) + abs(a[1] - b[1])> src.size().area() * max_err * 2) + { + ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]); + ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT); + return; + } + gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 3), src); + b = sqrSum(GpuMat(src)); + multiply(src, src, sqrsrc); + a = sum(sqrsrc); + if (abs(a[0] - b[0]) + abs(a[1] - b[1]) + abs(a[2] - b[2])> src.size().area() * max_err * 3) + { + ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]); + ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT); + return; + } + gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 4), src); + b = sqrSum(GpuMat(src)); + multiply(src, src, sqrsrc); + a = sum(sqrsrc); + if (abs(a[0] - b[0]) + abs(a[1] - b[1]) + abs(a[2] - b[2]) + abs(a[3] - b[3])> src.size().area() * max_err * 4) + { + ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]); + ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT); + return; + } } } }