diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 5f7dd6150d..8e9a2151f2 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -486,10 +486,10 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode)
 namespace cv { namespace gpu { namespace mathfunc
 {
     template <typename T>
-    void sum_caller(const DevMem2D src, PtrStep buf, double* sum);
+    void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
 
     template <typename T>
-    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);
+    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
 
     template <typename T>
     void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum);
@@ -499,7 +499,7 @@ namespace cv { namespace gpu { namespace mathfunc
 
     namespace sum
     {
-        void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
+        void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows);
     }
 }}}
 
@@ -512,27 +512,26 @@ Scalar cv::gpu::sum(const GpuMat& src)
 Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
 {
     using namespace mathfunc;
-    CV_Assert(src.channels() == 1);
 
-    typedef void (*Caller)(const DevMem2D, PtrStep, double*);
+    typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
     static const Caller callers[2][7] =
         { { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>, sum_multipass_caller<unsigned short>,
             sum_multipass_caller<short>, sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
           { sum_caller<unsigned char>, sum_caller<char>, sum_caller<unsigned short>, sum_caller<short>,
-            sum_caller<int>, sum_caller<float>, sum_caller<double> } };
+            sum_caller<int>, sum_caller<float>, 0 } };
 
     Size bufSize;
-    sum::get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height);
+    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
     buf.create(bufSize, CV_8U);
 
-    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
+    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
     if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
 
-    double result;
-    caller(src, buf, &result);
-    return result;
+    double result[4];
+    caller(src, buf, result, src.channels());
+    return Scalar(result[0], result[1], result[2], result[3]);
 }
 
 Scalar cv::gpu::sqrSum(const GpuMat& src)
@@ -553,10 +552,10 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
             sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
           { sqsum_caller<unsigned char>, sqsum_caller<char>, sqsum_caller<unsigned short>, sqsum_caller<short>,
-            sqsum_caller<int>, sqsum_caller<float>, sqsum_caller<double> } };
+            sqsum_caller<int>, sqsum_caller<float>, 0 } };
 
     Size bufSize;
-    sum::get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height);
+    sum::get_buf_size_required(src.cols, src.rows, 1, bufSize.width, bufSize.height);
     buf.create(bufSize, CV_8U);
 
     Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index c990228541..d8b59d2f15 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -42,6 +42,7 @@
 
 #include "opencv2/gpu/device/limits_gpu.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/vecmath.hpp"
 #include "transform.hpp"
 #include "internal_shared.hpp"
 
@@ -1451,11 +1452,11 @@ namespace cv { namespace gpu { namespace mathfunc
     }
 
 
-    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
+    void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows)
     {
         dim3 threads, grid;
         estimate_thread_cfg(cols, rows, threads, grid);
-        bufcols = grid.x * grid.y * sizeof(double);
+        bufcols = grid.x * grid.y * sizeof(double) * cn;
         bufrows = 1;
     }
 
@@ -1469,7 +1470,7 @@ namespace cv { namespace gpu { namespace mathfunc
     }
 
 
     template <typename T, typename R, typename Op, int nthreads>
-    __global__ void sum_kernel(const DevMem2D_<T> src, R* result)
+    __global__ void sum_kernel(const DevMem2D src, R* result)
     {
         __shared__ R smem[nthreads];
@@ -1481,7 +1482,7 @@ namespace cv { namespace gpu { namespace mathfunc
         R sum = 0;
         for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
        {
-            const T* ptr = src.ptr(y0 + y * blockDim.y);
+            const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
             for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
                 sum += Op::call(ptr[x0 + x * blockDim.x]);
         }
@@ -1539,11 +1540,116 @@ namespace cv { namespace gpu { namespace mathfunc
             result[0] = smem[0];
     }
 
+
+    template <typename T, typename R, typename Op, int nthreads>
+    __global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
+    {
+        typedef typename TypeVec<T, 2>::vec_t SrcType;
+        typedef typename TypeVec<R, 2>::vec_t DstType;
+
+        __shared__ R smem[nthreads * 2];
+
+        const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
+        const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+        const int bid = blockIdx.y * gridDim.x + blockIdx.x;
+
+        SrcType val;
+        DstType sum = VecTraits<DstType>::all(0);
+        for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
+        {
+            const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
+            for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
+            {
+                val = ptr[x0 + x * blockDim.x];
+                sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y));
+            }
+        }
+
+        smem[tid] = sum.x;
+        smem[tid + nthreads] = sum.y;
+        __syncthreads();
+
+        sum_in_smem<nthreads, R>(smem, tid);
+        sum_in_smem<nthreads, R>(smem + nthreads, tid);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
+        __shared__ bool is_last;
+
+        if (tid == 0)
+        {
+            DstType res;
+            res.x = smem[0];
+            res.y = smem[nthreads];
+            result[bid] = res;
+            __threadfence();
+
+            unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+            is_last = (ticket == gridDim.x * gridDim.y - 1);
+        }
+
+        __syncthreads();
+
+        if (is_last)
+        {
+            DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
+            smem[tid] = res.x;
+            smem[tid + nthreads] = res.y;
+            __syncthreads();
+
+            sum_in_smem<nthreads, R>(smem, tid);
+            sum_in_smem<nthreads, R>(smem + nthreads, tid);
+
+            if (tid == 0)
+            {
+                res.x = smem[0];
+                res.y = smem[nthreads];
+                result[0] = res;
+                blocks_finished = 0;
+            }
+        }
+#else
+        if (tid == 0)
+        {
+            DstType res;
+            res.x = smem[0];
+            res.y = smem[nthreads];
+            result[bid] = res;
+        }
+#endif
+    }
+
+
+    template <typename R, int nthreads>
+    __global__ void sum_pass2_kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
+    {
+        typedef typename TypeVec<R, 2>::vec_t DstType;
+
+        __shared__ R smem[nthreads * 2];
+
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
+        smem[tid] = res.x;
+        smem[tid + nthreads] = res.y;
+        __syncthreads();
+
+        sum_in_smem<nthreads, R>(smem, tid);
+        sum_in_smem<nthreads, R>(smem + nthreads, tid);
+
+        if (tid == 0)
+        {
+            res.x = smem[0];
+            res.y = smem[nthreads];
+            result[0] = res;
+        }
+    }
+
     } // namespace sum
 
 
     template <typename T>
-    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)
+    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
     {
         using namespace sum;
         typedef typename SumType<T>::R R;
@@ -1552,27 +1658,40 @@ namespace cv { namespace gpu { namespace mathfunc
         estimate_thread_cfg(src.cols, src.rows, threads, grid);
         set_kernel_consts(src.cols, src.rows, threads, grid);
 
-        R* buf_ = (R*)buf.ptr(0);
-
-        sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
-        sum_pass2_kernel<R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y);
+        switch (cn)
+        {
+        case 1:
+            sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
+                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+            sum_pass2_kernel<R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
+                    (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
+            break;
+        case 2:
+            sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
+                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+            sum_pass2_kernel_C2<R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
+                    (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
+            break;
+        }
         cudaSafeCall(cudaThreadSynchronize());
 
-        R result = 0;
-        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
-        sum[0] = result;
+        R result[4] = {0, 0, 0, 0};
+        cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
+
+        sum[0] = result[0];
+        sum[1] = result[1];
+        sum[2] = result[2];
+        sum[3] = result[3];
     }
 
-    template void sum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);
-    template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*);
-    template void sum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);
-    template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*);
-    template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*);
-    template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
+    template void sum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);
+    template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
+    template void sum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);
+    template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
+    template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
+    template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
 
     template <typename T>
-    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)
+    void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
     {
         using namespace sum;
         typedef typename SumType<T>::R R;
@@ -1581,27 +1700,38 @@ namespace cv { namespace gpu { namespace mathfunc
         estimate_thread_cfg(src.cols, src.rows, threads, grid);
         set_kernel_consts(src.cols, src.rows, threads, grid);
 
-        R* buf_ = (R*)buf.ptr(0);
-
-        sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
-        sum_pass2_kernel<R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y);
+        switch (cn)
+        {
+        case 1:
+            sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
+                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+            break;
+        case 2:
+            sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
+                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+            break;
+        }
         cudaSafeCall(cudaThreadSynchronize());
 
-        R result = 0;
-        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
-        sum[0] = result;
+        R result[4] = {0, 0, 0, 0};
+        cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
+
+        sum[0] = result[0];
+        sum[1] = result[1];
+        sum[2] = result[2];
+        sum[3] = result[3];
     }
 
-    template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);
-    template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*);
-    template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);
-    template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*);
-    template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*);
-    template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);
+    template void sum_caller<char>(const DevMem2D, PtrStep, double*, int);
+    template void sum_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);
+    template void sum_caller<short>(const DevMem2D, PtrStep, double*, int);
+    template void sum_caller<int>(const DevMem2D, PtrStep, double*, int);
+    template void sum_caller<float>(const DevMem2D, PtrStep, double*, int);
 
     template <typename T>
-    void sum_caller(const DevMem2D src, PtrStep buf, double* sum)
+    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)
     {
         using namespace sum;
         typedef typename SumType<T>::R R;
@@ -1610,23 +1740,23 @@ namespace cv { namespace gpu { namespace mathfunc
         estimate_thread_cfg(src.cols, src.rows, threads, grid);
         set_kernel_consts(src.cols, src.rows, threads, grid);
 
-        R* buf_ = (R*)buf.ptr(0);
-
-        sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
+                src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+        sum_pass2_kernel<R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
+                (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
 
         R result = 0;
-        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+        cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R), cudaMemcpyDeviceToHost));
         sum[0] = result;
     }
 
-    template void sum_caller<unsigned char>(const DevMem2D, PtrStep, double*);
-    template void sum_caller<char>(const DevMem2D, PtrStep, double*);
-    template void sum_caller<unsigned short>(const DevMem2D, PtrStep, double*);
-    template void sum_caller<short>(const DevMem2D, PtrStep, double*);
-    template void sum_caller<int>(const DevMem2D, PtrStep, double*);
-    template void sum_caller<float>(const DevMem2D, PtrStep, double*);
-    template void sum_caller<double>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*);
+    template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
 
     template <typename T>
@@ -1639,13 +1769,12 @@ namespace cv { namespace gpu { namespace mathfunc
         estimate_thread_cfg(src.cols, src.rows, threads, grid);
         set_kernel_consts(src.cols, src.rows, threads, grid);
 
-        R* buf_ = (R*)buf.ptr(0);
-
-        sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
+                src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
         cudaSafeCall(cudaThreadSynchronize());
 
         R result = 0;
-        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+        cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R), cudaMemcpyDeviceToHost));
         sum[0] = result;
     }
 
@@ -1655,6 +1784,5 @@ namespace cv { namespace gpu { namespace mathfunc
     template void sqsum_caller<short>(const DevMem2D, PtrStep, double*);
     template void sqsum_caller<int>(const DevMem2D, PtrStep, double*);
     template void sqsum_caller<float>(const DevMem2D, PtrStep, double*);
-    template void sqsum_caller<double>(const DevMem2D, PtrStep, double*);
 
 }}}
diff --git a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp
index 19e1e88c62..dc04203e5d 100644
--- a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp
@@ -866,6 +866,91 @@ namespace cv
         return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
     }
 
+
+    static __device__ uint1 operator+(const uint1& a, const uint1& b)
+    {
+        return make_uint1(a.x + b.x);
+    }
+    static __device__ uint1 operator-(const uint1& a, const uint1& b)
+    {
+        return make_uint1(a.x - b.x);
+    }
+    static __device__ uint1 operator*(const uint1& a, const uint1& b)
+    {
+        return make_uint1(a.x * b.x);
+    }
+    static __device__ uint1 operator/(const uint1& a, const uint1& b)
+    {
+        return make_uint1(a.x / b.x);
+    }
+    static __device__ float1 operator*(const uint1& a, float s)
+    {
+        return make_float1(a.x * s);
+    }
+
+    static __device__ uint2 operator+(const uint2& a, const uint2& b)
+    {
+        return make_uint2(a.x + b.x, a.y + b.y);
+    }
+    static __device__ uint2 operator-(const uint2& a, const uint2& b)
+    {
+        return make_uint2(a.x - b.x, a.y - b.y);
+    }
+    static __device__ uint2 operator*(const uint2& a, const uint2& b)
+    {
+        return make_uint2(a.x * b.x, a.y * b.y);
+    }
+    static __device__ uint2 operator/(const uint2& a, const uint2& b)
+    {
+        return make_uint2(a.x / b.x, a.y / b.y);
+    }
+    static __device__ float2 operator*(const uint2& a, float s)
+    {
+        return make_float2(a.x * s, a.y * s);
+    }
+
+    static __device__ uint3 operator+(const uint3& a, const uint3& b)
+    {
+        return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+    static __device__ uint3 operator-(const uint3& a, const uint3& b)
+    {
+        return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
+    }
+    static __device__ uint3 operator*(const uint3& a, const uint3& b)
+    {
+        return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
+    }
+    static __device__ uint3 operator/(const uint3& a, const uint3& b)
+    {
+        return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z);
+    }
+    static __device__ float3 operator*(const uint3& a, float s)
+    {
+        return make_float3(a.x * s, a.y * s, a.z * s);
+    }
+
+    static __device__ uint4 operator+(const uint4& a, const uint4& b)
+    {
+        return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+    }
+    static __device__ uint4 operator-(const uint4& a, const uint4& b)
+    {
+        return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+    }
+    static __device__ uint4 operator*(const uint4& a, const uint4& b)
+    {
+        return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+    }
+    static __device__ uint4 operator/(const uint4& a, const uint4& b)
+    {
+        return make_uint4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+    }
+    static __device__ float4 operator*(const uint4& a, float s)
+    {
+        return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
+    }
+
     static __device__ float1 operator+(const float1& a, const float1& b)
     {
         return make_float1(a.x + b.x);
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp
index 944ec096d0..f20c03adea 100644
--- a/tests/gpu/src/arithm.cpp
+++ b/tests/gpu/src/arithm.cpp
@@ -942,9 +942,18 @@ struct CV_GpuSumTest: CvTest
 
         Scalar a, b;
         double max_err = 1e-5;
-        int typemax = hasNativeDoubleSupport(getDevice()) ? CV_64F : CV_32F;
+        int typemax = CV_32F;
         for (int type = CV_8U; type <= typemax; ++type)
         {
+            gen(1 + rand() % 500, 1 + rand() % 500, CV_MAKETYPE(type, 2), src);
+            a = sum(src);
+            b = sum(GpuMat(src));
+            if (abs(a[0] - b[0]) + abs(a[1] - b[1]) > src.size().area() * max_err)
+            {
+                ts->printf(CvTS::CONSOLE, "cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[0], b[0]);
+                ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+                return;
+            }
             gen(1 + rand() % 500, 1 + rand() % 500, type, src);
             a = sum(src);
             b = sum(GpuMat(src));
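
Usage note (editorial addition, not part of the patch): the sketch below mirrors the new CV_GpuSumTest case above and shows how the extended cv::gpu::sum behaves on a 2-channel matrix. It assumes only the public OpenCV 2.x APIs already referenced in this diff (cv::sum, cv::gpu::sum, GpuMat); the image size and tolerance are illustrative.

// Minimal sketch: per-channel GPU sum on a CV_8UC2 matrix, checked against the CPU result.
#include <cmath>
#include <cstdio>
#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // Random 8-bit, 2-channel input; the sums come back per channel in a cv::Scalar.
    cv::Mat src(480, 640, CV_8UC2);
    cv::randu(src, cv::Scalar::all(0), cv::Scalar::all(255));

    cv::Scalar cpu = cv::sum(src);                        // reference result on the host
    cv::Scalar gpu = cv::gpu::sum(cv::gpu::GpuMat(src));  // device reduction added by this patch

    // Channels 0 and 1 should agree within a small absolute tolerance.
    double err = std::abs(cpu[0] - gpu[0]) + std::abs(cpu[1] - gpu[1]);
    std::printf("cpu: (%.0f, %.0f)  gpu: (%.0f, %.0f)  err: %f\n",
                cpu[0], cpu[1], gpu[0], gpu[1], err);
    return err < src.size().area() * 1e-5 ? 0 : 1;
}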
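For readers unfamiliar with the kernel structure, here is a standalone, simplified sketch (editorial addition; the names, element type, and sizes are mine, not the patch's) of the two-pass reduction scheme that sum_kernel / sum_pass2_kernel and their _C2 variants follow: in pass 1 every block folds its portion of the data into shared memory and writes one partial sum per block into the buffer; in pass 2 a single block folds the per-block partials into the final value. The patch's C2 kernels do the same with two accumulators per thread, interleaved in shared memory at offsets tid and tid + nthreads, and the atomics path (__CUDA_ARCH__ >= 110) instead lets the last block to finish perform pass 2 within the same launch.

// Simplified two-pass sum reduction (illustrative only; not the patch's kernels).
#include <cstdio>
#include <cuda_runtime.h>

const int NTHREADS = 256;

// Pass 1: each block reduces its chunk in shared memory and writes one partial sum.
__global__ void partial_sums(const float* data, int n, float* block_sums)
{
    __shared__ float smem[NTHREADS];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;

    smem[tid] = idx < n ? data[idx] : 0.f;
    __syncthreads();

    // Tree reduction in shared memory -- the role played by sum_in_smem in the patch.
    for (int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s) smem[tid] += smem[tid + s];
        __syncthreads();
    }
    if (tid == 0) block_sums[blockIdx.x] = smem[0];
}

// Pass 2: one block reduces the per-block partial sums (assumes size <= NTHREADS,
// which the real code arranges by limiting the grid in its thread-config estimate).
__global__ void final_sum(float* block_sums, int size)
{
    __shared__ float smem[NTHREADS];
    int tid = threadIdx.x;

    smem[tid] = tid < size ? block_sums[tid] : 0.f;
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s) smem[tid] += smem[tid + s];
        __syncthreads();
    }
    if (tid == 0) block_sums[0] = smem[0];
}

int main()
{
    const int n = 1 << 16;                            // keeps the block count <= NTHREADS
    const int blocks = (n + NTHREADS - 1) / NTHREADS; // 256 blocks of 256 threads

    float* h_data = new float[n];
    for (int i = 0; i < n; ++i) h_data[i] = 1.f;      // all ones, so the expected sum is n

    float *d_data, *d_block_sums;
    cudaMalloc(&d_data, n * sizeof(float));
    cudaMalloc(&d_block_sums, blocks * sizeof(float));
    cudaMemcpy(d_data, h_data, n * sizeof(float), cudaMemcpyHostToDevice);

    partial_sums<<<blocks, NTHREADS>>>(d_data, n, d_block_sums);
    final_sum<<<1, NTHREADS>>>(d_block_sums, blocks);

    float result = 0.f;
    cudaMemcpy(&result, d_block_sums, sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("sum = %f (expected %d)\n", result, n);

    cudaFree(d_data);
    cudaFree(d_block_sums);
    delete[] h_data;
    return 0;
}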