From f56d9c340f67e4e43ca610f99b52501e30cef1e4 Mon Sep 17 00:00:00 2001 From: Alexey Spizhevoy Date: Wed, 15 Dec 2010 15:28:35 +0000 Subject: [PATCH] added support of remaining image number of channels into gpu::sum --- modules/gpu/src/cuda/mathfunc.cu | 258 +++++++++++++++++++++++++++++++ tests/gpu/src/arithm.cpp | 20 ++- 2 files changed, 277 insertions(+), 1 deletion(-) diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index d8b59d2f15..f7bc2cb3af 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -1645,6 +1645,246 @@ namespace cv { namespace gpu { namespace mathfunc } } + + template + __global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec::vec_t* result) + { + typedef typename TypeVec::vec_t SrcType; + typedef typename TypeVec::vec_t DstType; + + __shared__ R smem[nthreads * 3]; + + const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const int bid = blockIdx.y * gridDim.x + blockIdx.x; + + SrcType val; + DstType sum = VecTraits::all(0); + for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) + { + const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); + for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) + { + val = ptr[x0 + x * blockDim.x]; + sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y), Op::call(val.z)); + } + } + + smem[tid] = sum.x; + smem[tid + nthreads] = sum.y; + smem[tid + 2 * nthreads] = sum.z; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 + __shared__ bool is_last; + + if (tid == 0) + { + DstType res; + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + result[bid] = res; + __threadfence(); + + unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + is_last = (ticket == gridDim.x * gridDim.y - 1); + } + + __syncthreads(); + + if (is_last) + { + DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); + smem[tid] = res.x; + smem[tid + nthreads] = res.y; + smem[tid + 2 * nthreads] = res.z; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + + if (tid == 0) + { + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + result[0] = res; + blocks_finished = 0; + } + } +#else + if (tid == 0) + { + DstType res; + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + result[bid] = res; + } +#endif + } + + + template + __global__ void sum_pass2_kernel_C3(typename TypeVec::vec_t* result, int size) + { + typedef typename TypeVec::vec_t DstType; + + __shared__ R smem[nthreads * 3]; + + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); + smem[tid] = res.x; + smem[tid + nthreads] = res.y; + smem[tid + 2 * nthreads] = res.z; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + + if (tid == 0) + { + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + result[0] = res; + } + } + + template + __global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec::vec_t* result) + { + typedef typename TypeVec::vec_t SrcType; + typedef typename TypeVec::vec_t DstType; + + __shared__ R smem[nthreads * 4]; + + const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const int bid = blockIdx.y * gridDim.x + blockIdx.x; + + SrcType val; + DstType sum = VecTraits::all(0); + for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) + { + const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); + for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) + { + val = ptr[x0 + x * blockDim.x]; + sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y), + Op::call(val.z), Op::call(val.w)); + } + } + + smem[tid] = sum.x; + smem[tid + nthreads] = sum.y; + smem[tid + 2 * nthreads] = sum.z; + smem[tid + 3 * nthreads] = sum.w; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + sum_in_smem(smem + 3 * nthreads, tid); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 + __shared__ bool is_last; + + if (tid == 0) + { + DstType res; + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + res.w = smem[3 * nthreads]; + result[bid] = res; + __threadfence(); + + unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + is_last = (ticket == gridDim.x * gridDim.y - 1); + } + + __syncthreads(); + + if (is_last) + { + DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); + smem[tid] = res.x; + smem[tid + nthreads] = res.y; + smem[tid + 2 * nthreads] = res.z; + smem[tid + 3 * nthreads] = res.w; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + sum_in_smem(smem + 3 * nthreads, tid); + + if (tid == 0) + { + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + res.w = smem[3 * nthreads]; + result[0] = res; + blocks_finished = 0; + } + } +#else + if (tid == 0) + { + DstType res; + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + res.w = smem[3 * nthreads]; + result[bid] = res; + } +#endif + } + + + template + __global__ void sum_pass2_kernel_C4(typename TypeVec::vec_t* result, int size) + { + typedef typename TypeVec::vec_t DstType; + + __shared__ R smem[nthreads * 4]; + + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); + smem[tid] = res.x; + smem[tid + nthreads] = res.y; + smem[tid + 2 * nthreads] = res.z; + smem[tid + 3 * nthreads] = res.z; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + sum_in_smem(smem + 3 * nthreads, tid); + + if (tid == 0) + { + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + res.w = smem[3 * nthreads]; + result[0] = res; + } + } + } // namespace sum @@ -1670,6 +1910,16 @@ namespace cv { namespace gpu { namespace mathfunc src, (typename TypeVec::vec_t*)buf.ptr(0)); sum_pass2_kernel_C2<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + case 3: + sum_kernel_C3, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel_C3<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + case 4: + sum_kernel_C4, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel_C4<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); } cudaSafeCall(cudaThreadSynchronize()); @@ -1710,6 +1960,14 @@ namespace cv { namespace gpu { namespace mathfunc sum_kernel_C2, threads_x * threads_y><<>>( src, (typename TypeVec::vec_t*)buf.ptr(0)); break; + case 3: + sum_kernel_C3, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + case 4: + sum_kernel_C4, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; } cudaSafeCall(cudaThreadSynchronize()); diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp index f20c03adea..69a8f47198 100644 --- a/tests/gpu/src/arithm.cpp +++ b/tests/gpu/src/arithm.cpp @@ -950,7 +950,25 @@ struct CV_GpuSumTest: CvTest b = sum(GpuMat(src)); if (abs(a[0] - b[0]) + abs(a[1] - b[1]) > src.size().area() * max_err) { - ts->printf(CvTS::CONSOLE, "cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[0], b[0]); + ts->printf(CvTS::CONSOLE, "2 cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[1], b[1]); + ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT); + return; + } + gen(1 + rand() % 500, 1 + rand() % 500, CV_MAKETYPE(type, 3), src); + a = sum(src); + b = sum(GpuMat(src)); + if (abs(a[0] - b[0]) + abs(a[1] - b[1]) + abs(a[2] - b[2])> src.size().area() * max_err) + { + ts->printf(CvTS::CONSOLE, "3 cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[2], b[2]); + ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT); + return; + } + gen(1 + rand() % 500, 1 + rand() % 500, CV_MAKETYPE(type, 4), src); + a = sum(src); + b = sum(GpuMat(src)); + if (abs(a[0] - b[0]) + abs(a[1] - b[1]) + abs(a[2] - b[2]) + abs(a[3] - b[3])> src.size().area() * max_err) + { + ts->printf(CvTS::CONSOLE, "4 cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[3], b[3]); ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT); return; }