From 5af529c1bd9662dca3f35eb27ddb1c5599e70833 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 15 Feb 2012 12:05:59 +0000 Subject: [PATCH] fixed and updated gpu implementation of separable liner filters now it supports kernel's size up to 32 --- modules/gpu/perf/perf_filters.cpp | 2 +- modules/gpu/src/cuda/column_filter.cu | 365 +++++++++++++++---------- modules/gpu/src/cuda/row_filter.cu | 367 +++++++++++++++----------- modules/gpu/src/filtering.cpp | 28 +- modules/gpu/test/test_filters.cpp | 10 +- 5 files changed, 454 insertions(+), 318 deletions(-) diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp index 79c60f5ec6..f6ba4a9d30 100644 --- a/modules/gpu/perf/perf_filters.cpp +++ b/modules/gpu/perf/perf_filters.cpp @@ -139,6 +139,6 @@ INSTANTIATE_TEST_CASE_P(Filter, SeparableLinearFilter, testing::Combine( ALL_DEVICES, GPU_TYPICAL_MAT_SIZES, testing::Values(CV_8UC1, CV_8UC4, CV_32FC1), - testing::Values(3, 5))); + testing::Values(3, 5, 7, 9, 11, 13, 15))); #endif diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu index df856410e8..36dd7bb320 100644 --- a/modules/gpu/src/cuda/column_filter.cu +++ b/modules/gpu/src/cuda/column_filter.cu @@ -46,17 +46,14 @@ #include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/border_interpolate.hpp" +#include "opencv2/gpu/device/static_check.hpp" namespace cv { namespace gpu { namespace device { - #define MAX_KERNEL_SIZE 16 - #define BLOCK_DIM_X 16 - #define BLOCK_DIM_Y 4 - #define RESULT_STEPS 8 - #define HALO_STEPS 1 - namespace column_filter { + #define MAX_KERNEL_SIZE 32 + __constant__ float c_kernel[MAX_KERNEL_SIZE]; void loadKernel(const float kernel[], int ksize) @@ -64,64 +61,75 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); } - template - __global__ void linearColumnFilter(const DevMem2D_ src, PtrStep dst, int anchor, const B b) + template + __global__ void linearColumnFilter(const DevMem2D_ src, PtrStep dst, const int anchor, const B brd) { + Static::check(); + Static= KSIZE>::check(); + Static::cn == VecTraits::cn>::check(); + typedef typename TypeVec::cn>::vec_type sum_t; - __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1]; + __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; - //Offset to the upper halo edge const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; - const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y; - if (x < src.cols) - { - const T* src_col = src.ptr() + x; + if (x >= src.cols) + return; - //Main data - #pragma unroll - for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); + const T* src_col = src.ptr() + x; - //Upper halo - #pragma unroll - for(int i = 0; i < HALO_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step); + const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; - //Lower halo - #pragma unroll - for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); - __syncthreads(); + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); - #pragma unroll - for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) - { - sum_t sum = VecTraits::all(0); + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); - #pragma unroll - for(int j = 0; j < KERNEL_SIZE; ++j) - sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j]; + __syncthreads(); - int dstY = y + i * BLOCK_DIM_Y; + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + { + const int y = yStart + j * BLOCK_DIM_Y; + + if (y >= src.rows) + return; + + sum_t sum = VecTraits::all(0); + + #pragma unroll + for (int k = 0; k < KSIZE; ++k) + sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; - if (dstY < src.rows) - dst.ptr(dstY)[x] = saturate_cast(sum); - } + dst(y, x) = saturate_cast(sum); } } - template class B> - void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) - { + template class B> + void linearColumnFilter_caller(DevMem2D_ src, DevMem2D_ dst, int anchor, cudaStream_t stream) + { + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 16; + const int PATCH_PER_BLOCK = 4; + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y)); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); + + B brd(src.rows); - B b(src.rows); + linearColumnFilter<<>>(src, dst, anchor, brd); - linearColumnFilter<<>>(src, dst, anchor, b); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -129,106 +137,187 @@ namespace cv { namespace gpu { namespace device } template - void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) + void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) { - typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); - static const caller_t callers[5][17] = + typedef void (*caller_t)(DevMem2D_ src, DevMem2D_ dst, int anchor, cudaStream_t stream); + + static const caller_t callers[5][33] = { { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReflect101>, - linearColumnFilter_caller<2 , T, D, BrdColReflect101>, - linearColumnFilter_caller<3 , T, D, BrdColReflect101>, - linearColumnFilter_caller<4 , T, D, BrdColReflect101>, - linearColumnFilter_caller<5 , T, D, BrdColReflect101>, - linearColumnFilter_caller<6 , T, D, BrdColReflect101>, - linearColumnFilter_caller<7 , T, D, BrdColReflect101>, - linearColumnFilter_caller<8 , T, D, BrdColReflect101>, - linearColumnFilter_caller<9 , T, D, BrdColReflect101>, - linearColumnFilter_caller<10, T, D, BrdColReflect101>, - linearColumnFilter_caller<11, T, D, BrdColReflect101>, - linearColumnFilter_caller<12, T, D, BrdColReflect101>, - linearColumnFilter_caller<13, T, D, BrdColReflect101>, - linearColumnFilter_caller<14, T, D, BrdColReflect101>, - linearColumnFilter_caller<15, T, D, BrdColReflect101>, - linearColumnFilter_caller<16, T, D, BrdColReflect101> + 0, + linearColumnFilter_caller< 1, T, D, BrdColReflect101>, + linearColumnFilter_caller< 2, T, D, BrdColReflect101>, + linearColumnFilter_caller< 3, T, D, BrdColReflect101>, + linearColumnFilter_caller< 4, T, D, BrdColReflect101>, + linearColumnFilter_caller< 5, T, D, BrdColReflect101>, + linearColumnFilter_caller< 6, T, D, BrdColReflect101>, + linearColumnFilter_caller< 7, T, D, BrdColReflect101>, + linearColumnFilter_caller< 8, T, D, BrdColReflect101>, + linearColumnFilter_caller< 9, T, D, BrdColReflect101>, + linearColumnFilter_caller<10, T, D, BrdColReflect101>, + linearColumnFilter_caller<11, T, D, BrdColReflect101>, + linearColumnFilter_caller<12, T, D, BrdColReflect101>, + linearColumnFilter_caller<13, T, D, BrdColReflect101>, + linearColumnFilter_caller<14, T, D, BrdColReflect101>, + linearColumnFilter_caller<15, T, D, BrdColReflect101>, + linearColumnFilter_caller<16, T, D, BrdColReflect101>, + linearColumnFilter_caller<17, T, D, BrdColReflect101>, + linearColumnFilter_caller<18, T, D, BrdColReflect101>, + linearColumnFilter_caller<19, T, D, BrdColReflect101>, + linearColumnFilter_caller<20, T, D, BrdColReflect101>, + linearColumnFilter_caller<21, T, D, BrdColReflect101>, + linearColumnFilter_caller<22, T, D, BrdColReflect101>, + linearColumnFilter_caller<23, T, D, BrdColReflect101>, + linearColumnFilter_caller<24, T, D, BrdColReflect101>, + linearColumnFilter_caller<25, T, D, BrdColReflect101>, + linearColumnFilter_caller<26, T, D, BrdColReflect101>, + linearColumnFilter_caller<27, T, D, BrdColReflect101>, + linearColumnFilter_caller<28, T, D, BrdColReflect101>, + linearColumnFilter_caller<29, T, D, BrdColReflect101>, + linearColumnFilter_caller<30, T, D, BrdColReflect101>, + linearColumnFilter_caller<31, T, D, BrdColReflect101>, + linearColumnFilter_caller<32, T, D, BrdColReflect101> }, { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReplicate>, - linearColumnFilter_caller<2 , T, D, BrdColReplicate>, - linearColumnFilter_caller<3 , T, D, BrdColReplicate>, - linearColumnFilter_caller<4 , T, D, BrdColReplicate>, - linearColumnFilter_caller<5 , T, D, BrdColReplicate>, - linearColumnFilter_caller<6 , T, D, BrdColReplicate>, - linearColumnFilter_caller<7 , T, D, BrdColReplicate>, - linearColumnFilter_caller<8 , T, D, BrdColReplicate>, - linearColumnFilter_caller<9 , T, D, BrdColReplicate>, - linearColumnFilter_caller<10, T, D, BrdColReplicate>, - linearColumnFilter_caller<11, T, D, BrdColReplicate>, - linearColumnFilter_caller<12, T, D, BrdColReplicate>, - linearColumnFilter_caller<13, T, D, BrdColReplicate>, - linearColumnFilter_caller<14, T, D, BrdColReplicate>, - linearColumnFilter_caller<15, T, D, BrdColReplicate>, - linearColumnFilter_caller<16, T, D, BrdColReplicate> + 0, + linearColumnFilter_caller< 1, T, D, BrdColReplicate>, + linearColumnFilter_caller< 2, T, D, BrdColReplicate>, + linearColumnFilter_caller< 3, T, D, BrdColReplicate>, + linearColumnFilter_caller< 4, T, D, BrdColReplicate>, + linearColumnFilter_caller< 5, T, D, BrdColReplicate>, + linearColumnFilter_caller< 6, T, D, BrdColReplicate>, + linearColumnFilter_caller< 7, T, D, BrdColReplicate>, + linearColumnFilter_caller< 8, T, D, BrdColReplicate>, + linearColumnFilter_caller< 9, T, D, BrdColReplicate>, + linearColumnFilter_caller<10, T, D, BrdColReplicate>, + linearColumnFilter_caller<11, T, D, BrdColReplicate>, + linearColumnFilter_caller<12, T, D, BrdColReplicate>, + linearColumnFilter_caller<13, T, D, BrdColReplicate>, + linearColumnFilter_caller<14, T, D, BrdColReplicate>, + linearColumnFilter_caller<15, T, D, BrdColReplicate>, + linearColumnFilter_caller<16, T, D, BrdColReplicate>, + linearColumnFilter_caller<17, T, D, BrdColReplicate>, + linearColumnFilter_caller<18, T, D, BrdColReplicate>, + linearColumnFilter_caller<19, T, D, BrdColReplicate>, + linearColumnFilter_caller<20, T, D, BrdColReplicate>, + linearColumnFilter_caller<21, T, D, BrdColReplicate>, + linearColumnFilter_caller<22, T, D, BrdColReplicate>, + linearColumnFilter_caller<23, T, D, BrdColReplicate>, + linearColumnFilter_caller<24, T, D, BrdColReplicate>, + linearColumnFilter_caller<25, T, D, BrdColReplicate>, + linearColumnFilter_caller<26, T, D, BrdColReplicate>, + linearColumnFilter_caller<27, T, D, BrdColReplicate>, + linearColumnFilter_caller<28, T, D, BrdColReplicate>, + linearColumnFilter_caller<29, T, D, BrdColReplicate>, + linearColumnFilter_caller<30, T, D, BrdColReplicate>, + linearColumnFilter_caller<31, T, D, BrdColReplicate>, + linearColumnFilter_caller<32, T, D, BrdColReplicate> }, { - 0, - linearColumnFilter_caller<1 , T, D, BrdColConstant>, - linearColumnFilter_caller<2 , T, D, BrdColConstant>, - linearColumnFilter_caller<3 , T, D, BrdColConstant>, - linearColumnFilter_caller<4 , T, D, BrdColConstant>, - linearColumnFilter_caller<5 , T, D, BrdColConstant>, - linearColumnFilter_caller<6 , T, D, BrdColConstant>, - linearColumnFilter_caller<7 , T, D, BrdColConstant>, - linearColumnFilter_caller<8 , T, D, BrdColConstant>, - linearColumnFilter_caller<9 , T, D, BrdColConstant>, - linearColumnFilter_caller<10, T, D, BrdColConstant>, - linearColumnFilter_caller<11, T, D, BrdColConstant>, - linearColumnFilter_caller<12, T, D, BrdColConstant>, - linearColumnFilter_caller<13, T, D, BrdColConstant>, - linearColumnFilter_caller<14, T, D, BrdColConstant>, - linearColumnFilter_caller<15, T, D, BrdColConstant>, - linearColumnFilter_caller<16, T, D, BrdColConstant> + 0, + linearColumnFilter_caller< 1, T, D, BrdColConstant>, + linearColumnFilter_caller< 2, T, D, BrdColConstant>, + linearColumnFilter_caller< 3, T, D, BrdColConstant>, + linearColumnFilter_caller< 4, T, D, BrdColConstant>, + linearColumnFilter_caller< 5, T, D, BrdColConstant>, + linearColumnFilter_caller< 6, T, D, BrdColConstant>, + linearColumnFilter_caller< 7, T, D, BrdColConstant>, + linearColumnFilter_caller< 8, T, D, BrdColConstant>, + linearColumnFilter_caller< 9, T, D, BrdColConstant>, + linearColumnFilter_caller<10, T, D, BrdColConstant>, + linearColumnFilter_caller<11, T, D, BrdColConstant>, + linearColumnFilter_caller<12, T, D, BrdColConstant>, + linearColumnFilter_caller<13, T, D, BrdColConstant>, + linearColumnFilter_caller<14, T, D, BrdColConstant>, + linearColumnFilter_caller<15, T, D, BrdColConstant>, + linearColumnFilter_caller<16, T, D, BrdColConstant>, + linearColumnFilter_caller<17, T, D, BrdColConstant>, + linearColumnFilter_caller<18, T, D, BrdColConstant>, + linearColumnFilter_caller<19, T, D, BrdColConstant>, + linearColumnFilter_caller<20, T, D, BrdColConstant>, + linearColumnFilter_caller<21, T, D, BrdColConstant>, + linearColumnFilter_caller<22, T, D, BrdColConstant>, + linearColumnFilter_caller<23, T, D, BrdColConstant>, + linearColumnFilter_caller<24, T, D, BrdColConstant>, + linearColumnFilter_caller<25, T, D, BrdColConstant>, + linearColumnFilter_caller<26, T, D, BrdColConstant>, + linearColumnFilter_caller<27, T, D, BrdColConstant>, + linearColumnFilter_caller<28, T, D, BrdColConstant>, + linearColumnFilter_caller<29, T, D, BrdColConstant>, + linearColumnFilter_caller<30, T, D, BrdColConstant>, + linearColumnFilter_caller<31, T, D, BrdColConstant>, + linearColumnFilter_caller<32, T, D, BrdColConstant> }, { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReflect>, - linearColumnFilter_caller<2 , T, D, BrdColReflect>, - linearColumnFilter_caller<3 , T, D, BrdColReflect>, - linearColumnFilter_caller<4 , T, D, BrdColReflect>, - linearColumnFilter_caller<5 , T, D, BrdColReflect>, - linearColumnFilter_caller<6 , T, D, BrdColReflect>, - linearColumnFilter_caller<7 , T, D, BrdColReflect>, - linearColumnFilter_caller<8 , T, D, BrdColReflect>, - linearColumnFilter_caller<9 , T, D, BrdColReflect>, - linearColumnFilter_caller<10, T, D, BrdColReflect>, - linearColumnFilter_caller<11, T, D, BrdColReflect>, - linearColumnFilter_caller<12, T, D, BrdColReflect>, - linearColumnFilter_caller<13, T, D, BrdColReflect>, - linearColumnFilter_caller<14, T, D, BrdColReflect>, - linearColumnFilter_caller<15, T, D, BrdColReflect>, - linearColumnFilter_caller<16, T, D, BrdColReflect> + 0, + linearColumnFilter_caller< 1, T, D, BrdColReflect>, + linearColumnFilter_caller< 2, T, D, BrdColReflect>, + linearColumnFilter_caller< 3, T, D, BrdColReflect>, + linearColumnFilter_caller< 4, T, D, BrdColReflect>, + linearColumnFilter_caller< 5, T, D, BrdColReflect>, + linearColumnFilter_caller< 6, T, D, BrdColReflect>, + linearColumnFilter_caller< 7, T, D, BrdColReflect>, + linearColumnFilter_caller< 8, T, D, BrdColReflect>, + linearColumnFilter_caller< 9, T, D, BrdColReflect>, + linearColumnFilter_caller<10, T, D, BrdColReflect>, + linearColumnFilter_caller<11, T, D, BrdColReflect>, + linearColumnFilter_caller<12, T, D, BrdColReflect>, + linearColumnFilter_caller<13, T, D, BrdColReflect>, + linearColumnFilter_caller<14, T, D, BrdColReflect>, + linearColumnFilter_caller<15, T, D, BrdColReflect>, + linearColumnFilter_caller<16, T, D, BrdColReflect>, + linearColumnFilter_caller<17, T, D, BrdColReflect>, + linearColumnFilter_caller<18, T, D, BrdColReflect>, + linearColumnFilter_caller<19, T, D, BrdColReflect>, + linearColumnFilter_caller<20, T, D, BrdColReflect>, + linearColumnFilter_caller<21, T, D, BrdColReflect>, + linearColumnFilter_caller<22, T, D, BrdColReflect>, + linearColumnFilter_caller<23, T, D, BrdColReflect>, + linearColumnFilter_caller<24, T, D, BrdColReflect>, + linearColumnFilter_caller<25, T, D, BrdColReflect>, + linearColumnFilter_caller<26, T, D, BrdColReflect>, + linearColumnFilter_caller<27, T, D, BrdColReflect>, + linearColumnFilter_caller<28, T, D, BrdColReflect>, + linearColumnFilter_caller<29, T, D, BrdColReflect>, + linearColumnFilter_caller<30, T, D, BrdColReflect>, + linearColumnFilter_caller<31, T, D, BrdColReflect>, + linearColumnFilter_caller<32, T, D, BrdColReflect> }, { - 0, - linearColumnFilter_caller<1 , T, D, BrdColWrap>, - linearColumnFilter_caller<2 , T, D, BrdColWrap>, - linearColumnFilter_caller<3 , T, D, BrdColWrap>, - linearColumnFilter_caller<4 , T, D, BrdColWrap>, - linearColumnFilter_caller<5 , T, D, BrdColWrap>, - linearColumnFilter_caller<6 , T, D, BrdColWrap>, - linearColumnFilter_caller<7 , T, D, BrdColWrap>, - linearColumnFilter_caller<8 , T, D, BrdColWrap>, - linearColumnFilter_caller<9 , T, D, BrdColWrap>, - linearColumnFilter_caller<10, T, D, BrdColWrap>, - linearColumnFilter_caller<11, T, D, BrdColWrap>, - linearColumnFilter_caller<12, T, D, BrdColWrap>, - linearColumnFilter_caller<13, T, D, BrdColWrap>, - linearColumnFilter_caller<14, T, D, BrdColWrap>, - linearColumnFilter_caller<15, T, D, BrdColWrap>, + 0, + linearColumnFilter_caller< 1, T, D, BrdColWrap>, + linearColumnFilter_caller< 2, T, D, BrdColWrap>, + linearColumnFilter_caller< 3, T, D, BrdColWrap>, + linearColumnFilter_caller< 4, T, D, BrdColWrap>, + linearColumnFilter_caller< 5, T, D, BrdColWrap>, + linearColumnFilter_caller< 6, T, D, BrdColWrap>, + linearColumnFilter_caller< 7, T, D, BrdColWrap>, + linearColumnFilter_caller< 8, T, D, BrdColWrap>, + linearColumnFilter_caller< 9, T, D, BrdColWrap>, + linearColumnFilter_caller<10, T, D, BrdColWrap>, + linearColumnFilter_caller<11, T, D, BrdColWrap>, + linearColumnFilter_caller<12, T, D, BrdColWrap>, + linearColumnFilter_caller<13, T, D, BrdColWrap>, + linearColumnFilter_caller<14, T, D, BrdColWrap>, + linearColumnFilter_caller<15, T, D, BrdColWrap>, linearColumnFilter_caller<16, T, D, BrdColWrap>, - } + linearColumnFilter_caller<17, T, D, BrdColWrap>, + linearColumnFilter_caller<18, T, D, BrdColWrap>, + linearColumnFilter_caller<19, T, D, BrdColWrap>, + linearColumnFilter_caller<20, T, D, BrdColWrap>, + linearColumnFilter_caller<21, T, D, BrdColWrap>, + linearColumnFilter_caller<22, T, D, BrdColWrap>, + linearColumnFilter_caller<23, T, D, BrdColWrap>, + linearColumnFilter_caller<24, T, D, BrdColWrap>, + linearColumnFilter_caller<25, T, D, BrdColWrap>, + linearColumnFilter_caller<26, T, D, BrdColWrap>, + linearColumnFilter_caller<27, T, D, BrdColWrap>, + linearColumnFilter_caller<28, T, D, BrdColWrap>, + linearColumnFilter_caller<29, T, D, BrdColWrap>, + linearColumnFilter_caller<30, T, D, BrdColWrap>, + linearColumnFilter_caller<31, T, D, BrdColWrap>, + linearColumnFilter_caller<32, T, D, BrdColWrap> + } }; loadKernel(kernel, ksize); @@ -236,12 +325,10 @@ namespace cv { namespace gpu { namespace device callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, stream); } - template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - //template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - //template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); } // namespace column_filter }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu index 0855d2bf0b..1e4d3cc692 100644 --- a/modules/gpu/src/cuda/row_filter.cu +++ b/modules/gpu/src/cuda/row_filter.cu @@ -46,17 +46,14 @@ #include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/border_interpolate.hpp" +#include "opencv2/gpu/device/static_check.hpp" namespace cv { namespace gpu { namespace device { - #define MAX_KERNEL_SIZE 16 - #define BLOCK_DIM_X 16 - #define BLOCK_DIM_Y 4 - #define RESULT_STEPS 8 - #define HALO_STEPS 1 - namespace row_filter { + #define MAX_KERNEL_SIZE 32 + __constant__ float c_kernel[MAX_KERNEL_SIZE]; void loadKernel(const float kernel[], int ksize) @@ -64,87 +61,74 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); } - namespace detail - { - template struct SmemType - { - typedef typename TypeVec::cn>::vec_type smem_t; - }; - - template struct SmemType - { - typedef T smem_t; - }; - } - - template struct SmemType + template + __global__ void linearRowFilter(const DevMem2D_ src, PtrStep dst, const int anchor, const B brd) { - typedef typename detail::SmemType::smem_t smem_t; - }; + Static::check(); + Static= KSIZE>::check(); + Static::cn == VecTraits::cn>::check(); - template - __global__ void linearRowFilter(const DevMem2D_ src, PtrStep dst, int anchor, const B b) - { - typedef typename SmemType::smem_t smem_t; typedef typename TypeVec::cn>::vec_type sum_t; - __shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X]; - - //Offset to the left halo edge - const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x; + __shared__ typename sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X]; + const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; - if (y < src.rows) - { - const T* src_row = src.ptr(y); + if (y >= src.rows) + return; - //Load main data - #pragma unroll - for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) - smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row); + const T* src_row = src.ptr(y); - //Load left halo - #pragma unroll - for(int i = 0; i < HALO_STEPS; ++i) - smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row); + const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x; - //Load right halo - #pragma unroll - for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i) - smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row); + //Load left halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); - __syncthreads(); + //Load main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); - D* dst_row = dst.ptr(y); + //Load right halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); - #pragma unroll - for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) - { - sum_t sum = VecTraits::all(0); + __syncthreads(); + + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + { + const int x = xStart + j * BLOCK_DIM_X; - #pragma unroll - for (int j = 0; j < KERNEL_SIZE; ++j) - sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j]; + if (x >= src.cols) + return; - int dstX = x + i * BLOCK_DIM_X; + sum_t sum = VecTraits::all(0); - if (dstX < src.cols) - dst_row[dstX] = saturate_cast(sum); - } + #pragma unroll + for (int k = 0; k < KSIZE; ++k) + sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; + + dst(y, x) = saturate_cast(sum); } } - template class B> - void linearRowFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) + template class B> + void linearRowFilter_caller(DevMem2D_ src, DevMem2D_ dst, int anchor, cudaStream_t stream) { - typedef typename SmemType::smem_t smem_t; + const int BLOCK_DIM_X = 32; + const int BLOCK_DIM_Y = 8; + const int PATCH_PER_BLOCK = 4; const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y)); - B b(src.cols); + B brd(src.cols); - linearRowFilter<<>>(src, dst, anchor, b); + linearRowFilter<<>>(src, dst, anchor, brd); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -152,106 +136,187 @@ namespace cv { namespace gpu { namespace device } template - void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) + void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) { - typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); - static const caller_t callers[5][17] = + typedef void (*caller_t)(DevMem2D_ src, DevMem2D_ dst, int anchor, cudaStream_t stream); + + static const caller_t callers[5][33] = { { - 0, - linearRowFilter_caller<1 , T, D, BrdRowReflect101>, - linearRowFilter_caller<2 , T, D, BrdRowReflect101>, - linearRowFilter_caller<3 , T, D, BrdRowReflect101>, - linearRowFilter_caller<4 , T, D, BrdRowReflect101>, - linearRowFilter_caller<5 , T, D, BrdRowReflect101>, - linearRowFilter_caller<6 , T, D, BrdRowReflect101>, - linearRowFilter_caller<7 , T, D, BrdRowReflect101>, - linearRowFilter_caller<8 , T, D, BrdRowReflect101>, - linearRowFilter_caller<9 , T, D, BrdRowReflect101>, - linearRowFilter_caller<10, T, D, BrdRowReflect101>, - linearRowFilter_caller<11, T, D, BrdRowReflect101>, - linearRowFilter_caller<12, T, D, BrdRowReflect101>, - linearRowFilter_caller<13, T, D, BrdRowReflect101>, + 0, + linearRowFilter_caller< 1, T, D, BrdRowReflect101>, + linearRowFilter_caller< 2, T, D, BrdRowReflect101>, + linearRowFilter_caller< 3, T, D, BrdRowReflect101>, + linearRowFilter_caller< 4, T, D, BrdRowReflect101>, + linearRowFilter_caller< 5, T, D, BrdRowReflect101>, + linearRowFilter_caller< 6, T, D, BrdRowReflect101>, + linearRowFilter_caller< 7, T, D, BrdRowReflect101>, + linearRowFilter_caller< 8, T, D, BrdRowReflect101>, + linearRowFilter_caller< 9, T, D, BrdRowReflect101>, + linearRowFilter_caller<10, T, D, BrdRowReflect101>, + linearRowFilter_caller<11, T, D, BrdRowReflect101>, + linearRowFilter_caller<12, T, D, BrdRowReflect101>, + linearRowFilter_caller<13, T, D, BrdRowReflect101>, linearRowFilter_caller<14, T, D, BrdRowReflect101>, - linearRowFilter_caller<15, T, D, BrdRowReflect101>, - linearRowFilter_caller<16, T, D, BrdRowReflect101> + linearRowFilter_caller<15, T, D, BrdRowReflect101>, + linearRowFilter_caller<16, T, D, BrdRowReflect101>, + linearRowFilter_caller<17, T, D, BrdRowReflect101>, + linearRowFilter_caller<18, T, D, BrdRowReflect101>, + linearRowFilter_caller<19, T, D, BrdRowReflect101>, + linearRowFilter_caller<20, T, D, BrdRowReflect101>, + linearRowFilter_caller<21, T, D, BrdRowReflect101>, + linearRowFilter_caller<22, T, D, BrdRowReflect101>, + linearRowFilter_caller<23, T, D, BrdRowReflect101>, + linearRowFilter_caller<24, T, D, BrdRowReflect101>, + linearRowFilter_caller<25, T, D, BrdRowReflect101>, + linearRowFilter_caller<26, T, D, BrdRowReflect101>, + linearRowFilter_caller<27, T, D, BrdRowReflect101>, + linearRowFilter_caller<28, T, D, BrdRowReflect101>, + linearRowFilter_caller<29, T, D, BrdRowReflect101>, + linearRowFilter_caller<30, T, D, BrdRowReflect101>, + linearRowFilter_caller<31, T, D, BrdRowReflect101>, + linearRowFilter_caller<32, T, D, BrdRowReflect101> }, { - 0, - linearRowFilter_caller<1 , T, D, BrdRowReplicate>, - linearRowFilter_caller<2 , T, D, BrdRowReplicate>, - linearRowFilter_caller<3 , T, D, BrdRowReplicate>, - linearRowFilter_caller<4 , T, D, BrdRowReplicate>, - linearRowFilter_caller<5 , T, D, BrdRowReplicate>, - linearRowFilter_caller<6 , T, D, BrdRowReplicate>, - linearRowFilter_caller<7 , T, D, BrdRowReplicate>, - linearRowFilter_caller<8 , T, D, BrdRowReplicate>, - linearRowFilter_caller<9 , T, D, BrdRowReplicate>, - linearRowFilter_caller<10, T, D, BrdRowReplicate>, - linearRowFilter_caller<11, T, D, BrdRowReplicate>, - linearRowFilter_caller<12, T, D, BrdRowReplicate>, - linearRowFilter_caller<13, T, D, BrdRowReplicate>, + 0, + linearRowFilter_caller< 1, T, D, BrdRowReplicate>, + linearRowFilter_caller< 2, T, D, BrdRowReplicate>, + linearRowFilter_caller< 3, T, D, BrdRowReplicate>, + linearRowFilter_caller< 4, T, D, BrdRowReplicate>, + linearRowFilter_caller< 5, T, D, BrdRowReplicate>, + linearRowFilter_caller< 6, T, D, BrdRowReplicate>, + linearRowFilter_caller< 7, T, D, BrdRowReplicate>, + linearRowFilter_caller< 8, T, D, BrdRowReplicate>, + linearRowFilter_caller< 9, T, D, BrdRowReplicate>, + linearRowFilter_caller<10, T, D, BrdRowReplicate>, + linearRowFilter_caller<11, T, D, BrdRowReplicate>, + linearRowFilter_caller<12, T, D, BrdRowReplicate>, + linearRowFilter_caller<13, T, D, BrdRowReplicate>, linearRowFilter_caller<14, T, D, BrdRowReplicate>, - linearRowFilter_caller<15, T, D, BrdRowReplicate>, - linearRowFilter_caller<16, T, D, BrdRowReplicate> + linearRowFilter_caller<15, T, D, BrdRowReplicate>, + linearRowFilter_caller<16, T, D, BrdRowReplicate>, + linearRowFilter_caller<17, T, D, BrdRowReplicate>, + linearRowFilter_caller<18, T, D, BrdRowReplicate>, + linearRowFilter_caller<19, T, D, BrdRowReplicate>, + linearRowFilter_caller<20, T, D, BrdRowReplicate>, + linearRowFilter_caller<21, T, D, BrdRowReplicate>, + linearRowFilter_caller<22, T, D, BrdRowReplicate>, + linearRowFilter_caller<23, T, D, BrdRowReplicate>, + linearRowFilter_caller<24, T, D, BrdRowReplicate>, + linearRowFilter_caller<25, T, D, BrdRowReplicate>, + linearRowFilter_caller<26, T, D, BrdRowReplicate>, + linearRowFilter_caller<27, T, D, BrdRowReplicate>, + linearRowFilter_caller<28, T, D, BrdRowReplicate>, + linearRowFilter_caller<29, T, D, BrdRowReplicate>, + linearRowFilter_caller<30, T, D, BrdRowReplicate>, + linearRowFilter_caller<31, T, D, BrdRowReplicate>, + linearRowFilter_caller<32, T, D, BrdRowReplicate> }, { - 0, - linearRowFilter_caller<1 , T, D, BrdRowConstant>, - linearRowFilter_caller<2 , T, D, BrdRowConstant>, - linearRowFilter_caller<3 , T, D, BrdRowConstant>, - linearRowFilter_caller<4 , T, D, BrdRowConstant>, - linearRowFilter_caller<5 , T, D, BrdRowConstant>, - linearRowFilter_caller<6 , T, D, BrdRowConstant>, - linearRowFilter_caller<7 , T, D, BrdRowConstant>, - linearRowFilter_caller<8 , T, D, BrdRowConstant>, - linearRowFilter_caller<9 , T, D, BrdRowConstant>, - linearRowFilter_caller<10, T, D, BrdRowConstant>, - linearRowFilter_caller<11, T, D, BrdRowConstant>, - linearRowFilter_caller<12, T, D, BrdRowConstant>, + 0, + linearRowFilter_caller< 1, T, D, BrdRowConstant>, + linearRowFilter_caller< 2, T, D, BrdRowConstant>, + linearRowFilter_caller< 3, T, D, BrdRowConstant>, + linearRowFilter_caller< 4, T, D, BrdRowConstant>, + linearRowFilter_caller< 5, T, D, BrdRowConstant>, + linearRowFilter_caller< 6, T, D, BrdRowConstant>, + linearRowFilter_caller< 7, T, D, BrdRowConstant>, + linearRowFilter_caller< 8, T, D, BrdRowConstant>, + linearRowFilter_caller< 9, T, D, BrdRowConstant>, + linearRowFilter_caller<10, T, D, BrdRowConstant>, + linearRowFilter_caller<11, T, D, BrdRowConstant>, + linearRowFilter_caller<12, T, D, BrdRowConstant>, linearRowFilter_caller<13, T, D, BrdRowConstant>, linearRowFilter_caller<14, T, D, BrdRowConstant>, - linearRowFilter_caller<15, T, D, BrdRowConstant>, - linearRowFilter_caller<16, T, D, BrdRowConstant> + linearRowFilter_caller<15, T, D, BrdRowConstant>, + linearRowFilter_caller<16, T, D, BrdRowConstant>, + linearRowFilter_caller<17, T, D, BrdRowConstant>, + linearRowFilter_caller<18, T, D, BrdRowConstant>, + linearRowFilter_caller<19, T, D, BrdRowConstant>, + linearRowFilter_caller<20, T, D, BrdRowConstant>, + linearRowFilter_caller<21, T, D, BrdRowConstant>, + linearRowFilter_caller<22, T, D, BrdRowConstant>, + linearRowFilter_caller<23, T, D, BrdRowConstant>, + linearRowFilter_caller<24, T, D, BrdRowConstant>, + linearRowFilter_caller<25, T, D, BrdRowConstant>, + linearRowFilter_caller<26, T, D, BrdRowConstant>, + linearRowFilter_caller<27, T, D, BrdRowConstant>, + linearRowFilter_caller<28, T, D, BrdRowConstant>, + linearRowFilter_caller<29, T, D, BrdRowConstant>, + linearRowFilter_caller<30, T, D, BrdRowConstant>, + linearRowFilter_caller<31, T, D, BrdRowConstant>, + linearRowFilter_caller<32, T, D, BrdRowConstant> }, { - 0, - linearRowFilter_caller<1 , T, D, BrdRowReflect>, - linearRowFilter_caller<2 , T, D, BrdRowReflect>, - linearRowFilter_caller<3 , T, D, BrdRowReflect>, - linearRowFilter_caller<4 , T, D, BrdRowReflect>, - linearRowFilter_caller<5 , T, D, BrdRowReflect>, - linearRowFilter_caller<6 , T, D, BrdRowReflect>, - linearRowFilter_caller<7 , T, D, BrdRowReflect>, - linearRowFilter_caller<8 , T, D, BrdRowReflect>, - linearRowFilter_caller<9 , T, D, BrdRowReflect>, - linearRowFilter_caller<10, T, D, BrdRowReflect>, - linearRowFilter_caller<11, T, D, BrdRowReflect>, - linearRowFilter_caller<12, T, D, BrdRowReflect>, + 0, + linearRowFilter_caller< 1, T, D, BrdRowReflect>, + linearRowFilter_caller< 2, T, D, BrdRowReflect>, + linearRowFilter_caller< 3, T, D, BrdRowReflect>, + linearRowFilter_caller< 4, T, D, BrdRowReflect>, + linearRowFilter_caller< 5, T, D, BrdRowReflect>, + linearRowFilter_caller< 6, T, D, BrdRowReflect>, + linearRowFilter_caller< 7, T, D, BrdRowReflect>, + linearRowFilter_caller< 8, T, D, BrdRowReflect>, + linearRowFilter_caller< 9, T, D, BrdRowReflect>, + linearRowFilter_caller<10, T, D, BrdRowReflect>, + linearRowFilter_caller<11, T, D, BrdRowReflect>, + linearRowFilter_caller<12, T, D, BrdRowReflect>, linearRowFilter_caller<13, T, D, BrdRowReflect>, linearRowFilter_caller<14, T, D, BrdRowReflect>, - linearRowFilter_caller<15, T, D, BrdRowReflect>, - linearRowFilter_caller<16, T, D, BrdRowReflect> + linearRowFilter_caller<15, T, D, BrdRowReflect>, + linearRowFilter_caller<16, T, D, BrdRowReflect>, + linearRowFilter_caller<17, T, D, BrdRowReflect>, + linearRowFilter_caller<18, T, D, BrdRowReflect>, + linearRowFilter_caller<19, T, D, BrdRowReflect>, + linearRowFilter_caller<20, T, D, BrdRowReflect>, + linearRowFilter_caller<21, T, D, BrdRowReflect>, + linearRowFilter_caller<22, T, D, BrdRowReflect>, + linearRowFilter_caller<23, T, D, BrdRowReflect>, + linearRowFilter_caller<24, T, D, BrdRowReflect>, + linearRowFilter_caller<25, T, D, BrdRowReflect>, + linearRowFilter_caller<26, T, D, BrdRowReflect>, + linearRowFilter_caller<27, T, D, BrdRowReflect>, + linearRowFilter_caller<28, T, D, BrdRowReflect>, + linearRowFilter_caller<29, T, D, BrdRowReflect>, + linearRowFilter_caller<30, T, D, BrdRowReflect>, + linearRowFilter_caller<31, T, D, BrdRowReflect>, + linearRowFilter_caller<32, T, D, BrdRowReflect> }, { - 0, - linearRowFilter_caller<1 , T, D, BrdRowWrap>, - linearRowFilter_caller<2 , T, D, BrdRowWrap>, - linearRowFilter_caller<3 , T, D, BrdRowWrap>, - linearRowFilter_caller<4 , T, D, BrdRowWrap>, - linearRowFilter_caller<5 , T, D, BrdRowWrap>, - linearRowFilter_caller<6 , T, D, BrdRowWrap>, - linearRowFilter_caller<7 , T, D, BrdRowWrap>, - linearRowFilter_caller<8 , T, D, BrdRowWrap>, - linearRowFilter_caller<9 , T, D, BrdRowWrap>, - linearRowFilter_caller<10, T, D, BrdRowWrap>, - linearRowFilter_caller<11, T, D, BrdRowWrap>, - linearRowFilter_caller<12, T, D, BrdRowWrap>, + 0, + linearRowFilter_caller< 1, T, D, BrdRowWrap>, + linearRowFilter_caller< 2, T, D, BrdRowWrap>, + linearRowFilter_caller< 3, T, D, BrdRowWrap>, + linearRowFilter_caller< 4, T, D, BrdRowWrap>, + linearRowFilter_caller< 5, T, D, BrdRowWrap>, + linearRowFilter_caller< 6, T, D, BrdRowWrap>, + linearRowFilter_caller< 7, T, D, BrdRowWrap>, + linearRowFilter_caller< 8, T, D, BrdRowWrap>, + linearRowFilter_caller< 9, T, D, BrdRowWrap>, + linearRowFilter_caller<10, T, D, BrdRowWrap>, + linearRowFilter_caller<11, T, D, BrdRowWrap>, + linearRowFilter_caller<12, T, D, BrdRowWrap>, linearRowFilter_caller<13, T, D, BrdRowWrap>, linearRowFilter_caller<14, T, D, BrdRowWrap>, - linearRowFilter_caller<15, T, D, BrdRowWrap>, - linearRowFilter_caller<16, T, D, BrdRowWrap> - } + linearRowFilter_caller<15, T, D, BrdRowWrap>, + linearRowFilter_caller<16, T, D, BrdRowWrap>, + linearRowFilter_caller<17, T, D, BrdRowWrap>, + linearRowFilter_caller<18, T, D, BrdRowWrap>, + linearRowFilter_caller<19, T, D, BrdRowWrap>, + linearRowFilter_caller<20, T, D, BrdRowWrap>, + linearRowFilter_caller<21, T, D, BrdRowWrap>, + linearRowFilter_caller<22, T, D, BrdRowWrap>, + linearRowFilter_caller<23, T, D, BrdRowWrap>, + linearRowFilter_caller<24, T, D, BrdRowWrap>, + linearRowFilter_caller<25, T, D, BrdRowWrap>, + linearRowFilter_caller<26, T, D, BrdRowWrap>, + linearRowFilter_caller<27, T, D, BrdRowWrap>, + linearRowFilter_caller<28, T, D, BrdRowWrap>, + linearRowFilter_caller<29, T, D, BrdRowWrap>, + linearRowFilter_caller<30, T, D, BrdRowWrap>, + linearRowFilter_caller<31, T, D, BrdRowWrap>, + linearRowFilter_caller<32, T, D, BrdRowWrap> + } }; loadKernel(kernel, ksize); @@ -259,12 +324,10 @@ namespace cv { namespace gpu { namespace device callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, stream); } - template void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - //template void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - //template void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); } // namespace row_filter }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp index fb3cec42f6..45e2cd03e4 100644 --- a/modules/gpu/src/filtering.cpp +++ b/modules/gpu/src/filtering.cpp @@ -740,13 +740,13 @@ namespace cv { namespace gpu { namespace device namespace row_filter { template - void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); } namespace column_filter { template - void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); } }}} @@ -755,7 +755,7 @@ namespace typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor); - typedef void (*gpuFilter1D_t)(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + typedef void (*gpuFilter1D_t)(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); struct NppLinearRowFilter : public BaseRowFilter_GPU { @@ -825,8 +825,7 @@ Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, int gpuBorderType; CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType)); - CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 /*|| srcType == CV_16SC1*/ /*|| srcType == CV_16SC2*/ - || srcType == CV_16SC3 || srcType == CV_32SC1 || srcType == CV_32FC1); + CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_16SC3 || srcType == CV_32SC1 || srcType == CV_32FC1); CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType)); @@ -836,7 +835,7 @@ Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, int ksize = cont_krnl.cols; - CV_Assert(ksize > 0 && ksize <= 16); + CV_Assert(ksize > 0 && ksize <= 32); normalizeAnchor(anchor, ksize); @@ -850,12 +849,6 @@ Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, case CV_8UC4: func = linearRowFilter_gpu; break; - /*case CV_16SC1: - func = linearRowFilter_gpu; - break;*/ - /*case CV_16SC2: - func = linearRowFilter_gpu; - break;*/ case CV_16SC3: func = linearRowFilter_gpu; break; @@ -940,8 +933,7 @@ Ptr cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds int gpuBorderType; CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType)); - CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 /*|| dstType == CV_16SC1*/ /*|| dstType == CV_16SC2*/ - || dstType == CV_16SC3 || dstType == CV_32SC1 || dstType == CV_32FC1); + CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC3 || dstType == CV_32SC1 || dstType == CV_32FC1); CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType)); @@ -951,7 +943,7 @@ Ptr cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds int ksize = cont_krnl.cols; - CV_Assert(ksize > 0 && ksize <= 16); + CV_Assert(ksize > 0 && ksize <= 32); normalizeAnchor(anchor, ksize); @@ -965,12 +957,6 @@ Ptr cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds case CV_8UC4: func = linearColumnFilter_gpu; break; - /*case CV_16SC1: - func = linearColumnFilter_gpu; - break;*/ - /*case CV_16SC2: - func = linearColumnFilter_gpu; - break;*/ case CV_16SC3: func = linearColumnFilter_gpu; break; diff --git a/modules/gpu/test/test_filters.cpp b/modules/gpu/test/test_filters.cpp index 4d282fa3b8..d5c668f63c 100644 --- a/modules/gpu/test/test_filters.cpp +++ b/modules/gpu/test/test_filters.cpp @@ -188,7 +188,7 @@ TEST_P(Sobel, Rgba) dev_dst_rgba.download(dst_rgba); - EXPECT_MAT_NEAR_KSIZE(dst_gold_rgba, dst_rgba, ksize, 0.0); + EXPECT_MAT_NEAR(dst_gold_rgba, dst_rgba, 0.0); } TEST_P(Sobel, Gray) @@ -204,7 +204,7 @@ TEST_P(Sobel, Gray) dev_dst_gray.download(dst_gray); - EXPECT_MAT_NEAR_KSIZE(dst_gold_gray, dst_gray, ksize, 0.0); + EXPECT_MAT_NEAR(dst_gold_gray, dst_gray, 0.0); } INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine( @@ -342,7 +342,7 @@ TEST_P(GaussianBlur, Rgba) dev_dst_rgba.download(dst_rgba); - EXPECT_MAT_NEAR_KSIZE(dst_gold_rgba, dst_rgba, ksize, 3.0); + EXPECT_MAT_NEAR(dst_gold_rgba, dst_rgba, 4.0); } TEST_P(GaussianBlur, Gray) @@ -355,12 +355,12 @@ TEST_P(GaussianBlur, Gray) dev_dst_gray.download(dst_gray); - EXPECT_MAT_NEAR_KSIZE(dst_gold_gray, dst_gray, ksize, 3.0); + EXPECT_MAT_NEAR(dst_gold_gray, dst_gray, 4.0); } INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine( ALL_DEVICES, - Values(cv::Size(3, 3), cv::Size(5, 5), cv::Size(7, 7)), + Values(cv::Size(3, 3), cv::Size(5, 5), cv::Size(7, 7), cv::Size(9, 9), cv::Size(11, 11), cv::Size(13, 13), cv::Size(15, 15), cv::Size(17, 17), cv::Size(19, 19), cv::Size(21, 21), cv::Size(23, 23), cv::Size(25, 25), cv::Size(27, 27), cv::Size(29, 29), cv::Size(31, 31)), USE_ROI)); /////////////////////////////////////////////////////////////////////////////////////////////////