|
|
|
@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace filters |
|
|
|
|
|
|
|
|
|
namespace filter_krnls |
|
|
|
|
{ |
|
|
|
|
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, int CN, typename T, typename D> |
|
|
|
|
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D> |
|
|
|
|
__global__ void linearRowFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height) |
|
|
|
|
{ |
|
|
|
|
__shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; |
|
|
|
@ -102,7 +102,7 @@ namespace filter_krnls |
|
|
|
|
|
|
|
|
|
if (threadX < width) |
|
|
|
|
{ |
|
|
|
|
typedef typename TypeVec<float, CN>::vec_t sum_t; |
|
|
|
|
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t; |
|
|
|
|
sum_t sum = VecTraits<sum_t>::all(0); |
|
|
|
|
|
|
|
|
|
sDataRow += threadIdx.x + blockDim.x - anchor; |
|
|
|
@ -119,7 +119,7 @@ namespace filter_krnls |
|
|
|
|
|
|
|
|
|
namespace cv { namespace gpu { namespace filters |
|
|
|
|
{ |
|
|
|
|
template <int KERNEL_SIZE, int CN, typename T, typename D> |
|
|
|
|
template <int KERNEL_SIZE, typename T, typename D> |
|
|
|
|
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor) |
|
|
|
|
{ |
|
|
|
|
const int BLOCK_DIM_X = 16; |
|
|
|
@ -128,85 +128,50 @@ namespace cv { namespace gpu { namespace filters |
|
|
|
|
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); |
|
|
|
|
dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); |
|
|
|
|
|
|
|
|
|
filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.data, src.step/src.elemSize(), |
|
|
|
|
filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.data, src.step/src.elemSize(), |
|
|
|
|
dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows); |
|
|
|
|
|
|
|
|
|
cudaSafeCall( cudaThreadSynchronize() ); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
template <int CN, typename T, typename D> |
|
|
|
|
inline void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
template <typename T, typename D> |
|
|
|
|
void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor); |
|
|
|
|
static const caller_t callers[] = |
|
|
|
|
{linearRowFilter_caller<0 , CN, T, D>, linearRowFilter_caller<1 , CN, T, D>, |
|
|
|
|
linearRowFilter_caller<2 , CN, T, D>, linearRowFilter_caller<3 , CN, T, D>, |
|
|
|
|
linearRowFilter_caller<4 , CN, T, D>, linearRowFilter_caller<5 , CN, T, D>, |
|
|
|
|
linearRowFilter_caller<6 , CN, T, D>, linearRowFilter_caller<7 , CN, T, D>, |
|
|
|
|
linearRowFilter_caller<8 , CN, T, D>, linearRowFilter_caller<9 , CN, T, D>, |
|
|
|
|
linearRowFilter_caller<10, CN, T, D>, linearRowFilter_caller<11, CN, T, D>, |
|
|
|
|
linearRowFilter_caller<12, CN, T, D>, linearRowFilter_caller<13, CN, T, D>, |
|
|
|
|
linearRowFilter_caller<14, CN, T, D>, linearRowFilter_caller<15, CN, T, D>}; |
|
|
|
|
{linearRowFilter_caller<0 , T, D>, linearRowFilter_caller<1 , T, D>, |
|
|
|
|
linearRowFilter_caller<2 , T, D>, linearRowFilter_caller<3 , T, D>, |
|
|
|
|
linearRowFilter_caller<4 , T, D>, linearRowFilter_caller<5 , T, D>, |
|
|
|
|
linearRowFilter_caller<6 , T, D>, linearRowFilter_caller<7 , T, D>, |
|
|
|
|
linearRowFilter_caller<8 , T, D>, linearRowFilter_caller<9 , T, D>, |
|
|
|
|
linearRowFilter_caller<10, T, D>, linearRowFilter_caller<11, T, D>, |
|
|
|
|
linearRowFilter_caller<12, T, D>, linearRowFilter_caller<13, T, D>, |
|
|
|
|
linearRowFilter_caller<14, T, D>, linearRowFilter_caller<15, T, D>}; |
|
|
|
|
|
|
|
|
|
loadLinearKernel(kernel, ksize); |
|
|
|
|
|
|
|
|
|
callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
template void linearRowFilter_gpu<4, uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearRowFilter_gpu<uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearRowFilter_gpu<uchar4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearRowFilter_gpu<char4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearRowFilter_gpu<char4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
|
|
|
|
|
/* void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<4, uchar4, uchar4>(src, dst, kernel, ksize, anchor); |
|
|
|
|
}*/ |
|
|
|
|
void linearRowFilter_gpu_8u_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<4, uchar4, char4>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearRowFilter_gpu_8s_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<4, char4, uchar4>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearRowFilter_gpu_8s_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<4, char4, char4>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearRowFilter_gpu_16u_16u_c2(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<2, ushort2, ushort2>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearRowFilter_gpu_16u_16s_c2(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<2, ushort2, short2>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearRowFilter_gpu_16s_16u_c2(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<2, short2, ushort2>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearRowFilter_gpu_16s_16s_c2(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<2, short2, short2>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearRowFilter_gpu_32s_32s_c1(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<1, int, int>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearRowFilter_gpu_32s_32f_c1(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<1, int, float>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearRowFilter_gpu_32f_32s_c1(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<1, float, int>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearRowFilter_gpu_32f_32f_c1(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearRowFilter_gpu<1 ,float, float>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
template void linearRowFilter_gpu<ushort2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearRowFilter_gpu<ushort2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearRowFilter_gpu<short2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearRowFilter_gpu<short2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
|
|
|
|
|
template void linearRowFilter_gpu<int, int>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearRowFilter_gpu<int, float>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearRowFilter_gpu<float, int>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearRowFilter_gpu<float, float>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
}}} |
|
|
|
|
|
|
|
|
|
namespace filter_krnls |
|
|
|
|
{ |
|
|
|
|
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, int CN, typename T, typename D> |
|
|
|
|
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D> |
|
|
|
|
__global__ void linearColumnFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height) |
|
|
|
|
{ |
|
|
|
|
__shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; |
|
|
|
@ -238,7 +203,7 @@ namespace filter_krnls |
|
|
|
|
|
|
|
|
|
if (threadY < height) |
|
|
|
|
{ |
|
|
|
|
typedef typename TypeVec<float, CN>::vec_t sum_t; |
|
|
|
|
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t; |
|
|
|
|
sum_t sum = VecTraits<sum_t>::all(0); |
|
|
|
|
|
|
|
|
|
sDataColumn += (threadIdx.y + blockDim.y - anchor)* smem_step; |
|
|
|
@ -255,7 +220,7 @@ namespace filter_krnls |
|
|
|
|
|
|
|
|
|
namespace cv { namespace gpu { namespace filters |
|
|
|
|
{ |
|
|
|
|
template <int KERNEL_SIZE, int CN, typename T, typename D> |
|
|
|
|
template <int KERNEL_SIZE, typename T, typename D> |
|
|
|
|
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor) |
|
|
|
|
{ |
|
|
|
|
const int BLOCK_DIM_X = 16; |
|
|
|
@ -264,78 +229,45 @@ namespace cv { namespace gpu { namespace filters |
|
|
|
|
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); |
|
|
|
|
dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); |
|
|
|
|
|
|
|
|
|
filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.data, src.step/src.elemSize(), |
|
|
|
|
filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.data, src.step/src.elemSize(), |
|
|
|
|
dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows); |
|
|
|
|
|
|
|
|
|
cudaSafeCall( cudaThreadSynchronize() ); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
template <int CN, typename T, typename D> |
|
|
|
|
inline void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
template <typename T, typename D> |
|
|
|
|
void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor); |
|
|
|
|
static const caller_t callers[] = |
|
|
|
|
{linearColumnFilter_caller<0 , CN, T, D>, linearColumnFilter_caller<1 , CN, T, D>, |
|
|
|
|
linearColumnFilter_caller<2 , CN, T, D>, linearColumnFilter_caller<3 , CN, T, D>, |
|
|
|
|
linearColumnFilter_caller<4 , CN, T, D>, linearColumnFilter_caller<5 , CN, T, D>, |
|
|
|
|
linearColumnFilter_caller<6 , CN, T, D>, linearColumnFilter_caller<7 , CN, T, D>, |
|
|
|
|
linearColumnFilter_caller<8 , CN, T, D>, linearColumnFilter_caller<9 , CN, T, D>, |
|
|
|
|
linearColumnFilter_caller<10, CN, T, D>, linearColumnFilter_caller<11, CN, T, D>, |
|
|
|
|
linearColumnFilter_caller<12, CN, T, D>, linearColumnFilter_caller<13, CN, T, D>, |
|
|
|
|
linearColumnFilter_caller<14, CN, T, D>, linearColumnFilter_caller<15, CN, T, D>}; |
|
|
|
|
{linearColumnFilter_caller<0 , T, D>, linearColumnFilter_caller<1 , T, D>, |
|
|
|
|
linearColumnFilter_caller<2 , T, D>, linearColumnFilter_caller<3 , T, D>, |
|
|
|
|
linearColumnFilter_caller<4 , T, D>, linearColumnFilter_caller<5 , T, D>, |
|
|
|
|
linearColumnFilter_caller<6 , T, D>, linearColumnFilter_caller<7 , T, D>, |
|
|
|
|
linearColumnFilter_caller<8 , T, D>, linearColumnFilter_caller<9 , T, D>, |
|
|
|
|
linearColumnFilter_caller<10, T, D>, linearColumnFilter_caller<11, T, D>, |
|
|
|
|
linearColumnFilter_caller<12, T, D>, linearColumnFilter_caller<13, T, D>, |
|
|
|
|
linearColumnFilter_caller<14, T, D>, linearColumnFilter_caller<15, T, D>}; |
|
|
|
|
|
|
|
|
|
loadLinearKernel(kernel, ksize); |
|
|
|
|
|
|
|
|
|
callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void linearColumnFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<4, uchar4, uchar4>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearColumnFilter_gpu_8u_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<4, uchar4, char4>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearColumnFilter_gpu_8s_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<4, char4, uchar4>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearColumnFilter_gpu_8s_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<4, char4, char4>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearColumnFilter_gpu_16u_16u_c2(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<2, ushort2, ushort2>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearColumnFilter_gpu_16u_16s_c2(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<2, ushort2, short2>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearColumnFilter_gpu_16s_16u_c2(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<2, short2, ushort2>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearColumnFilter_gpu_16s_16s_c2(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<2, short2, short2>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearColumnFilter_gpu_32s_32s_c1(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<1, int, int>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearColumnFilter_gpu_32s_32f_c1(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<1, int, float>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearColumnFilter_gpu_32f_32s_c1(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<1, float, int>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
void linearColumnFilter_gpu_32f_32f_c1(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) |
|
|
|
|
{ |
|
|
|
|
linearColumnFilter_gpu<1, float, float>(src, dst, kernel, ksize, anchor); |
|
|
|
|
} |
|
|
|
|
template void linearColumnFilter_gpu<uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearColumnFilter_gpu<uchar4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearColumnFilter_gpu<char4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearColumnFilter_gpu<char4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
|
|
|
|
|
template void linearColumnFilter_gpu<ushort2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearColumnFilter_gpu<ushort2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearColumnFilter_gpu<short2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearColumnFilter_gpu<short2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
|
|
|
|
|
template void linearColumnFilter_gpu<int, int>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearColumnFilter_gpu<int, float>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearColumnFilter_gpu<float, int>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
template void linearColumnFilter_gpu<float, float>(const DevMem2D&, const DevMem2D&, const float[], int , int); |
|
|
|
|
}}} |
|
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////// |
|
|
|
|