diff --git a/modules/gpu/src/cuda/column_filter.0.cu b/modules/gpu/src/cuda/column_filter.0.cu new file mode 100644 index 0000000000..c35c6ee64d --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.0.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn<float, uchar>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.1.cu b/modules/gpu/src/cuda/column_filter.1.cu new file mode 100644 index 0000000000..9a2d6a0427 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.1.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.2.cu b/modules/gpu/src/cuda/column_filter.2.cu new file mode 100644 index 0000000000..05ee01c763 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.2.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.3.cu b/modules/gpu/src/cuda/column_filter.3.cu new file mode 100644 index 0000000000..1bf49219f9 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.3.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.4.cu b/modules/gpu/src/cuda/column_filter.4.cu new file mode 100644 index 0000000000..bec7a085a0 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.4.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn<float, int>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.5.cu b/modules/gpu/src/cuda/column_filter.5.cu new file mode 100644 index 0000000000..8194ee39aa --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.5.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn<float, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.6.cu b/modules/gpu/src/cuda/column_filter.6.cu new file mode 100644 index 0000000000..d8fc49be68 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.6.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.7.cu b/modules/gpu/src/cuda/column_filter.7.cu new file mode 100644 index 0000000000..534bd821ef --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.7.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu deleted file mode 100644 index af7369ad5e..0000000000 --- a/modules/gpu/src/cuda/column_filter.cu +++ /dev/null @@ -1,391 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if !defined CUDA_DISABLER - -#include "internal_shared.hpp" -#include "opencv2/gpu/device/saturate_cast.hpp" -#include "opencv2/gpu/device/vec_math.hpp" -#include "opencv2/gpu/device/limits.hpp" -#include "opencv2/gpu/device/border_interpolate.hpp" -#include "opencv2/gpu/device/static_check.hpp" - -namespace cv { namespace gpu { namespace device -{ - namespace column_filter - { - #define MAX_KERNEL_SIZE 32 - - __constant__ float c_kernel[MAX_KERNEL_SIZE]; - - void loadKernel(const float* kernel, int ksize, cudaStream_t stream) - { - if (stream == 0) - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); - else - cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); - } - - template <int KSIZE, typename T, typename D, typename B> - __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd) - { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 16; - const int PATCH_PER_BLOCK = 4; - const int HALO_SIZE = KSIZE <= 16 ? 1 : 2; - #else - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 8; - const int PATCH_PER_BLOCK = 2; - const int HALO_SIZE = 2; - #endif - - typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; - - __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; - - const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; - - if (x >= src.cols) - return; - - const T* src_col = src.ptr() + x; - - const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; - - if (blockIdx.y > 0) - { - //Upper halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); - } - else - { - //Upper halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); - } - - if (blockIdx.y + 2 < gridDim.y) - { - //Main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x)); - - //Lower halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); - } - else - { - //Main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); - - //Lower halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); - } - - __syncthreads(); - - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - { - const int y = yStart + j * BLOCK_DIM_Y; - - if (y < src.rows) - { - sum_t sum = VecTraits<sum_t>::all(0); - - #pragma unroll - for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; - - dst(y, x) = saturate_cast<D>(sum); - } - } - } - - template <int KSIZE, typename T, typename D, template<typename> class B> - void linearColumnFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream) - { - int BLOCK_DIM_X; - int BLOCK_DIM_Y; - int PATCH_PER_BLOCK; - - if (cc >= 20) - { - BLOCK_DIM_X = 16; - BLOCK_DIM_Y = 16; - PATCH_PER_BLOCK = 4; - } - else - { - BLOCK_DIM_X = 16; - BLOCK_DIM_Y = 8; - PATCH_PER_BLOCK = 2; - } - - const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); - - B<T> brd(src.rows); - - linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd); - - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template <typename T, typename D> - void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) - { - typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream); - - static const caller_t callers[5][33] = - { - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReflect101>, - linearColumnFilter_caller< 2, T, D, BrdColReflect101>, - linearColumnFilter_caller< 3, T, D, BrdColReflect101>, - linearColumnFilter_caller< 4, T, D, BrdColReflect101>, - linearColumnFilter_caller< 5, T, D, BrdColReflect101>, - linearColumnFilter_caller< 6, T, D, BrdColReflect101>, - linearColumnFilter_caller< 7, T, D, BrdColReflect101>, - linearColumnFilter_caller< 8, T, D, BrdColReflect101>, - linearColumnFilter_caller< 9, T, D, BrdColReflect101>, - linearColumnFilter_caller<10, T, D, BrdColReflect101>, - linearColumnFilter_caller<11, T, D, BrdColReflect101>, - linearColumnFilter_caller<12, T, D, BrdColReflect101>, - linearColumnFilter_caller<13, T, D, BrdColReflect101>, - linearColumnFilter_caller<14, T, D, BrdColReflect101>, - linearColumnFilter_caller<15, T, D, BrdColReflect101>, - linearColumnFilter_caller<16, T, D, BrdColReflect101>, - linearColumnFilter_caller<17, T, D, BrdColReflect101>, - linearColumnFilter_caller<18, T, D, BrdColReflect101>, - linearColumnFilter_caller<19, T, D, BrdColReflect101>, - linearColumnFilter_caller<20, T, D, BrdColReflect101>, - linearColumnFilter_caller<21, T, D, BrdColReflect101>, - linearColumnFilter_caller<22, T, D, BrdColReflect101>, - linearColumnFilter_caller<23, T, D, BrdColReflect101>, - linearColumnFilter_caller<24, T, D, BrdColReflect101>, - linearColumnFilter_caller<25, T, D, BrdColReflect101>, - linearColumnFilter_caller<26, T, D, BrdColReflect101>, - linearColumnFilter_caller<27, T, D, BrdColReflect101>, - linearColumnFilter_caller<28, T, D, BrdColReflect101>, - linearColumnFilter_caller<29, T, D, BrdColReflect101>, - linearColumnFilter_caller<30, T, D, BrdColReflect101>, - linearColumnFilter_caller<31, T, D, BrdColReflect101>, - linearColumnFilter_caller<32, T, D, BrdColReflect101> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReplicate>, - linearColumnFilter_caller< 2, T, D, BrdColReplicate>, - linearColumnFilter_caller< 3, T, D, BrdColReplicate>, - linearColumnFilter_caller< 4, T, D, BrdColReplicate>, - linearColumnFilter_caller< 5, T, D, BrdColReplicate>, - linearColumnFilter_caller< 6, T, D, BrdColReplicate>, - linearColumnFilter_caller< 7, T, D, BrdColReplicate>, - linearColumnFilter_caller< 8, T, D, BrdColReplicate>, - linearColumnFilter_caller< 9, T, D, BrdColReplicate>, - linearColumnFilter_caller<10, T, D, BrdColReplicate>, - linearColumnFilter_caller<11, T, D, BrdColReplicate>, - linearColumnFilter_caller<12, T, D, BrdColReplicate>, - linearColumnFilter_caller<13, T, D, BrdColReplicate>, - linearColumnFilter_caller<14, T, D, BrdColReplicate>, - linearColumnFilter_caller<15, T, D, BrdColReplicate>, - linearColumnFilter_caller<16, T, D, BrdColReplicate>, - linearColumnFilter_caller<17, T, D, BrdColReplicate>, - linearColumnFilter_caller<18, T, D, BrdColReplicate>, - linearColumnFilter_caller<19, T, D, BrdColReplicate>, - linearColumnFilter_caller<20, T, D, BrdColReplicate>, - linearColumnFilter_caller<21, T, D, BrdColReplicate>, - linearColumnFilter_caller<22, T, D, BrdColReplicate>, - linearColumnFilter_caller<23, T, D, BrdColReplicate>, - linearColumnFilter_caller<24, T, D, BrdColReplicate>, - linearColumnFilter_caller<25, T, D, BrdColReplicate>, - linearColumnFilter_caller<26, T, D, BrdColReplicate>, - linearColumnFilter_caller<27, T, D, BrdColReplicate>, - linearColumnFilter_caller<28, T, D, BrdColReplicate>, - linearColumnFilter_caller<29, T, D, BrdColReplicate>, - linearColumnFilter_caller<30, T, D, BrdColReplicate>, - linearColumnFilter_caller<31, T, D, BrdColReplicate>, - linearColumnFilter_caller<32, T, D, BrdColReplicate> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColConstant>, - linearColumnFilter_caller< 2, T, D, BrdColConstant>, - linearColumnFilter_caller< 3, T, D, BrdColConstant>, - linearColumnFilter_caller< 4, T, D, BrdColConstant>, - linearColumnFilter_caller< 5, T, D, BrdColConstant>, - linearColumnFilter_caller< 6, T, D, BrdColConstant>, - linearColumnFilter_caller< 7, T, D, BrdColConstant>, - linearColumnFilter_caller< 8, T, D, BrdColConstant>, - linearColumnFilter_caller< 9, T, D, BrdColConstant>, - linearColumnFilter_caller<10, T, D, BrdColConstant>, - linearColumnFilter_caller<11, T, D, BrdColConstant>, - linearColumnFilter_caller<12, T, D, BrdColConstant>, - linearColumnFilter_caller<13, T, D, BrdColConstant>, - linearColumnFilter_caller<14, T, D, BrdColConstant>, - linearColumnFilter_caller<15, T, D, BrdColConstant>, - linearColumnFilter_caller<16, T, D, BrdColConstant>, - linearColumnFilter_caller<17, T, D, BrdColConstant>, - linearColumnFilter_caller<18, T, D, BrdColConstant>, - linearColumnFilter_caller<19, T, D, BrdColConstant>, - linearColumnFilter_caller<20, T, D, BrdColConstant>, - linearColumnFilter_caller<21, T, D, BrdColConstant>, - linearColumnFilter_caller<22, T, D, BrdColConstant>, - linearColumnFilter_caller<23, T, D, BrdColConstant>, - linearColumnFilter_caller<24, T, D, BrdColConstant>, - linearColumnFilter_caller<25, T, D, BrdColConstant>, - linearColumnFilter_caller<26, T, D, BrdColConstant>, - linearColumnFilter_caller<27, T, D, BrdColConstant>, - linearColumnFilter_caller<28, T, D, BrdColConstant>, - linearColumnFilter_caller<29, T, D, BrdColConstant>, - linearColumnFilter_caller<30, T, D, BrdColConstant>, - linearColumnFilter_caller<31, T, D, BrdColConstant>, - linearColumnFilter_caller<32, T, D, BrdColConstant> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReflect>, - linearColumnFilter_caller< 2, T, D, BrdColReflect>, - linearColumnFilter_caller< 3, T, D, BrdColReflect>, - linearColumnFilter_caller< 4, T, D, BrdColReflect>, - linearColumnFilter_caller< 5, T, D, BrdColReflect>, - linearColumnFilter_caller< 6, T, D, BrdColReflect>, - linearColumnFilter_caller< 7, T, D, BrdColReflect>, - linearColumnFilter_caller< 8, T, D, BrdColReflect>, - linearColumnFilter_caller< 9, T, D, BrdColReflect>, - linearColumnFilter_caller<10, T, D, BrdColReflect>, - linearColumnFilter_caller<11, T, D, BrdColReflect>, - linearColumnFilter_caller<12, T, D, BrdColReflect>, - linearColumnFilter_caller<13, T, D, BrdColReflect>, - linearColumnFilter_caller<14, T, D, BrdColReflect>, - linearColumnFilter_caller<15, T, D, BrdColReflect>, - linearColumnFilter_caller<16, T, D, BrdColReflect>, - linearColumnFilter_caller<17, T, D, BrdColReflect>, - linearColumnFilter_caller<18, T, D, BrdColReflect>, - linearColumnFilter_caller<19, T, D, BrdColReflect>, - linearColumnFilter_caller<20, T, D, BrdColReflect>, - linearColumnFilter_caller<21, T, D, BrdColReflect>, - linearColumnFilter_caller<22, T, D, BrdColReflect>, - linearColumnFilter_caller<23, T, D, BrdColReflect>, - linearColumnFilter_caller<24, T, D, BrdColReflect>, - linearColumnFilter_caller<25, T, D, BrdColReflect>, - linearColumnFilter_caller<26, T, D, BrdColReflect>, - linearColumnFilter_caller<27, T, D, BrdColReflect>, - linearColumnFilter_caller<28, T, D, BrdColReflect>, - linearColumnFilter_caller<29, T, D, BrdColReflect>, - linearColumnFilter_caller<30, T, D, BrdColReflect>, - linearColumnFilter_caller<31, T, D, BrdColReflect>, - linearColumnFilter_caller<32, T, D, BrdColReflect> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColWrap>, - linearColumnFilter_caller< 2, T, D, BrdColWrap>, - linearColumnFilter_caller< 3, T, D, BrdColWrap>, - linearColumnFilter_caller< 4, T, D, BrdColWrap>, - linearColumnFilter_caller< 5, T, D, BrdColWrap>, - linearColumnFilter_caller< 6, T, D, BrdColWrap>, - linearColumnFilter_caller< 7, T, D, BrdColWrap>, - linearColumnFilter_caller< 8, T, D, BrdColWrap>, - linearColumnFilter_caller< 9, T, D, BrdColWrap>, - linearColumnFilter_caller<10, T, D, BrdColWrap>, - linearColumnFilter_caller<11, T, D, BrdColWrap>, - linearColumnFilter_caller<12, T, D, BrdColWrap>, - linearColumnFilter_caller<13, T, D, BrdColWrap>, - linearColumnFilter_caller<14, T, D, BrdColWrap>, - linearColumnFilter_caller<15, T, D, BrdColWrap>, - linearColumnFilter_caller<16, T, D, BrdColWrap>, - linearColumnFilter_caller<17, T, D, BrdColWrap>, - linearColumnFilter_caller<18, T, D, BrdColWrap>, - linearColumnFilter_caller<19, T, D, BrdColWrap>, - linearColumnFilter_caller<20, T, D, BrdColWrap>, - linearColumnFilter_caller<21, T, D, BrdColWrap>, - linearColumnFilter_caller<22, T, D, BrdColWrap>, - linearColumnFilter_caller<23, T, D, BrdColWrap>, - linearColumnFilter_caller<24, T, D, BrdColWrap>, - linearColumnFilter_caller<25, T, D, BrdColWrap>, - linearColumnFilter_caller<26, T, D, BrdColWrap>, - linearColumnFilter_caller<27, T, D, BrdColWrap>, - linearColumnFilter_caller<28, T, D, BrdColWrap>, - linearColumnFilter_caller<29, T, D, BrdColWrap>, - linearColumnFilter_caller<30, T, D, BrdColWrap>, - linearColumnFilter_caller<31, T, D, BrdColWrap>, - linearColumnFilter_caller<32, T, D, BrdColWrap> - } - }; - - loadKernel(kernel, ksize, stream); - - callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream); - } - - template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu<float , int >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } // namespace column_filter -}}} // namespace cv { namespace gpu { namespace device - - -#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.h b/modules/gpu/src/cuda/column_filter.h new file mode 100644 index 0000000000..dbcd09fa35 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.h @@ -0,0 +1,378 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" +#include "opencv2/gpu/device/vec_math.hpp" +#include "opencv2/gpu/device/border_interpolate.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace +{ + #define MAX_KERNEL_SIZE 32 + + __constant__ float c_kernel[MAX_KERNEL_SIZE]; + + void loadKernel(const float* kernel, int ksize, cudaStream_t stream) + { + if (stream == 0) + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); + } + + template <int KSIZE, typename T, typename D, typename B> + __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 16; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = KSIZE <= 16 ? 1 : 2; + #else + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 8; + const int PATCH_PER_BLOCK = 2; + const int HALO_SIZE = 2; + #endif + + typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; + + __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; + + const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; + + if (x >= src.cols) + return; + + const T* src_col = src.ptr() + x; + + const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; + + if (blockIdx.y > 0) + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); + } + else + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); + } + + if (blockIdx.y + 2 < gridDim.y) + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x)); + + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); + } + else + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); + + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + { + const int y = yStart + j * BLOCK_DIM_Y; + + if (y < src.rows) + { + sum_t sum = VecTraits<sum_t>::all(0); + + #pragma unroll + for (int k = 0; k < KSIZE; ++k) + sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; + + dst(y, x) = saturate_cast<D>(sum); + } + } + } + + template <int KSIZE, typename T, typename D, template<typename> class B> + void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream) + { + int BLOCK_DIM_X; + int BLOCK_DIM_Y; + int PATCH_PER_BLOCK; + + if (cc >= 20) + { + BLOCK_DIM_X = 16; + BLOCK_DIM_Y = 16; + PATCH_PER_BLOCK = 4; + } + else + { + BLOCK_DIM_X = 16; + BLOCK_DIM_Y = 8; + PATCH_PER_BLOCK = 2; + } + + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); + + B<T> brd(src.rows); + + linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd); + + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +namespace filter +{ + template <typename T, typename D> + void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) + { + typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream); + + static const caller_t callers[5][33] = + { + { + 0, + ::caller< 1, T, D, BrdColReflect101>, + ::caller< 2, T, D, BrdColReflect101>, + ::caller< 3, T, D, BrdColReflect101>, + ::caller< 4, T, D, BrdColReflect101>, + ::caller< 5, T, D, BrdColReflect101>, + ::caller< 6, T, D, BrdColReflect101>, + ::caller< 7, T, D, BrdColReflect101>, + ::caller< 8, T, D, BrdColReflect101>, + ::caller< 9, T, D, BrdColReflect101>, + ::caller<10, T, D, BrdColReflect101>, + ::caller<11, T, D, BrdColReflect101>, + ::caller<12, T, D, BrdColReflect101>, + ::caller<13, T, D, BrdColReflect101>, + ::caller<14, T, D, BrdColReflect101>, + ::caller<15, T, D, BrdColReflect101>, + ::caller<16, T, D, BrdColReflect101>, + ::caller<17, T, D, BrdColReflect101>, + ::caller<18, T, D, BrdColReflect101>, + ::caller<19, T, D, BrdColReflect101>, + ::caller<20, T, D, BrdColReflect101>, + ::caller<21, T, D, BrdColReflect101>, + ::caller<22, T, D, BrdColReflect101>, + ::caller<23, T, D, BrdColReflect101>, + ::caller<24, T, D, BrdColReflect101>, + ::caller<25, T, D, BrdColReflect101>, + ::caller<26, T, D, BrdColReflect101>, + ::caller<27, T, D, BrdColReflect101>, + ::caller<28, T, D, BrdColReflect101>, + ::caller<29, T, D, BrdColReflect101>, + ::caller<30, T, D, BrdColReflect101>, + ::caller<31, T, D, BrdColReflect101>, + ::caller<32, T, D, BrdColReflect101> + }, + { + 0, + ::caller< 1, T, D, BrdColReplicate>, + ::caller< 2, T, D, BrdColReplicate>, + ::caller< 3, T, D, BrdColReplicate>, + ::caller< 4, T, D, BrdColReplicate>, + ::caller< 5, T, D, BrdColReplicate>, + ::caller< 6, T, D, BrdColReplicate>, + ::caller< 7, T, D, BrdColReplicate>, + ::caller< 8, T, D, BrdColReplicate>, + ::caller< 9, T, D, BrdColReplicate>, + ::caller<10, T, D, BrdColReplicate>, + ::caller<11, T, D, BrdColReplicate>, + ::caller<12, T, D, BrdColReplicate>, + ::caller<13, T, D, BrdColReplicate>, + ::caller<14, T, D, BrdColReplicate>, + ::caller<15, T, D, BrdColReplicate>, + ::caller<16, T, D, BrdColReplicate>, + ::caller<17, T, D, BrdColReplicate>, + ::caller<18, T, D, BrdColReplicate>, + ::caller<19, T, D, BrdColReplicate>, + ::caller<20, T, D, BrdColReplicate>, + ::caller<21, T, D, BrdColReplicate>, + ::caller<22, T, D, BrdColReplicate>, + ::caller<23, T, D, BrdColReplicate>, + ::caller<24, T, D, BrdColReplicate>, + ::caller<25, T, D, BrdColReplicate>, + ::caller<26, T, D, BrdColReplicate>, + ::caller<27, T, D, BrdColReplicate>, + ::caller<28, T, D, BrdColReplicate>, + ::caller<29, T, D, BrdColReplicate>, + ::caller<30, T, D, BrdColReplicate>, + ::caller<31, T, D, BrdColReplicate>, + ::caller<32, T, D, BrdColReplicate> + }, + { + 0, + ::caller< 1, T, D, BrdColConstant>, + ::caller< 2, T, D, BrdColConstant>, + ::caller< 3, T, D, BrdColConstant>, + ::caller< 4, T, D, BrdColConstant>, + ::caller< 5, T, D, BrdColConstant>, + ::caller< 6, T, D, BrdColConstant>, + ::caller< 7, T, D, BrdColConstant>, + ::caller< 8, T, D, BrdColConstant>, + ::caller< 9, T, D, BrdColConstant>, + ::caller<10, T, D, BrdColConstant>, + ::caller<11, T, D, BrdColConstant>, + ::caller<12, T, D, BrdColConstant>, + ::caller<13, T, D, BrdColConstant>, + ::caller<14, T, D, BrdColConstant>, + ::caller<15, T, D, BrdColConstant>, + ::caller<16, T, D, BrdColConstant>, + ::caller<17, T, D, BrdColConstant>, + ::caller<18, T, D, BrdColConstant>, + ::caller<19, T, D, BrdColConstant>, + ::caller<20, T, D, BrdColConstant>, + ::caller<21, T, D, BrdColConstant>, + ::caller<22, T, D, BrdColConstant>, + ::caller<23, T, D, BrdColConstant>, + ::caller<24, T, D, BrdColConstant>, + ::caller<25, T, D, BrdColConstant>, + ::caller<26, T, D, BrdColConstant>, + ::caller<27, T, D, BrdColConstant>, + ::caller<28, T, D, BrdColConstant>, + ::caller<29, T, D, BrdColConstant>, + ::caller<30, T, D, BrdColConstant>, + ::caller<31, T, D, BrdColConstant>, + ::caller<32, T, D, BrdColConstant> + }, + { + 0, + ::caller< 1, T, D, BrdColReflect>, + ::caller< 2, T, D, BrdColReflect>, + ::caller< 3, T, D, BrdColReflect>, + ::caller< 4, T, D, BrdColReflect>, + ::caller< 5, T, D, BrdColReflect>, + ::caller< 6, T, D, BrdColReflect>, + ::caller< 7, T, D, BrdColReflect>, + ::caller< 8, T, D, BrdColReflect>, + ::caller< 9, T, D, BrdColReflect>, + ::caller<10, T, D, BrdColReflect>, + ::caller<11, T, D, BrdColReflect>, + ::caller<12, T, D, BrdColReflect>, + ::caller<13, T, D, BrdColReflect>, + ::caller<14, T, D, BrdColReflect>, + ::caller<15, T, D, BrdColReflect>, + ::caller<16, T, D, BrdColReflect>, + ::caller<17, T, D, BrdColReflect>, + ::caller<18, T, D, BrdColReflect>, + ::caller<19, T, D, BrdColReflect>, + ::caller<20, T, D, BrdColReflect>, + ::caller<21, T, D, BrdColReflect>, + ::caller<22, T, D, BrdColReflect>, + ::caller<23, T, D, BrdColReflect>, + ::caller<24, T, D, BrdColReflect>, + ::caller<25, T, D, BrdColReflect>, + ::caller<26, T, D, BrdColReflect>, + ::caller<27, T, D, BrdColReflect>, + ::caller<28, T, D, BrdColReflect>, + ::caller<29, T, D, BrdColReflect>, + ::caller<30, T, D, BrdColReflect>, + ::caller<31, T, D, BrdColReflect>, + ::caller<32, T, D, BrdColReflect> + }, + { + 0, + ::caller< 1, T, D, BrdColWrap>, + ::caller< 2, T, D, BrdColWrap>, + ::caller< 3, T, D, BrdColWrap>, + ::caller< 4, T, D, BrdColWrap>, + ::caller< 5, T, D, BrdColWrap>, + ::caller< 6, T, D, BrdColWrap>, + ::caller< 7, T, D, BrdColWrap>, + ::caller< 8, T, D, BrdColWrap>, + ::caller< 9, T, D, BrdColWrap>, + ::caller<10, T, D, BrdColWrap>, + ::caller<11, T, D, BrdColWrap>, + ::caller<12, T, D, BrdColWrap>, + ::caller<13, T, D, BrdColWrap>, + ::caller<14, T, D, BrdColWrap>, + ::caller<15, T, D, BrdColWrap>, + ::caller<16, T, D, BrdColWrap>, + ::caller<17, T, D, BrdColWrap>, + ::caller<18, T, D, BrdColWrap>, + ::caller<19, T, D, BrdColWrap>, + ::caller<20, T, D, BrdColWrap>, + ::caller<21, T, D, BrdColWrap>, + ::caller<22, T, D, BrdColWrap>, + ::caller<23, T, D, BrdColWrap>, + ::caller<24, T, D, BrdColWrap>, + ::caller<25, T, D, BrdColWrap>, + ::caller<26, T, D, BrdColWrap>, + ::caller<27, T, D, BrdColWrap>, + ::caller<28, T, D, BrdColWrap>, + ::caller<29, T, D, BrdColWrap>, + ::caller<30, T, D, BrdColWrap>, + ::caller<31, T, D, BrdColWrap>, + ::caller<32, T, D, BrdColWrap> + } + }; + + ::loadKernel(kernel, ksize, stream); + + callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream); + } +} diff --git a/modules/gpu/src/cuda/row_filter.0.cu b/modules/gpu/src/cuda/row_filter.0.cu new file mode 100644 index 0000000000..a1a8f36cad --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.0.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow<uchar, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.1.cu b/modules/gpu/src/cuda/row_filter.1.cu new file mode 100644 index 0000000000..ab2248e1b2 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.1.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow<uchar3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.2.cu b/modules/gpu/src/cuda/row_filter.2.cu new file mode 100644 index 0000000000..5aa2e2b80a --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.2.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.3.cu b/modules/gpu/src/cuda/row_filter.3.cu new file mode 100644 index 0000000000..9d131a959d --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.3.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.4.cu b/modules/gpu/src/cuda/row_filter.4.cu new file mode 100644 index 0000000000..0aae534ce7 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.4.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow<int, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.5.cu b/modules/gpu/src/cuda/row_filter.5.cu new file mode 100644 index 0000000000..dd1f2be135 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.5.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow<float, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.6.cu b/modules/gpu/src/cuda/row_filter.6.cu new file mode 100644 index 0000000000..548069d363 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.6.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.7.cu b/modules/gpu/src/cuda/row_filter.7.cu new file mode 100644 index 0000000000..8c5c09ed93 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.7.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu deleted file mode 100644 index 39fc53fdc4..0000000000 --- a/modules/gpu/src/cuda/row_filter.cu +++ /dev/null @@ -1,390 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if !defined CUDA_DISABLER - -#include "internal_shared.hpp" -#include "opencv2/gpu/device/saturate_cast.hpp" -#include "opencv2/gpu/device/vec_math.hpp" -#include "opencv2/gpu/device/limits.hpp" -#include "opencv2/gpu/device/border_interpolate.hpp" -#include "opencv2/gpu/device/static_check.hpp" - -namespace cv { namespace gpu { namespace device -{ - namespace row_filter - { - #define MAX_KERNEL_SIZE 32 - - __constant__ float c_kernel[MAX_KERNEL_SIZE]; - - void loadKernel(const float* kernel, int ksize, cudaStream_t stream) - { - if (stream == 0) - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); - else - cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); - } - - template <int KSIZE, typename T, typename D, typename B> - __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd) - { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) - const int BLOCK_DIM_X = 32; - const int BLOCK_DIM_Y = 8; - const int PATCH_PER_BLOCK = 4; - const int HALO_SIZE = 1; - #else - const int BLOCK_DIM_X = 32; - const int BLOCK_DIM_Y = 4; - const int PATCH_PER_BLOCK = 4; - const int HALO_SIZE = 1; - #endif - - typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; - - __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X]; - - const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; - - if (y >= src.rows) - return; - - const T* src_row = src.ptr(y); - - const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x; - - if (blockIdx.x > 0) - { - //Load left halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]); - } - else - { - //Load left halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); - } - - if (blockIdx.x + 2 < gridDim.x) - { - //Load main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]); - - //Load right halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]); - } - else - { - //Load main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); - - //Load right halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); - } - - __syncthreads(); - - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - { - const int x = xStart + j * BLOCK_DIM_X; - - if (x < src.cols) - { - sum_t sum = VecTraits<sum_t>::all(0); - - #pragma unroll - for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; - - dst(y, x) = saturate_cast<D>(sum); - } - } - } - - template <int KSIZE, typename T, typename D, template<typename> class B> - void linearRowFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream) - { - int BLOCK_DIM_X; - int BLOCK_DIM_Y; - int PATCH_PER_BLOCK; - - if (cc >= 20) - { - BLOCK_DIM_X = 32; - BLOCK_DIM_Y = 8; - PATCH_PER_BLOCK = 4; - } - else - { - BLOCK_DIM_X = 32; - BLOCK_DIM_Y = 4; - PATCH_PER_BLOCK = 4; - } - - const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y)); - - B<T> brd(src.cols); - - linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template <typename T, typename D> - void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) - { - typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream); - - static const caller_t callers[5][33] = - { - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowReflect101>, - linearRowFilter_caller< 2, T, D, BrdRowReflect101>, - linearRowFilter_caller< 3, T, D, BrdRowReflect101>, - linearRowFilter_caller< 4, T, D, BrdRowReflect101>, - linearRowFilter_caller< 5, T, D, BrdRowReflect101>, - linearRowFilter_caller< 6, T, D, BrdRowReflect101>, - linearRowFilter_caller< 7, T, D, BrdRowReflect101>, - linearRowFilter_caller< 8, T, D, BrdRowReflect101>, - linearRowFilter_caller< 9, T, D, BrdRowReflect101>, - linearRowFilter_caller<10, T, D, BrdRowReflect101>, - linearRowFilter_caller<11, T, D, BrdRowReflect101>, - linearRowFilter_caller<12, T, D, BrdRowReflect101>, - linearRowFilter_caller<13, T, D, BrdRowReflect101>, - linearRowFilter_caller<14, T, D, BrdRowReflect101>, - linearRowFilter_caller<15, T, D, BrdRowReflect101>, - linearRowFilter_caller<16, T, D, BrdRowReflect101>, - linearRowFilter_caller<17, T, D, BrdRowReflect101>, - linearRowFilter_caller<18, T, D, BrdRowReflect101>, - linearRowFilter_caller<19, T, D, BrdRowReflect101>, - linearRowFilter_caller<20, T, D, BrdRowReflect101>, - linearRowFilter_caller<21, T, D, BrdRowReflect101>, - linearRowFilter_caller<22, T, D, BrdRowReflect101>, - linearRowFilter_caller<23, T, D, BrdRowReflect101>, - linearRowFilter_caller<24, T, D, BrdRowReflect101>, - linearRowFilter_caller<25, T, D, BrdRowReflect101>, - linearRowFilter_caller<26, T, D, BrdRowReflect101>, - linearRowFilter_caller<27, T, D, BrdRowReflect101>, - linearRowFilter_caller<28, T, D, BrdRowReflect101>, - linearRowFilter_caller<29, T, D, BrdRowReflect101>, - linearRowFilter_caller<30, T, D, BrdRowReflect101>, - linearRowFilter_caller<31, T, D, BrdRowReflect101>, - linearRowFilter_caller<32, T, D, BrdRowReflect101> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowReplicate>, - linearRowFilter_caller< 2, T, D, BrdRowReplicate>, - linearRowFilter_caller< 3, T, D, BrdRowReplicate>, - linearRowFilter_caller< 4, T, D, BrdRowReplicate>, - linearRowFilter_caller< 5, T, D, BrdRowReplicate>, - linearRowFilter_caller< 6, T, D, BrdRowReplicate>, - linearRowFilter_caller< 7, T, D, BrdRowReplicate>, - linearRowFilter_caller< 8, T, D, BrdRowReplicate>, - linearRowFilter_caller< 9, T, D, BrdRowReplicate>, - linearRowFilter_caller<10, T, D, BrdRowReplicate>, - linearRowFilter_caller<11, T, D, BrdRowReplicate>, - linearRowFilter_caller<12, T, D, BrdRowReplicate>, - linearRowFilter_caller<13, T, D, BrdRowReplicate>, - linearRowFilter_caller<14, T, D, BrdRowReplicate>, - linearRowFilter_caller<15, T, D, BrdRowReplicate>, - linearRowFilter_caller<16, T, D, BrdRowReplicate>, - linearRowFilter_caller<17, T, D, BrdRowReplicate>, - linearRowFilter_caller<18, T, D, BrdRowReplicate>, - linearRowFilter_caller<19, T, D, BrdRowReplicate>, - linearRowFilter_caller<20, T, D, BrdRowReplicate>, - linearRowFilter_caller<21, T, D, BrdRowReplicate>, - linearRowFilter_caller<22, T, D, BrdRowReplicate>, - linearRowFilter_caller<23, T, D, BrdRowReplicate>, - linearRowFilter_caller<24, T, D, BrdRowReplicate>, - linearRowFilter_caller<25, T, D, BrdRowReplicate>, - linearRowFilter_caller<26, T, D, BrdRowReplicate>, - linearRowFilter_caller<27, T, D, BrdRowReplicate>, - linearRowFilter_caller<28, T, D, BrdRowReplicate>, - linearRowFilter_caller<29, T, D, BrdRowReplicate>, - linearRowFilter_caller<30, T, D, BrdRowReplicate>, - linearRowFilter_caller<31, T, D, BrdRowReplicate>, - linearRowFilter_caller<32, T, D, BrdRowReplicate> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowConstant>, - linearRowFilter_caller< 2, T, D, BrdRowConstant>, - linearRowFilter_caller< 3, T, D, BrdRowConstant>, - linearRowFilter_caller< 4, T, D, BrdRowConstant>, - linearRowFilter_caller< 5, T, D, BrdRowConstant>, - linearRowFilter_caller< 6, T, D, BrdRowConstant>, - linearRowFilter_caller< 7, T, D, BrdRowConstant>, - linearRowFilter_caller< 8, T, D, BrdRowConstant>, - linearRowFilter_caller< 9, T, D, BrdRowConstant>, - linearRowFilter_caller<10, T, D, BrdRowConstant>, - linearRowFilter_caller<11, T, D, BrdRowConstant>, - linearRowFilter_caller<12, T, D, BrdRowConstant>, - linearRowFilter_caller<13, T, D, BrdRowConstant>, - linearRowFilter_caller<14, T, D, BrdRowConstant>, - linearRowFilter_caller<15, T, D, BrdRowConstant>, - linearRowFilter_caller<16, T, D, BrdRowConstant>, - linearRowFilter_caller<17, T, D, BrdRowConstant>, - linearRowFilter_caller<18, T, D, BrdRowConstant>, - linearRowFilter_caller<19, T, D, BrdRowConstant>, - linearRowFilter_caller<20, T, D, BrdRowConstant>, - linearRowFilter_caller<21, T, D, BrdRowConstant>, - linearRowFilter_caller<22, T, D, BrdRowConstant>, - linearRowFilter_caller<23, T, D, BrdRowConstant>, - linearRowFilter_caller<24, T, D, BrdRowConstant>, - linearRowFilter_caller<25, T, D, BrdRowConstant>, - linearRowFilter_caller<26, T, D, BrdRowConstant>, - linearRowFilter_caller<27, T, D, BrdRowConstant>, - linearRowFilter_caller<28, T, D, BrdRowConstant>, - linearRowFilter_caller<29, T, D, BrdRowConstant>, - linearRowFilter_caller<30, T, D, BrdRowConstant>, - linearRowFilter_caller<31, T, D, BrdRowConstant>, - linearRowFilter_caller<32, T, D, BrdRowConstant> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowReflect>, - linearRowFilter_caller< 2, T, D, BrdRowReflect>, - linearRowFilter_caller< 3, T, D, BrdRowReflect>, - linearRowFilter_caller< 4, T, D, BrdRowReflect>, - linearRowFilter_caller< 5, T, D, BrdRowReflect>, - linearRowFilter_caller< 6, T, D, BrdRowReflect>, - linearRowFilter_caller< 7, T, D, BrdRowReflect>, - linearRowFilter_caller< 8, T, D, BrdRowReflect>, - linearRowFilter_caller< 9, T, D, BrdRowReflect>, - linearRowFilter_caller<10, T, D, BrdRowReflect>, - linearRowFilter_caller<11, T, D, BrdRowReflect>, - linearRowFilter_caller<12, T, D, BrdRowReflect>, - linearRowFilter_caller<13, T, D, BrdRowReflect>, - linearRowFilter_caller<14, T, D, BrdRowReflect>, - linearRowFilter_caller<15, T, D, BrdRowReflect>, - linearRowFilter_caller<16, T, D, BrdRowReflect>, - linearRowFilter_caller<17, T, D, BrdRowReflect>, - linearRowFilter_caller<18, T, D, BrdRowReflect>, - linearRowFilter_caller<19, T, D, BrdRowReflect>, - linearRowFilter_caller<20, T, D, BrdRowReflect>, - linearRowFilter_caller<21, T, D, BrdRowReflect>, - linearRowFilter_caller<22, T, D, BrdRowReflect>, - linearRowFilter_caller<23, T, D, BrdRowReflect>, - linearRowFilter_caller<24, T, D, BrdRowReflect>, - linearRowFilter_caller<25, T, D, BrdRowReflect>, - linearRowFilter_caller<26, T, D, BrdRowReflect>, - linearRowFilter_caller<27, T, D, BrdRowReflect>, - linearRowFilter_caller<28, T, D, BrdRowReflect>, - linearRowFilter_caller<29, T, D, BrdRowReflect>, - linearRowFilter_caller<30, T, D, BrdRowReflect>, - linearRowFilter_caller<31, T, D, BrdRowReflect>, - linearRowFilter_caller<32, T, D, BrdRowReflect> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowWrap>, - linearRowFilter_caller< 2, T, D, BrdRowWrap>, - linearRowFilter_caller< 3, T, D, BrdRowWrap>, - linearRowFilter_caller< 4, T, D, BrdRowWrap>, - linearRowFilter_caller< 5, T, D, BrdRowWrap>, - linearRowFilter_caller< 6, T, D, BrdRowWrap>, - linearRowFilter_caller< 7, T, D, BrdRowWrap>, - linearRowFilter_caller< 8, T, D, BrdRowWrap>, - linearRowFilter_caller< 9, T, D, BrdRowWrap>, - linearRowFilter_caller<10, T, D, BrdRowWrap>, - linearRowFilter_caller<11, T, D, BrdRowWrap>, - linearRowFilter_caller<12, T, D, BrdRowWrap>, - linearRowFilter_caller<13, T, D, BrdRowWrap>, - linearRowFilter_caller<14, T, D, BrdRowWrap>, - linearRowFilter_caller<15, T, D, BrdRowWrap>, - linearRowFilter_caller<16, T, D, BrdRowWrap>, - linearRowFilter_caller<17, T, D, BrdRowWrap>, - linearRowFilter_caller<18, T, D, BrdRowWrap>, - linearRowFilter_caller<19, T, D, BrdRowWrap>, - linearRowFilter_caller<20, T, D, BrdRowWrap>, - linearRowFilter_caller<21, T, D, BrdRowWrap>, - linearRowFilter_caller<22, T, D, BrdRowWrap>, - linearRowFilter_caller<23, T, D, BrdRowWrap>, - linearRowFilter_caller<24, T, D, BrdRowWrap>, - linearRowFilter_caller<25, T, D, BrdRowWrap>, - linearRowFilter_caller<26, T, D, BrdRowWrap>, - linearRowFilter_caller<27, T, D, BrdRowWrap>, - linearRowFilter_caller<28, T, D, BrdRowWrap>, - linearRowFilter_caller<29, T, D, BrdRowWrap>, - linearRowFilter_caller<30, T, D, BrdRowWrap>, - linearRowFilter_caller<31, T, D, BrdRowWrap>, - linearRowFilter_caller<32, T, D, BrdRowWrap> - } - }; - - loadKernel(kernel, ksize, stream); - - callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream); - } - - template void linearRowFilter_gpu<uchar , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu<uchar3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu<int , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } // namespace row_filter -}}} // namespace cv { namespace gpu { namespace device - - -#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.h b/modules/gpu/src/cuda/row_filter.h new file mode 100644 index 0000000000..0da2dfe0c5 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.h @@ -0,0 +1,377 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" +#include "opencv2/gpu/device/vec_math.hpp" +#include "opencv2/gpu/device/border_interpolate.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace +{ + #define MAX_KERNEL_SIZE 32 + + __constant__ float c_kernel[MAX_KERNEL_SIZE]; + + void loadKernel(const float* kernel, int ksize, cudaStream_t stream) + { + if (stream == 0) + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); + } + + template <int KSIZE, typename T, typename D, typename B> + __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) + const int BLOCK_DIM_X = 32; + const int BLOCK_DIM_Y = 8; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = 1; + #else + const int BLOCK_DIM_X = 32; + const int BLOCK_DIM_Y = 4; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = 1; + #endif + + typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; + + __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X]; + + const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; + + if (y >= src.rows) + return; + + const T* src_row = src.ptr(y); + + const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x; + + if (blockIdx.x > 0) + { + //Load left halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]); + } + else + { + //Load left halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); + } + + if (blockIdx.x + 2 < gridDim.x) + { + //Load main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]); + + //Load right halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]); + } + else + { + //Load main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); + + //Load right halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + { + const int x = xStart + j * BLOCK_DIM_X; + + if (x < src.cols) + { + sum_t sum = VecTraits<sum_t>::all(0); + + #pragma unroll + for (int k = 0; k < KSIZE; ++k) + sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; + + dst(y, x) = saturate_cast<D>(sum); + } + } + } + + template <int KSIZE, typename T, typename D, template<typename> class B> + void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream) + { + int BLOCK_DIM_X; + int BLOCK_DIM_Y; + int PATCH_PER_BLOCK; + + if (cc >= 20) + { + BLOCK_DIM_X = 32; + BLOCK_DIM_Y = 8; + PATCH_PER_BLOCK = 4; + } + else + { + BLOCK_DIM_X = 32; + BLOCK_DIM_Y = 4; + PATCH_PER_BLOCK = 4; + } + + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y)); + + B<T> brd(src.cols); + + linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +namespace filter +{ + template <typename T, typename D> + void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) + { + typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream); + + static const caller_t callers[5][33] = + { + { + 0, + ::caller< 1, T, D, BrdRowReflect101>, + ::caller< 2, T, D, BrdRowReflect101>, + ::caller< 3, T, D, BrdRowReflect101>, + ::caller< 4, T, D, BrdRowReflect101>, + ::caller< 5, T, D, BrdRowReflect101>, + ::caller< 6, T, D, BrdRowReflect101>, + ::caller< 7, T, D, BrdRowReflect101>, + ::caller< 8, T, D, BrdRowReflect101>, + ::caller< 9, T, D, BrdRowReflect101>, + ::caller<10, T, D, BrdRowReflect101>, + ::caller<11, T, D, BrdRowReflect101>, + ::caller<12, T, D, BrdRowReflect101>, + ::caller<13, T, D, BrdRowReflect101>, + ::caller<14, T, D, BrdRowReflect101>, + ::caller<15, T, D, BrdRowReflect101>, + ::caller<16, T, D, BrdRowReflect101>, + ::caller<17, T, D, BrdRowReflect101>, + ::caller<18, T, D, BrdRowReflect101>, + ::caller<19, T, D, BrdRowReflect101>, + ::caller<20, T, D, BrdRowReflect101>, + ::caller<21, T, D, BrdRowReflect101>, + ::caller<22, T, D, BrdRowReflect101>, + ::caller<23, T, D, BrdRowReflect101>, + ::caller<24, T, D, BrdRowReflect101>, + ::caller<25, T, D, BrdRowReflect101>, + ::caller<26, T, D, BrdRowReflect101>, + ::caller<27, T, D, BrdRowReflect101>, + ::caller<28, T, D, BrdRowReflect101>, + ::caller<29, T, D, BrdRowReflect101>, + ::caller<30, T, D, BrdRowReflect101>, + ::caller<31, T, D, BrdRowReflect101>, + ::caller<32, T, D, BrdRowReflect101> + }, + { + 0, + ::caller< 1, T, D, BrdRowReplicate>, + ::caller< 2, T, D, BrdRowReplicate>, + ::caller< 3, T, D, BrdRowReplicate>, + ::caller< 4, T, D, BrdRowReplicate>, + ::caller< 5, T, D, BrdRowReplicate>, + ::caller< 6, T, D, BrdRowReplicate>, + ::caller< 7, T, D, BrdRowReplicate>, + ::caller< 8, T, D, BrdRowReplicate>, + ::caller< 9, T, D, BrdRowReplicate>, + ::caller<10, T, D, BrdRowReplicate>, + ::caller<11, T, D, BrdRowReplicate>, + ::caller<12, T, D, BrdRowReplicate>, + ::caller<13, T, D, BrdRowReplicate>, + ::caller<14, T, D, BrdRowReplicate>, + ::caller<15, T, D, BrdRowReplicate>, + ::caller<16, T, D, BrdRowReplicate>, + ::caller<17, T, D, BrdRowReplicate>, + ::caller<18, T, D, BrdRowReplicate>, + ::caller<19, T, D, BrdRowReplicate>, + ::caller<20, T, D, BrdRowReplicate>, + ::caller<21, T, D, BrdRowReplicate>, + ::caller<22, T, D, BrdRowReplicate>, + ::caller<23, T, D, BrdRowReplicate>, + ::caller<24, T, D, BrdRowReplicate>, + ::caller<25, T, D, BrdRowReplicate>, + ::caller<26, T, D, BrdRowReplicate>, + ::caller<27, T, D, BrdRowReplicate>, + ::caller<28, T, D, BrdRowReplicate>, + ::caller<29, T, D, BrdRowReplicate>, + ::caller<30, T, D, BrdRowReplicate>, + ::caller<31, T, D, BrdRowReplicate>, + ::caller<32, T, D, BrdRowReplicate> + }, + { + 0, + ::caller< 1, T, D, BrdRowConstant>, + ::caller< 2, T, D, BrdRowConstant>, + ::caller< 3, T, D, BrdRowConstant>, + ::caller< 4, T, D, BrdRowConstant>, + ::caller< 5, T, D, BrdRowConstant>, + ::caller< 6, T, D, BrdRowConstant>, + ::caller< 7, T, D, BrdRowConstant>, + ::caller< 8, T, D, BrdRowConstant>, + ::caller< 9, T, D, BrdRowConstant>, + ::caller<10, T, D, BrdRowConstant>, + ::caller<11, T, D, BrdRowConstant>, + ::caller<12, T, D, BrdRowConstant>, + ::caller<13, T, D, BrdRowConstant>, + ::caller<14, T, D, BrdRowConstant>, + ::caller<15, T, D, BrdRowConstant>, + ::caller<16, T, D, BrdRowConstant>, + ::caller<17, T, D, BrdRowConstant>, + ::caller<18, T, D, BrdRowConstant>, + ::caller<19, T, D, BrdRowConstant>, + ::caller<20, T, D, BrdRowConstant>, + ::caller<21, T, D, BrdRowConstant>, + ::caller<22, T, D, BrdRowConstant>, + ::caller<23, T, D, BrdRowConstant>, + ::caller<24, T, D, BrdRowConstant>, + ::caller<25, T, D, BrdRowConstant>, + ::caller<26, T, D, BrdRowConstant>, + ::caller<27, T, D, BrdRowConstant>, + ::caller<28, T, D, BrdRowConstant>, + ::caller<29, T, D, BrdRowConstant>, + ::caller<30, T, D, BrdRowConstant>, + ::caller<31, T, D, BrdRowConstant>, + ::caller<32, T, D, BrdRowConstant> + }, + { + 0, + ::caller< 1, T, D, BrdRowReflect>, + ::caller< 2, T, D, BrdRowReflect>, + ::caller< 3, T, D, BrdRowReflect>, + ::caller< 4, T, D, BrdRowReflect>, + ::caller< 5, T, D, BrdRowReflect>, + ::caller< 6, T, D, BrdRowReflect>, + ::caller< 7, T, D, BrdRowReflect>, + ::caller< 8, T, D, BrdRowReflect>, + ::caller< 9, T, D, BrdRowReflect>, + ::caller<10, T, D, BrdRowReflect>, + ::caller<11, T, D, BrdRowReflect>, + ::caller<12, T, D, BrdRowReflect>, + ::caller<13, T, D, BrdRowReflect>, + ::caller<14, T, D, BrdRowReflect>, + ::caller<15, T, D, BrdRowReflect>, + ::caller<16, T, D, BrdRowReflect>, + ::caller<17, T, D, BrdRowReflect>, + ::caller<18, T, D, BrdRowReflect>, + ::caller<19, T, D, BrdRowReflect>, + ::caller<20, T, D, BrdRowReflect>, + ::caller<21, T, D, BrdRowReflect>, + ::caller<22, T, D, BrdRowReflect>, + ::caller<23, T, D, BrdRowReflect>, + ::caller<24, T, D, BrdRowReflect>, + ::caller<25, T, D, BrdRowReflect>, + ::caller<26, T, D, BrdRowReflect>, + ::caller<27, T, D, BrdRowReflect>, + ::caller<28, T, D, BrdRowReflect>, + ::caller<29, T, D, BrdRowReflect>, + ::caller<30, T, D, BrdRowReflect>, + ::caller<31, T, D, BrdRowReflect>, + ::caller<32, T, D, BrdRowReflect> + }, + { + 0, + ::caller< 1, T, D, BrdRowWrap>, + ::caller< 2, T, D, BrdRowWrap>, + ::caller< 3, T, D, BrdRowWrap>, + ::caller< 4, T, D, BrdRowWrap>, + ::caller< 5, T, D, BrdRowWrap>, + ::caller< 6, T, D, BrdRowWrap>, + ::caller< 7, T, D, BrdRowWrap>, + ::caller< 8, T, D, BrdRowWrap>, + ::caller< 9, T, D, BrdRowWrap>, + ::caller<10, T, D, BrdRowWrap>, + ::caller<11, T, D, BrdRowWrap>, + ::caller<12, T, D, BrdRowWrap>, + ::caller<13, T, D, BrdRowWrap>, + ::caller<14, T, D, BrdRowWrap>, + ::caller<15, T, D, BrdRowWrap>, + ::caller<16, T, D, BrdRowWrap>, + ::caller<17, T, D, BrdRowWrap>, + ::caller<18, T, D, BrdRowWrap>, + ::caller<19, T, D, BrdRowWrap>, + ::caller<20, T, D, BrdRowWrap>, + ::caller<21, T, D, BrdRowWrap>, + ::caller<22, T, D, BrdRowWrap>, + ::caller<23, T, D, BrdRowWrap>, + ::caller<24, T, D, BrdRowWrap>, + ::caller<25, T, D, BrdRowWrap>, + ::caller<26, T, D, BrdRowWrap>, + ::caller<27, T, D, BrdRowWrap>, + ::caller<28, T, D, BrdRowWrap>, + ::caller<29, T, D, BrdRowWrap>, + ::caller<30, T, D, BrdRowWrap>, + ::caller<31, T, D, BrdRowWrap>, + ::caller<32, T, D, BrdRowWrap> + } + }; + + loadKernel(kernel, ksize, stream); + + callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream); + } +} diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp index 77ed46e159..6b7135ab69 100644 --- a/modules/gpu/src/filtering.cpp +++ b/modules/gpu/src/filtering.cpp @@ -830,20 +830,14 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke //////////////////////////////////////////////////////////////////////////////////////////////////// // Separable Linear Filter -namespace cv { namespace gpu { namespace device +namespace filter { - namespace row_filter - { - template <typename T, typename D> - void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } + template <typename T, typename D> + void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - namespace column_filter - { - template <typename T, typename D> - void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } -}}} + template <typename T, typename D> + void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} namespace { @@ -899,8 +893,6 @@ namespace Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType) { - using namespace ::cv::gpu::device::row_filter; - static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R}; if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4)) @@ -940,28 +932,28 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, switch (srcType) { case CV_8UC1: - func = linearRowFilter_gpu<uchar, float>; + func = filter::linearRow<uchar, float>; break; case CV_8UC3: - func = linearRowFilter_gpu<uchar3, float3>; + func = filter::linearRow<uchar3, float3>; break; case CV_8UC4: - func = linearRowFilter_gpu<uchar4, float4>; + func = filter::linearRow<uchar4, float4>; break; case CV_16SC3: - func = linearRowFilter_gpu<short3, float3>; + func = filter::linearRow<short3, float3>; break; case CV_32SC1: - func = linearRowFilter_gpu<int, float>; + func = filter::linearRow<int, float>; break; case CV_32FC1: - func = linearRowFilter_gpu<float, float>; + func = filter::linearRow<float, float>; break; case CV_32FC3: - func = linearRowFilter_gpu<float3, float3>; + func = filter::linearRow<float3, float3>; break; case CV_32FC4: - func = linearRowFilter_gpu<float4, float4>; + func = filter::linearRow<float4, float4>; break; } @@ -1020,8 +1012,6 @@ namespace Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType) { - using namespace ::cv::gpu::device::column_filter; - static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R}; if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4)) @@ -1061,28 +1051,28 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds switch (dstType) { case CV_8UC1: - func = linearColumnFilter_gpu<float, uchar>; + func = filter::linearColumn<float, uchar>; break; case CV_8UC3: - func = linearColumnFilter_gpu<float3, uchar3>; + func = filter::linearColumn<float3, uchar3>; break; case CV_8UC4: - func = linearColumnFilter_gpu<float4, uchar4>; + func = filter::linearColumn<float4, uchar4>; break; case CV_16SC3: - func = linearColumnFilter_gpu<float3, short3>; + func = filter::linearColumn<float3, short3>; break; case CV_32SC1: - func = linearColumnFilter_gpu<float, int>; + func = filter::linearColumn<float, int>; break; case CV_32FC1: - func = linearColumnFilter_gpu<float, float>; + func = filter::linearColumn<float, float>; break; case CV_32FC3: - func = linearColumnFilter_gpu<float3, float3>; + func = filter::linearColumn<float3, float3>; break; case CV_32FC4: - func = linearColumnFilter_gpu<float4, float4>; + func = filter::linearColumn<float4, float4>; break; }