Merge pull request #1540 from jet47:gpuarithm-cudev

pull/1575/merge
Roman Donchenko 12 years ago committed by OpenCV Buildbot
commit 21233656bd
  1. 10
      modules/core/src/cuda/gpu_mat.cu
  2. 2
      modules/cudaarithm/CMakeLists.txt
  3. 57
      modules/cudaarithm/perf/perf_arithm.cpp
  4. 61
      modules/cudaarithm/perf/perf_reductions.cpp
  5. 89
      modules/cudaarithm/src/arithm.cpp
  6. 479
      modules/cudaarithm/src/core.cpp
  7. 159
      modules/cudaarithm/src/cuda/absdiff_mat.cu
  8. 76
      modules/cudaarithm/src/cuda/absdiff_scalar.cu
  9. 254
      modules/cudaarithm/src/cuda/add_mat.cu
  10. 200
      modules/cudaarithm/src/cuda/add_scalar.cu
  11. 800
      modules/cudaarithm/src/cuda/add_weighted.cu
  12. 145
      modules/cudaarithm/src/cuda/arithm_func_traits.hpp
  13. 212
      modules/cudaarithm/src/cuda/bitwise_mat.cu
  14. 145
      modules/cudaarithm/src/cuda/bitwise_scalar.cu
  15. 231
      modules/cudaarithm/src/cuda/cmp_mat.cu
  16. 291
      modules/cudaarithm/src/cuda/cmp_scalar.cu
  17. 170
      modules/cudaarithm/src/cuda/copy_make_border.cu
  18. 150
      modules/cudaarithm/src/cuda/countnonzero.cu
  19. 288
      modules/cudaarithm/src/cuda/div_mat.cu
  20. 274
      modules/cudaarithm/src/cuda/div_scalar.cu
  21. 444
      modules/cudaarithm/src/cuda/integral.cu
  22. 207
      modules/cudaarithm/src/cuda/lut.cu
  23. 333
      modules/cudaarithm/src/cuda/math.cu
  24. 232
      modules/cudaarithm/src/cuda/minmax.cu
  25. 267
      modules/cudaarithm/src/cuda/minmax_mat.cu
  26. 231
      modules/cudaarithm/src/cuda/minmaxloc.cu
  27. 271
      modules/cudaarithm/src/cuda/mul_mat.cu
  28. 204
      modules/cudaarithm/src/cuda/mul_scalar.cu
  29. 174
      modules/cudaarithm/src/cuda/mul_spectrums.cu
  30. 119
      modules/cudaarithm/src/cuda/norm.cu
  31. 336
      modules/cudaarithm/src/cuda/polar_cart.cu
  32. 463
      modules/cudaarithm/src/cuda/reduce.cu
  33. 545
      modules/cudaarithm/src/cuda/split_merge.cu
  34. 254
      modules/cudaarithm/src/cuda/sub_mat.cu
  35. 224
      modules/cudaarithm/src/cuda/sub_scalar.cu
  36. 407
      modules/cudaarithm/src/cuda/sum.cu
  37. 130
      modules/cudaarithm/src/cuda/threshold.cu
  38. 98
      modules/cudaarithm/src/cuda/transpose.cu
  39. 135
      modules/cudaarithm/src/cuda/unroll_detail.hpp
  40. 2801
      modules/cudaarithm/src/element_operations.cpp
  41. 7
      modules/cudaarithm/src/precomp.hpp
  42. 619
      modules/cudaarithm/src/reductions.cpp
  43. 37
      modules/cudaarithm/test/test_arithm.cpp
  44. 74
      modules/cudaarithm/test/test_reductions.cpp
  45. 2
      modules/cudev/CMakeLists.txt
  46. 2
      modules/cudev/include/opencv2/cudev.hpp
  47. 2
      modules/cudev/include/opencv2/cudev/expr/reduction.hpp
  48. 24
      modules/cudev/include/opencv2/cudev/functional/functional.hpp
  49. 4
      modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
  50. 177
      modules/cudev/include/opencv2/cudev/grid/detail/minmaxloc.hpp
  51. 31
      modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp
  52. 44
      modules/cudev/include/opencv2/cudev/grid/detail/reduce_to_column.hpp
  53. 10
      modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp
  54. 25
      modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp
  55. 167
      modules/cudev/include/opencv2/cudev/grid/reduce.hpp
  56. 29
      modules/cudev/include/opencv2/cudev/grid/reduce_to_vec.hpp
  57. 185
      modules/cudev/include/opencv2/cudev/grid/split_merge.hpp
  58. 136
      modules/cudev/include/opencv2/cudev/grid/transform.hpp
  59. 40
      modules/cudev/include/opencv2/cudev/grid/transpose.hpp
  60. 6
      modules/cudev/include/opencv2/cudev/ptr2d/lut.hpp
  61. 36
      modules/cudev/include/opencv2/cudev/ptr2d/mask.hpp
  62. 17
      modules/cudev/include/opencv2/cudev/util/vec_math.hpp
  63. 2
      modules/cudev/include/opencv2/cudev/util/vec_traits.hpp
  64. 12
      modules/cudev/test/test_reduction.cu

@ -216,7 +216,7 @@ namespace
template <typename T>
void copyWithMask(const GpuMat& src, const GpuMat& dst, const GpuMat& mask, Stream& stream)
{
gridTransform_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream);
gridTransformUnary_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream);
}
}
@ -268,14 +268,14 @@ namespace
void setToWithOutMask(const GpuMat& mat, Scalar _scalar, Stream& stream)
{
Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream);
gridTransformUnary(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream);
}
template <typename T>
void setToWithMask(const GpuMat& mat, const GpuMat& mask, Scalar _scalar, Stream& stream)
{
Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
gridTransformUnary(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
}
}
@ -382,7 +382,7 @@ namespace
typedef typename LargerType<src_elem_type, float>::type larger_elem_type;
typedef typename LargerType<float, dst_elem_type>::type scalar_type;
gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
}
template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
@ -408,7 +408,7 @@ namespace
op.alpha = cv::saturate_cast<scalar_type>(alpha);
op.beta = cv::saturate_cast<scalar_type>(beta);
gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
}
}

@ -6,7 +6,7 @@ set(the_description "CUDA-accelerated Operations on Matrices")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudalegacy)
ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudev)
ocv_module_include_directories()
ocv_glob_module_sources()

@ -248,60 +248,3 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Integral
PERF_TEST_P(Sz, Integral,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_buf;
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
CUDA_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::integral(src, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// IntegralSqr
PERF_TEST_P(Sz, IntegralSqr,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst, buf;
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
CUDA_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}

@ -373,7 +373,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp);
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp, CV_32F);
CUDA_SANITY_CHECK(dst);
}
@ -381,7 +381,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
{
cv::Mat dst;
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp);
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp, CV_32F);
CPU_SANITY_CHECK(dst);
}
@ -465,3 +465,60 @@ PERF_TEST_P(Sz, MeanStdDev,
SANITY_CHECK(cpu_stddev);
}
}
//////////////////////////////////////////////////////////////////////
// Integral
PERF_TEST_P(Sz, Integral,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_buf;
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
CUDA_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::integral(src, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// IntegralSqr
PERF_TEST_P(Sz, IntegralSqr,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst, buf;
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
CUDA_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}

@ -292,95 +292,6 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
#endif
}
//////////////////////////////////////////////////////////////////////////////
// mulSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace cuda { namespace device
{
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) _src1;
(void) _src2;
(void) _dst;
(void) flags;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void) flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { device::mulSpectrums, device::mulSpectrums_CONJ };
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
Caller caller = callers[(int)conjB];
caller(src1, src2, dst, StreamAccessor::getStream(stream));
#endif
}
//////////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace cuda { namespace device
{
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) _src1;
(void) _src2;
(void) _dst;
(void) flags;
(void) scale;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void)flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { device::mulAndScaleSpectrums, device::mulAndScaleSpectrums_CONJ };
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
Caller caller = callers[(int)conjB];
caller(src1, src2, scale, dst, StreamAccessor::getStream(stream));
#endif
}
//////////////////////////////////////////////////////////////////////////////
// dft

@ -63,163 +63,6 @@ void cv::cuda::copyMakeBorder(InputArray, OutputArray, int, int, int, int, int,
#else /* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
// merge/split
namespace cv { namespace cuda { namespace device
{
namespace split_merge
{
void merge(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
}
}}}
namespace
{
void merge_caller(const GpuMat* src, size_t n, OutputArray _dst, Stream& stream)
{
CV_Assert( src != 0 );
CV_Assert( n > 0 && n <= 4 );
const int depth = src[0].depth();
const Size size = src[0].size();
for (size_t i = 0; i < n; ++i)
{
CV_Assert( src[i].size() == size );
CV_Assert( src[i].depth() == depth );
CV_Assert( src[i].channels() == 1 );
}
if (depth == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
if (n == 1)
{
src[0].copyTo(_dst, stream);
}
else
{
_dst.create(size, CV_MAKE_TYPE(depth, (int)n));
GpuMat dst = _dst.getGpuMat();
PtrStepSzb src_as_devmem[4];
for(size_t i = 0; i < n; ++i)
src_as_devmem[i] = src[i];
PtrStepSzb dst_as_devmem(dst);
cv::cuda::device::split_merge::merge(src_as_devmem, dst_as_devmem, (int)n, CV_ELEM_SIZE(depth), StreamAccessor::getStream(stream));
}
}
void split_caller(const GpuMat& src, GpuMat* dst, Stream& stream)
{
CV_Assert( dst != 0 );
const int depth = src.depth();
const int num_channels = src.channels();
CV_Assert( num_channels <= 4 );
if (depth == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
if (num_channels == 1)
{
src.copyTo(dst[0], stream);
return;
}
for (int i = 0; i < num_channels; ++i)
dst[i].create(src.size(), depth);
PtrStepSzb dst_as_devmem[4];
for (int i = 0; i < num_channels; ++i)
dst_as_devmem[i] = dst[i];
PtrStepSzb src_as_devmem(src);
cv::cuda::device::split_merge::split(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), StreamAccessor::getStream(stream));
}
}
void cv::cuda::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream)
{
merge_caller(src, n, dst, stream);
}
void cv::cuda::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream)
{
merge_caller(&src[0], src.size(), dst, stream);
}
void cv::cuda::split(InputArray _src, GpuMat* dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
split_caller(src, dst, stream);
}
void cv::cuda::split(InputArray _src, std::vector<GpuMat>& dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
dst.resize(src.channels());
if(src.channels() > 0)
split_caller(src, &dst[0], stream);
}
////////////////////////////////////////////////////////////////////////
// transpose
namespace arithm
{
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
}
void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );
_dst.create( src.cols, src.rows, src.type() );
GpuMat dst = _dst.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
if (src.elemSize() == 1)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else if (src.elemSize() == 4)
{
arithm::transpose<int>(src, dst, stream);
}
else // if (src.elemSize() == 8)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
arithm::transpose<double>(src, dst, stream);
}
}
////////////////////////////////////////////////////////////////////////
// flip
@ -287,326 +130,4 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// LUT
#if (CUDA_VERSION >= 5000)
namespace
{
class LookUpTableImpl : public LookUpTable
{
public:
LookUpTableImpl(InputArray lut);
void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
private:
int lut_cn;
int nValues3[3];
const Npp32s* pValues3[3];
const Npp32s* pLevels3[3];
GpuMat d_pLevels;
GpuMat d_nppLut;
GpuMat d_nppLut3[3];
};
LookUpTableImpl::LookUpTableImpl(InputArray _lut)
{
nValues3[0] = nValues3[1] = nValues3[2] = 256;
Npp32s pLevels[256];
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
GpuMat lut;
if (_lut.kind() == _InputArray::GPU_MAT)
{
lut = _lut.getGpuMat();
}
else
{
Mat hLut = _lut.getMat();
CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
lut.upload(Mat(1, 256, hLut.type(), hLut.data));
}
lut_cn = lut.channels();
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.rows == 1 && lut.cols == 256 );
lut.convertTo(d_nppLut, CV_32S);
if (lut_cn == 1)
{
pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
}
else
{
cuda::split(d_nppLut, d_nppLut3);
pValues3[0] = d_nppLut3[0].ptr<Npp32s>();
pValues3[1] = d_nppLut3[1].ptr<Npp32s>();
pValues3[2] = d_nppLut3[2].ptr<Npp32s>();
}
}
void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
const int cn = src.channels();
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut_cn == 1 || lut_cn == cn );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);
NppiSize sz;
sz.height = src.rows;
sz.width = src.cols;
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
}
else
{
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#else // (CUDA_VERSION >= 5000)
namespace
{
class LookUpTableImpl : public LookUpTable
{
public:
LookUpTableImpl(InputArray lut);
void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
private:
int lut_cn;
Npp32s pLevels[256];
int nValues3[3];
const Npp32s* pValues3[3];
const Npp32s* pLevels3[3];
Mat nppLut;
Mat nppLut3[3];
};
LookUpTableImpl::LookUpTableImpl(InputArray _lut)
{
nValues3[0] = nValues3[1] = nValues3[2] = 256;
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
Mat lut;
if (_lut.kind() == _InputArray::GPU_MAT)
{
lut = Mat(_lut.getGpuMat());
}
else
{
Mat hLut = _lut.getMat();
CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
lut = hLut;
}
lut_cn = lut.channels();
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.rows == 1 && lut.cols == 256 );
lut.convertTo(nppLut, CV_32S);
if (lut_cn == 1)
{
pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
}
else
{
cv::split(nppLut, nppLut3);
pValues3[0] = nppLut3[0].ptr<Npp32s>();
pValues3[1] = nppLut3[1].ptr<Npp32s>();
pValues3[2] = nppLut3[2].ptr<Npp32s>();
}
}
void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
const int cn = src.channels();
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut_cn == 1 || lut_cn == cn );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);
NppiSize sz;
sz.height = src.rows;
sz.width = src.cols;
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
}
else
{
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif // (CUDA_VERSION >= 5000)
Ptr<LookUpTable> cv::cuda::createLookUpTable(InputArray lut)
{
return makePtr<LookUpTableImpl>(lut);
}
////////////////////////////////////////////////////////////////////////
// copyMakeBorder
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
}
}}}
namespace
{
template <typename T, int cn> void copyMakeBorder_caller(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
{
using namespace ::cv::cuda::device::imgproc;
Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));
copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);
}
}
#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__ > 4
typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
#else
typedef Npp32s Npp32s_a;
#endif
void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
_dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
GpuMat dst = _dst.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
{
NppiSize srcsz;
srcsz.width = src.cols;
srcsz.height = src.rows;
NppiSize dstsz;
dstsz.width = dst.cols;
dstsz.height = dst.rows;
NppStreamHandler h(stream);
switch (src.type())
{
case CV_8UC1:
{
Npp8u nVal = saturate_cast<Npp8u>(value[0]);
nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_8UC4:
{
Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])};
nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32SC1:
{
Npp32s nVal = saturate_cast<Npp32s>(value[0]);
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32FC1:
{
Npp32f val = saturate_cast<Npp32f>(value[0]);
Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{ copyMakeBorder_caller<uchar, 1> , copyMakeBorder_caller<uchar, 2> , copyMakeBorder_caller<uchar, 3> , copyMakeBorder_caller<uchar, 4>},
{0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/},
{ copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/, copyMakeBorder_caller<ushort, 3> , copyMakeBorder_caller<ushort, 4>},
{ copyMakeBorder_caller<short, 1> , 0/*copyMakeBorder_caller<short, 2>*/ , copyMakeBorder_caller<short, 3> , copyMakeBorder_caller<short, 4>},
{0/*copyMakeBorder_caller<int, 1>*/, 0/*copyMakeBorder_caller<int, 2>*/ , 0/*copyMakeBorder_caller<int, 3>*/, 0/*copyMakeBorder_caller<int , 4>*/},
{ copyMakeBorder_caller<float, 1> , 0/*copyMakeBorder_caller<float, 2>*/ , copyMakeBorder_caller<float, 3> , copyMakeBorder_caller<float ,4>}
};
caller_t func = callers[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, dst, top, left, borderType, value, stream);
}
}
#endif /* !defined (HAVE_CUDA) */

@ -40,43 +40,22 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
{
struct VAbsDiff4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff4(a, b);
}
__host__ __device__ __forceinline__ VAbsDiff4() {}
__host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
};
#include "opencv2/cudev.hpp"
struct VAbsDiff2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff2(a, b);
}
using namespace cv::cudev;
__host__ __device__ __forceinline__ VAbsDiff2() {}
__host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
};
void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
namespace
{
__device__ __forceinline__ int _abs(int a)
{
return ::abs(a);
@ -90,58 +69,120 @@ namespace arithm
return ::fabs(a);
}
template <typename T> struct AbsDiffMat : binary_function<T, T, T>
template <typename T> struct AbsDiffOp1 : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(T a, T b) const
{
return saturate_cast<T>(_abs(a - b));
}
__host__ __device__ __forceinline__ AbsDiffMat() {}
__host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
template <typename T> struct TransformFunctorTraits< arithm::AbsDiffMat<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename T>
void absDiffMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
gridTransformBinary_< TransformPolicy<T> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<T>(dst), AbsDiffOp1<T>(), stream);
}
struct AbsDiffOp2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff2(a, b);
}
};
}}}
namespace arithm
{
void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
void absDiffMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
device::transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
const int vcols = src1.cols >> 1;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, AbsDiffOp2(), stream);
}
void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
struct AbsDiffOp4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff4(a, b);
}
};
void absDiffMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
device::transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
const int vcols = src1.cols >> 2;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, AbsDiffOp4(), stream);
}
}
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
absDiffMat_v1<uchar>,
absDiffMat_v1<schar>,
absDiffMat_v1<ushort>,
absDiffMat_v1<short>,
absDiffMat_v1<int>,
absDiffMat_v1<float>,
absDiffMat_v1<double>
};
const int depth = src1.depth();
CV_DbgAssert( depth <= CV_64F );
GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);
if (depth == CV_8U || depth == CV_16U)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (isAllAligned)
{
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
absDiffMat_v4(src1_, src2_, dst_, stream);
return;
}
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
absDiffMat_v2(src1_, src2_, dst_, stream);
return;
}
}
}
template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
const func_t func = funcs[depth];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
func(src1_, src2_, dst_, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,59 +40,71 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
namespace
{
template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T>
template <typename T, typename S> struct AbsDiffScalarOp : unary_function<T, T>
{
S val;
__host__ explicit AbsDiffScalar(S val_) : val(val_) {}
__device__ __forceinline__ T operator ()(T a) const
{
abs_func<S> f;
return saturate_cast<T>(f(a - val));
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S> struct TransformFunctorTraits< arithm::AbsDiffScalar<T, S> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
}}}
namespace arithm
template <typename SrcType, typename ScalarDepth>
void absDiffScalarImpl(const GpuMat& src, double value, GpuMat& dst, Stream& stream)
{
AbsDiffScalarOp<SrcType, ScalarDepth> op;
op.val = static_cast<ScalarDepth>(value);
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<SrcType>(dst), op, stream);
}
}
void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
{
template <typename T, typename S>
void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
typedef void (*func_t)(const GpuMat& src, double val, GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
AbsDiffScalar<T, S> op(static_cast<S>(val));
absDiffScalarImpl<uchar, float>,
absDiffScalarImpl<schar, float>,
absDiffScalarImpl<ushort, float>,
absDiffScalarImpl<short, float>,
absDiffScalarImpl<int, float>,
absDiffScalarImpl<float, float>,
absDiffScalarImpl<double, double>
};
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
}
const int depth = src.depth();
CV_DbgAssert( depth <= CV_64F );
template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
funcs[depth](src, val[0], dst, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,146 +40,186 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int);
namespace
{
struct VAdd4 : binary_function<uint, uint, uint>
template <typename T, typename D> struct AddOp1 : binary_function<T, T, D>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
__device__ __forceinline__ D operator ()(T a, T b) const
{
return vadd4(a, b);
return saturate_cast<D>(a + b);
}
__host__ __device__ __forceinline__ VAdd4() {}
__host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
};
struct VAdd2 : binary_function<uint, uint, uint>
template <typename T, typename D>
void addMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
if (mask.data)
gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), AddOp1<T, D>(), globPtr<uchar>(mask), stream);
else
gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), AddOp1<T, D>(), stream);
}
struct AddOp2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vadd2(a, b);
}
__host__ __device__ __forceinline__ VAdd2() {}
__host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
};
template <typename T, typename D> struct AddMat : binary_function<T, T, D>
void addMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
__device__ __forceinline__ D operator ()(T a, T b) const
const int vcols = src1.cols >> 1;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, AddOp2(), stream);
}
struct AddOp4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return saturate_cast<D>(a + b);
return vadd4(a, b);
}
__host__ __device__ __forceinline__ AddMat() {}
__host__ __device__ __forceinline__ AddMat(const AddMat&) {}
};
void addMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
const int vcols = src1.cols >> 2;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, AddOp4(), stream);
}
}
namespace cv { namespace cuda { namespace device
void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int)
{
template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream);
static const func_t funcs[7][7] =
{
{
addMat_v1<uchar, uchar>,
addMat_v1<uchar, schar>,
addMat_v1<uchar, ushort>,
addMat_v1<uchar, short>,
addMat_v1<uchar, int>,
addMat_v1<uchar, float>,
addMat_v1<uchar, double>
},
{
addMat_v1<schar, uchar>,
addMat_v1<schar, schar>,
addMat_v1<schar, ushort>,
addMat_v1<schar, short>,
addMat_v1<schar, int>,
addMat_v1<schar, float>,
addMat_v1<schar, double>
},
{
0 /*addMat_v1<ushort, uchar>*/,
0 /*addMat_v1<ushort, schar>*/,
addMat_v1<ushort, ushort>,
addMat_v1<ushort, short>,
addMat_v1<ushort, int>,
addMat_v1<ushort, float>,
addMat_v1<ushort, double>
},
{
0 /*addMat_v1<short, uchar>*/,
0 /*addMat_v1<short, schar>*/,
addMat_v1<short, ushort>,
addMat_v1<short, short>,
addMat_v1<short, int>,
addMat_v1<short, float>,
addMat_v1<short, double>
},
{
0 /*addMat_v1<int, uchar>*/,
0 /*addMat_v1<int, schar>*/,
0 /*addMat_v1<int, ushort>*/,
0 /*addMat_v1<int, short>*/,
addMat_v1<int, int>,
addMat_v1<int, float>,
addMat_v1<int, double>
},
{
0 /*addMat_v1<float, uchar>*/,
0 /*addMat_v1<float, schar>*/,
0 /*addMat_v1<float, ushort>*/,
0 /*addMat_v1<float, short>*/,
0 /*addMat_v1<float, int>*/,
addMat_v1<float, float>,
addMat_v1<float, double>
},
{
0 /*addMat_v1<double, uchar>*/,
0 /*addMat_v1<double, schar>*/,
0 /*addMat_v1<double, ushort>*/,
0 /*addMat_v1<double, short>*/,
0 /*addMat_v1<double, int>*/,
0 /*addMat_v1<double, float>*/,
addMat_v1<double, double>
}
};
template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
const int sdepth = src1.depth();
const int ddepth = dst.depth();
template <typename T, typename D> struct TransformFunctorTraits< arithm::AddMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F );
namespace arithm
{
void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
}
GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);
void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
{
device::transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
}
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream);
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (isAllAligned)
{
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
{
addMat_v4(src1_, src2_, dst_, stream);
return;
}
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
{
addMat_v2(src1_, src2_, dst_, stream);
return;
}
}
}
template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
const func_t func = funcs[sdepth][ddepth];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
func(src1_, src2_, dst_, mask, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,109 +40,141 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void addScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int);
namespace
{
template <typename T, typename S, typename D> struct AddScalar : unary_function<T, D>
template <typename SrcType, typename ScalarType, typename DstType> struct AddScalarOp : unary_function<SrcType, DstType>
{
S val;
__host__ explicit AddScalar(S val_) : val(val_) {}
ScalarType val;
__device__ __forceinline__ D operator ()(T a) const
__device__ __forceinline__ DstType operator ()(SrcType a) const
{
return saturate_cast<D>(a + val);
return saturate_cast<DstType>(saturate_cast<ScalarType>(a) + val);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::AddScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
template <typename SrcType, typename ScalarDepth, typename DstType>
void addScalarImpl(const GpuMat& src, cv::Scalar value, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
AddScalar<T, S, D> op(static_cast<S>(val));
typedef typename MakeVec<ScalarDepth, VecTraits<SrcType>::cn>::type ScalarType;
cv::Scalar_<ScalarDepth> value_ = value;
AddScalarOp<SrcType, ScalarType, DstType> op;
op.val = VecTraits<ScalarType>::make(value_.val);
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, globPtr<uchar>(mask), stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
}
}
void addScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src, cv::Scalar val, GpuMat& dst, const GpuMat& mask, Stream& stream);
static const func_t funcs[7][7][4] =
{
{
{addScalarImpl<uchar, float, uchar>, addScalarImpl<uchar2, float, uchar2>, addScalarImpl<uchar3, float, uchar3>, addScalarImpl<uchar4, float, uchar4>},
{addScalarImpl<uchar, float, schar>, addScalarImpl<uchar2, float, char2>, addScalarImpl<uchar3, float, char3>, addScalarImpl<uchar4, float, char4>},
{addScalarImpl<uchar, float, ushort>, addScalarImpl<uchar2, float, ushort2>, addScalarImpl<uchar3, float, ushort3>, addScalarImpl<uchar4, float, ushort4>},
{addScalarImpl<uchar, float, short>, addScalarImpl<uchar2, float, short2>, addScalarImpl<uchar3, float, short3>, addScalarImpl<uchar4, float, short4>},
{addScalarImpl<uchar, float, int>, addScalarImpl<uchar2, float, int2>, addScalarImpl<uchar3, float, int3>, addScalarImpl<uchar4, float, int4>},
{addScalarImpl<uchar, float, float>, addScalarImpl<uchar2, float, float2>, addScalarImpl<uchar3, float, float3>, addScalarImpl<uchar4, float, float4>},
{addScalarImpl<uchar, double, double>, addScalarImpl<uchar2, double, double2>, addScalarImpl<uchar3, double, double3>, addScalarImpl<uchar4, double, double4>}
},
{
{addScalarImpl<schar, float, uchar>, addScalarImpl<char2, float, uchar2>, addScalarImpl<char3, float, uchar3>, addScalarImpl<char4, float, uchar4>},
{addScalarImpl<schar, float, schar>, addScalarImpl<char2, float, char2>, addScalarImpl<char3, float, char3>, addScalarImpl<char4, float, char4>},
{addScalarImpl<schar, float, ushort>, addScalarImpl<char2, float, ushort2>, addScalarImpl<char3, float, ushort3>, addScalarImpl<char4, float, ushort4>},
{addScalarImpl<schar, float, short>, addScalarImpl<char2, float, short2>, addScalarImpl<char3, float, short3>, addScalarImpl<char4, float, short4>},
{addScalarImpl<schar, float, int>, addScalarImpl<char2, float, int2>, addScalarImpl<char3, float, int3>, addScalarImpl<char4, float, int4>},
{addScalarImpl<schar, float, float>, addScalarImpl<char2, float, float2>, addScalarImpl<char3, float, float3>, addScalarImpl<char4, float, float4>},
{addScalarImpl<schar, double, double>, addScalarImpl<char2, double, double2>, addScalarImpl<char3, double, double3>, addScalarImpl<char4, double, double4>}
},
{
{0 /*addScalarImpl<ushort, float, uchar>*/, 0 /*addScalarImpl<ushort2, float, uchar2>*/, 0 /*addScalarImpl<ushort3, float, uchar3>*/, 0 /*addScalarImpl<ushort4, float, uchar4>*/},
{0 /*addScalarImpl<ushort, float, schar>*/, 0 /*addScalarImpl<ushort2, float, char2>*/, 0 /*addScalarImpl<ushort3, float, char3>*/, 0 /*addScalarImpl<ushort4, float, char4>*/},
{addScalarImpl<ushort, float, ushort>, addScalarImpl<ushort2, float, ushort2>, addScalarImpl<ushort3, float, ushort3>, addScalarImpl<ushort4, float, ushort4>},
{addScalarImpl<ushort, float, short>, addScalarImpl<ushort2, float, short2>, addScalarImpl<ushort3, float, short3>, addScalarImpl<ushort4, float, short4>},
{addScalarImpl<ushort, float, int>, addScalarImpl<ushort2, float, int2>, addScalarImpl<ushort3, float, int3>, addScalarImpl<ushort4, float, int4>},
{addScalarImpl<ushort, float, float>, addScalarImpl<ushort2, float, float2>, addScalarImpl<ushort3, float, float3>, addScalarImpl<ushort4, float, float4>},
{addScalarImpl<ushort, double, double>, addScalarImpl<ushort2, double, double2>, addScalarImpl<ushort3, double, double3>, addScalarImpl<ushort4, double, double4>}
},
{
{0 /*addScalarImpl<short, float, uchar>*/, 0 /*addScalarImpl<short2, float, uchar2>*/, 0 /*addScalarImpl<short3, float, uchar3>*/, 0 /*addScalarImpl<short4, float, uchar4>*/},
{0 /*addScalarImpl<short, float, schar>*/, 0 /*addScalarImpl<short2, float, char2>*/, 0 /*addScalarImpl<short3, float, char3>*/, 0 /*addScalarImpl<short4, float, char4>*/},
{addScalarImpl<short, float, ushort>, addScalarImpl<short2, float, ushort2>, addScalarImpl<short3, float, ushort3>, addScalarImpl<short4, float, ushort4>},
{addScalarImpl<short, float, short>, addScalarImpl<short2, float, short2>, addScalarImpl<short3, float, short3>, addScalarImpl<short4, float, short4>},
{addScalarImpl<short, float, int>, addScalarImpl<short2, float, int2>, addScalarImpl<short3, float, int3>, addScalarImpl<short4, float, int4>},
{addScalarImpl<short, float, float>, addScalarImpl<short2, float, float2>, addScalarImpl<short3, float, float3>, addScalarImpl<short4, float, float4>},
{addScalarImpl<short, double, double>, addScalarImpl<short2, double, double2>, addScalarImpl<short3, double, double3>, addScalarImpl<short4, double, double4>}
},
{
{0 /*addScalarImpl<int, float, uchar>*/, 0 /*addScalarImpl<int2, float, uchar2>*/, 0 /*addScalarImpl<int3, float, uchar3>*/, 0 /*addScalarImpl<int4, float, uchar4>*/},
{0 /*addScalarImpl<int, float, schar>*/, 0 /*addScalarImpl<int2, float, char2>*/, 0 /*addScalarImpl<int3, float, char3>*/, 0 /*addScalarImpl<int4, float, char4>*/},
{0 /*addScalarImpl<int, float, ushort>*/, 0 /*addScalarImpl<int2, float, ushort2>*/, 0 /*addScalarImpl<int3, float, ushort3>*/, 0 /*addScalarImpl<int4, float, ushort4>*/},
{0 /*addScalarImpl<int, float, short>*/, 0 /*addScalarImpl<int2, float, short2>*/, 0 /*addScalarImpl<int3, float, short3>*/, 0 /*addScalarImpl<int4, float, short4>*/},
{addScalarImpl<int, float, int>, addScalarImpl<int2, float, int2>, addScalarImpl<int3, float, int3>, addScalarImpl<int4, float, int4>},
{addScalarImpl<int, float, float>, addScalarImpl<int2, float, float2>, addScalarImpl<int3, float, float3>, addScalarImpl<int4, float, float4>},
{addScalarImpl<int, double, double>, addScalarImpl<int2, double, double2>, addScalarImpl<int3, double, double3>, addScalarImpl<int4, double, double4>}
},
{
{0 /*addScalarImpl<float, float, uchar>*/, 0 /*addScalarImpl<float2, float, uchar2>*/, 0 /*addScalarImpl<float3, float, uchar3>*/, 0 /*addScalarImpl<float4, float, uchar4>*/},
{0 /*addScalarImpl<float, float, schar>*/, 0 /*addScalarImpl<float2, float, char2>*/, 0 /*addScalarImpl<float3, float, char3>*/, 0 /*addScalarImpl<float4, float, char4>*/},
{0 /*addScalarImpl<float, float, ushort>*/, 0 /*addScalarImpl<float2, float, ushort2>*/, 0 /*addScalarImpl<float3, float, ushort3>*/, 0 /*addScalarImpl<float4, float, ushort4>*/},
{0 /*addScalarImpl<float, float, short>*/, 0 /*addScalarImpl<float2, float, short2>*/, 0 /*addScalarImpl<float3, float, short3>*/, 0 /*addScalarImpl<float4, float, short4>*/},
{0 /*addScalarImpl<float, float, int>*/, 0 /*addScalarImpl<float2, float, int2>*/, 0 /*addScalarImpl<float3, float, int3>*/, 0 /*addScalarImpl<float4, float, int4>*/},
{addScalarImpl<float, float, float>, addScalarImpl<float2, float, float2>, addScalarImpl<float3, float, float3>, addScalarImpl<float4, float, float4>},
{addScalarImpl<float, double, double>, addScalarImpl<float2, double, double2>, addScalarImpl<float3, double, double3>, addScalarImpl<float4, double, double4>}
},
{
{0 /*addScalarImpl<double, double, uchar>*/, 0 /*addScalarImpl<double2, double, uchar2>*/, 0 /*addScalarImpl<double3, double, uchar3>*/, 0 /*addScalarImpl<double4, double, uchar4>*/},
{0 /*addScalarImpl<double, double, schar>*/, 0 /*addScalarImpl<double2, double, char2>*/, 0 /*addScalarImpl<double3, double, char3>*/, 0 /*addScalarImpl<double4, double, char4>*/},
{0 /*addScalarImpl<double, double, ushort>*/, 0 /*addScalarImpl<double2, double, ushort2>*/, 0 /*addScalarImpl<double3, double, ushort3>*/, 0 /*addScalarImpl<double4, double, ushort4>*/},
{0 /*addScalarImpl<double, double, short>*/, 0 /*addScalarImpl<double2, double, short2>*/, 0 /*addScalarImpl<double3, double, short3>*/, 0 /*addScalarImpl<double4, double, short4>*/},
{0 /*addScalarImpl<double, double, int>*/, 0 /*addScalarImpl<double2, double, int2>*/, 0 /*addScalarImpl<double3, double, int3>*/, 0 /*addScalarImpl<double4, double, int4>*/},
{0 /*addScalarImpl<double, double, float>*/, 0 /*addScalarImpl<double2, double, float2>*/, 0 /*addScalarImpl<double3, double, float3>*/, 0 /*addScalarImpl<double4, double, float4>*/},
{addScalarImpl<double, double, double>, addScalarImpl<double2, double, double2>, addScalarImpl<double3, double, double3>, addScalarImpl<double4, double, double4>}
}
};
const int sdepth = src.depth();
const int ddepth = dst.depth();
const int cn = src.channels();
CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F && cn <= 4 );
const func_t func = funcs[sdepth][ddepth][cn - 1];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
func(src, val, dst, mask, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,325 +40,553 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
{
template <typename T> struct UseDouble_
{
enum {value = 0};
};
template <> struct UseDouble_<double>
{
enum {value = 1};
};
template <typename T1, typename T2, typename D> struct UseDouble
{
enum {value = (UseDouble_<T1>::value || UseDouble_<T2>::value || UseDouble_<D>::value)};
};
template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted_;
template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, false> : binary_function<T1, T2, D>
{
float alpha;
float beta;
float gamma;
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
__host__ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}
using namespace cv::cudev;
__device__ __forceinline__ D operator ()(T1 a, T2 b) const
{
return saturate_cast<D>(a * alpha + b * beta + gamma);
}
};
template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, true> : binary_function<T1, T2, D>
namespace
{
template <typename T1, typename T2, typename D, typename S> struct AddWeightedOp : binary_function<T1, T2, D>
{
double alpha;
double beta;
double gamma;
__host__ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
S alpha;
S beta;
S gamma;
__device__ __forceinline__ D operator ()(T1 a, T2 b) const
{
return saturate_cast<D>(a * alpha + b * beta + gamma);
}
};
template <typename T1, typename T2, typename D> struct AddWeighted : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>
{
AddWeighted(double alpha_, double beta_, double gamma_) : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T1, typename T2, typename D, size_t src1_size, size_t src2_size, size_t dst_size> struct AddWeightedTraits : DefaultTransformFunctorTraits< arithm::AddWeighted<T1, T2, D> >
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <typename T1, typename T2, typename D, size_t src_size, size_t dst_size> struct AddWeightedTraits<T1, T2, D, src_size, src_size, dst_size> : arithm::ArithmFuncTraits<src_size, dst_size>
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
template <typename T1, typename T2, typename D> struct TransformFunctorTraits< arithm::AddWeighted<T1, T2, D> > : AddWeightedTraits<T1, T2, D, sizeof(T1), sizeof(T2), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T1, typename T2, typename D>
void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream)
void addWeightedImpl(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, Stream& stream)
{
AddWeighted<T1, T2, D> op(alpha, beta, gamma);
device::transform((PtrStepSz<T1>) src1, (PtrStepSz<T2>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
typedef typename LargerType<T1, T2>::type larger_type1;
typedef typename LargerType<larger_type1, D>::type larger_type2;
typedef typename LargerType<larger_type2, float>::type scalar_type;
AddWeightedOp<T1, T2, D, scalar_type> op;
op.alpha = static_cast<scalar_type>(alpha);
op.beta = static_cast<scalar_type>(beta);
op.gamma = static_cast<scalar_type>(gamma);
template void addWeighted<schar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
gridTransformBinary_< TransformPolicy<scalar_type> >(globPtr<T1>(src1), globPtr<T2>(src2), globPtr<D>(dst), op, stream);
}
}
void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, double beta, double gamma, OutputArray _dst, int ddepth, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, Stream& stream);
static const func_t funcs[7][7][7] =
{
{
{
addWeightedImpl<uchar, uchar, uchar >,
addWeightedImpl<uchar, uchar, schar >,
addWeightedImpl<uchar, uchar, ushort>,
addWeightedImpl<uchar, uchar, short >,
addWeightedImpl<uchar, uchar, int >,
addWeightedImpl<uchar, uchar, float >,
addWeightedImpl<uchar, uchar, double>
},
{
addWeightedImpl<uchar, schar, uchar >,
addWeightedImpl<uchar, schar, schar >,
addWeightedImpl<uchar, schar, ushort>,
addWeightedImpl<uchar, schar, short >,
addWeightedImpl<uchar, schar, int >,
addWeightedImpl<uchar, schar, float >,
addWeightedImpl<uchar, schar, double>
},
{
addWeightedImpl<uchar, ushort, uchar >,
addWeightedImpl<uchar, ushort, schar >,
addWeightedImpl<uchar, ushort, ushort>,
addWeightedImpl<uchar, ushort, short >,
addWeightedImpl<uchar, ushort, int >,
addWeightedImpl<uchar, ushort, float >,
addWeightedImpl<uchar, ushort, double>
},
{
addWeightedImpl<uchar, short, uchar >,
addWeightedImpl<uchar, short, schar >,
addWeightedImpl<uchar, short, ushort>,
addWeightedImpl<uchar, short, short >,
addWeightedImpl<uchar, short, int >,
addWeightedImpl<uchar, short, float >,
addWeightedImpl<uchar, short, double>
},
{
addWeightedImpl<uchar, int, uchar >,
addWeightedImpl<uchar, int, schar >,
addWeightedImpl<uchar, int, ushort>,
addWeightedImpl<uchar, int, short >,
addWeightedImpl<uchar, int, int >,
addWeightedImpl<uchar, int, float >,
addWeightedImpl<uchar, int, double>
},
{
addWeightedImpl<uchar, float, uchar >,
addWeightedImpl<uchar, float, schar >,
addWeightedImpl<uchar, float, ushort>,
addWeightedImpl<uchar, float, short >,
addWeightedImpl<uchar, float, int >,
addWeightedImpl<uchar, float, float >,
addWeightedImpl<uchar, float, double>
},
{
addWeightedImpl<uchar, double, uchar >,
addWeightedImpl<uchar, double, schar >,
addWeightedImpl<uchar, double, ushort>,
addWeightedImpl<uchar, double, short >,
addWeightedImpl<uchar, double, int >,
addWeightedImpl<uchar, double, float >,
addWeightedImpl<uchar, double, double>
}
},
{
{
0/*addWeightedImpl<schar, uchar, uchar >*/,
0/*addWeightedImpl<schar, uchar, schar >*/,
0/*addWeightedImpl<schar, uchar, ushort>*/,
0/*addWeightedImpl<schar, uchar, short >*/,
0/*addWeightedImpl<schar, uchar, int >*/,
0/*addWeightedImpl<schar, uchar, float >*/,
0/*addWeightedImpl<schar, uchar, double>*/
},
{
addWeightedImpl<schar, schar, uchar >,
addWeightedImpl<schar, schar, schar >,
addWeightedImpl<schar, schar, ushort>,
addWeightedImpl<schar, schar, short >,
addWeightedImpl<schar, schar, int >,
addWeightedImpl<schar, schar, float >,
addWeightedImpl<schar, schar, double>
},
{
addWeightedImpl<schar, ushort, uchar >,
addWeightedImpl<schar, ushort, schar >,
addWeightedImpl<schar, ushort, ushort>,
addWeightedImpl<schar, ushort, short >,
addWeightedImpl<schar, ushort, int >,
addWeightedImpl<schar, ushort, float >,
addWeightedImpl<schar, ushort, double>
},
{
addWeightedImpl<schar, short, uchar >,
addWeightedImpl<schar, short, schar >,
addWeightedImpl<schar, short, ushort>,
addWeightedImpl<schar, short, short >,
addWeightedImpl<schar, short, int >,
addWeightedImpl<schar, short, float >,
addWeightedImpl<schar, short, double>
},
{
addWeightedImpl<schar, int, uchar >,
addWeightedImpl<schar, int, schar >,
addWeightedImpl<schar, int, ushort>,
addWeightedImpl<schar, int, short >,
addWeightedImpl<schar, int, int >,
addWeightedImpl<schar, int, float >,
addWeightedImpl<schar, int, double>
},
{
addWeightedImpl<schar, float, uchar >,
addWeightedImpl<schar, float, schar >,
addWeightedImpl<schar, float, ushort>,
addWeightedImpl<schar, float, short >,
addWeightedImpl<schar, float, int >,
addWeightedImpl<schar, float, float >,
addWeightedImpl<schar, float, double>
},
{
addWeightedImpl<schar, double, uchar >,
addWeightedImpl<schar, double, schar >,
addWeightedImpl<schar, double, ushort>,
addWeightedImpl<schar, double, short >,
addWeightedImpl<schar, double, int >,
addWeightedImpl<schar, double, float >,
addWeightedImpl<schar, double, double>
}
},
{
{
0/*addWeightedImpl<ushort, uchar, uchar >*/,
0/*addWeightedImpl<ushort, uchar, schar >*/,
0/*addWeightedImpl<ushort, uchar, ushort>*/,
0/*addWeightedImpl<ushort, uchar, short >*/,
0/*addWeightedImpl<ushort, uchar, int >*/,
0/*addWeightedImpl<ushort, uchar, float >*/,
0/*addWeightedImpl<ushort, uchar, double>*/
},
{
0/*addWeightedImpl<ushort, schar, uchar >*/,
0/*addWeightedImpl<ushort, schar, schar >*/,
0/*addWeightedImpl<ushort, schar, ushort>*/,
0/*addWeightedImpl<ushort, schar, short >*/,
0/*addWeightedImpl<ushort, schar, int >*/,
0/*addWeightedImpl<ushort, schar, float >*/,
0/*addWeightedImpl<ushort, schar, double>*/
},
{
addWeightedImpl<ushort, ushort, uchar >,
addWeightedImpl<ushort, ushort, schar >,
addWeightedImpl<ushort, ushort, ushort>,
addWeightedImpl<ushort, ushort, short >,
addWeightedImpl<ushort, ushort, int >,
addWeightedImpl<ushort, ushort, float >,
addWeightedImpl<ushort, ushort, double>
},
{
addWeightedImpl<ushort, short, uchar >,
addWeightedImpl<ushort, short, schar >,
addWeightedImpl<ushort, short, ushort>,
addWeightedImpl<ushort, short, short >,
addWeightedImpl<ushort, short, int >,
addWeightedImpl<ushort, short, float >,
addWeightedImpl<ushort, short, double>
},
{
addWeightedImpl<ushort, int, uchar >,
addWeightedImpl<ushort, int, schar >,
addWeightedImpl<ushort, int, ushort>,
addWeightedImpl<ushort, int, short >,
addWeightedImpl<ushort, int, int >,
addWeightedImpl<ushort, int, float >,
addWeightedImpl<ushort, int, double>
},
{
addWeightedImpl<ushort, float, uchar >,
addWeightedImpl<ushort, float, schar >,
addWeightedImpl<ushort, float, ushort>,
addWeightedImpl<ushort, float, short >,
addWeightedImpl<ushort, float, int >,
addWeightedImpl<ushort, float, float >,
addWeightedImpl<ushort, float, double>
},
{
addWeightedImpl<ushort, double, uchar >,
addWeightedImpl<ushort, double, schar >,
addWeightedImpl<ushort, double, ushort>,
addWeightedImpl<ushort, double, short >,
addWeightedImpl<ushort, double, int >,
addWeightedImpl<ushort, double, float >,
addWeightedImpl<ushort, double, double>
}
},
{
{
0/*addWeightedImpl<short, uchar, uchar >*/,
0/*addWeightedImpl<short, uchar, schar >*/,
0/*addWeightedImpl<short, uchar, ushort>*/,
0/*addWeightedImpl<short, uchar, short >*/,
0/*addWeightedImpl<short, uchar, int >*/,
0/*addWeightedImpl<short, uchar, float >*/,
0/*addWeightedImpl<short, uchar, double>*/
},
{
0/*addWeightedImpl<short, schar, uchar >*/,
0/*addWeightedImpl<short, schar, schar >*/,
0/*addWeightedImpl<short, schar, ushort>*/,
0/*addWeightedImpl<short, schar, short >*/,
0/*addWeightedImpl<short, schar, int >*/,
0/*addWeightedImpl<short, schar, float >*/,
0/*addWeightedImpl<short, schar, double>*/
},
{
0/*addWeightedImpl<short, ushort, uchar >*/,
0/*addWeightedImpl<short, ushort, schar >*/,
0/*addWeightedImpl<short, ushort, ushort>*/,
0/*addWeightedImpl<short, ushort, short >*/,
0/*addWeightedImpl<short, ushort, int >*/,
0/*addWeightedImpl<short, ushort, float >*/,
0/*addWeightedImpl<short, ushort, double>*/
},
{
addWeightedImpl<short, short, uchar >,
addWeightedImpl<short, short, schar >,
addWeightedImpl<short, short, ushort>,
addWeightedImpl<short, short, short >,
addWeightedImpl<short, short, int >,
addWeightedImpl<short, short, float >,
addWeightedImpl<short, short, double>
},
{
addWeightedImpl<short, int, uchar >,
addWeightedImpl<short, int, schar >,
addWeightedImpl<short, int, ushort>,
addWeightedImpl<short, int, short >,
addWeightedImpl<short, int, int >,
addWeightedImpl<short, int, float >,
addWeightedImpl<short, int, double>
},
{
addWeightedImpl<short, float, uchar >,
addWeightedImpl<short, float, schar >,
addWeightedImpl<short, float, ushort>,
addWeightedImpl<short, float, short >,
addWeightedImpl<short, float, int >,
addWeightedImpl<short, float, float >,
addWeightedImpl<short, float, double>
},
{
addWeightedImpl<short, double, uchar >,
addWeightedImpl<short, double, schar >,
addWeightedImpl<short, double, ushort>,
addWeightedImpl<short, double, short >,
addWeightedImpl<short, double, int >,
addWeightedImpl<short, double, float >,
addWeightedImpl<short, double, double>
}
},
{
{
0/*addWeightedImpl<int, uchar, uchar >*/,
0/*addWeightedImpl<int, uchar, schar >*/,
0/*addWeightedImpl<int, uchar, ushort>*/,
0/*addWeightedImpl<int, uchar, short >*/,
0/*addWeightedImpl<int, uchar, int >*/,
0/*addWeightedImpl<int, uchar, float >*/,
0/*addWeightedImpl<int, uchar, double>*/
},
{
0/*addWeightedImpl<int, schar, uchar >*/,
0/*addWeightedImpl<int, schar, schar >*/,
0/*addWeightedImpl<int, schar, ushort>*/,
0/*addWeightedImpl<int, schar, short >*/,
0/*addWeightedImpl<int, schar, int >*/,
0/*addWeightedImpl<int, schar, float >*/,
0/*addWeightedImpl<int, schar, double>*/
},
{
0/*addWeightedImpl<int, ushort, uchar >*/,
0/*addWeightedImpl<int, ushort, schar >*/,
0/*addWeightedImpl<int, ushort, ushort>*/,
0/*addWeightedImpl<int, ushort, short >*/,
0/*addWeightedImpl<int, ushort, int >*/,
0/*addWeightedImpl<int, ushort, float >*/,
0/*addWeightedImpl<int, ushort, double>*/
},
{
0/*addWeightedImpl<int, short, uchar >*/,
0/*addWeightedImpl<int, short, schar >*/,
0/*addWeightedImpl<int, short, ushort>*/,
0/*addWeightedImpl<int, short, short >*/,
0/*addWeightedImpl<int, short, int >*/,
0/*addWeightedImpl<int, short, float >*/,
0/*addWeightedImpl<int, short, double>*/
},
{
addWeightedImpl<int, int, uchar >,
addWeightedImpl<int, int, schar >,
addWeightedImpl<int, int, ushort>,
addWeightedImpl<int, int, short >,
addWeightedImpl<int, int, int >,
addWeightedImpl<int, int, float >,
addWeightedImpl<int, int, double>
},
{
addWeightedImpl<int, float, uchar >,
addWeightedImpl<int, float, schar >,
addWeightedImpl<int, float, ushort>,
addWeightedImpl<int, float, short >,
addWeightedImpl<int, float, int >,
addWeightedImpl<int, float, float >,
addWeightedImpl<int, float, double>
},
{
addWeightedImpl<int, double, uchar >,
addWeightedImpl<int, double, schar >,
addWeightedImpl<int, double, ushort>,
addWeightedImpl<int, double, short >,
addWeightedImpl<int, double, int >,
addWeightedImpl<int, double, float >,
addWeightedImpl<int, double, double>
}
},
{
{
0/*addWeightedImpl<float, uchar, uchar >*/,
0/*addWeightedImpl<float, uchar, schar >*/,
0/*addWeightedImpl<float, uchar, ushort>*/,
0/*addWeightedImpl<float, uchar, short >*/,
0/*addWeightedImpl<float, uchar, int >*/,
0/*addWeightedImpl<float, uchar, float >*/,
0/*addWeightedImpl<float, uchar, double>*/
},
{
0/*addWeightedImpl<float, schar, uchar >*/,
0/*addWeightedImpl<float, schar, schar >*/,
0/*addWeightedImpl<float, schar, ushort>*/,
0/*addWeightedImpl<float, schar, short >*/,
0/*addWeightedImpl<float, schar, int >*/,
0/*addWeightedImpl<float, schar, float >*/,
0/*addWeightedImpl<float, schar, double>*/
},
{
0/*addWeightedImpl<float, ushort, uchar >*/,
0/*addWeightedImpl<float, ushort, schar >*/,
0/*addWeightedImpl<float, ushort, ushort>*/,
0/*addWeightedImpl<float, ushort, short >*/,
0/*addWeightedImpl<float, ushort, int >*/,
0/*addWeightedImpl<float, ushort, float >*/,
0/*addWeightedImpl<float, ushort, double>*/
},
{
0/*addWeightedImpl<float, short, uchar >*/,
0/*addWeightedImpl<float, short, schar >*/,
0/*addWeightedImpl<float, short, ushort>*/,
0/*addWeightedImpl<float, short, short >*/,
0/*addWeightedImpl<float, short, int >*/,
0/*addWeightedImpl<float, short, float >*/,
0/*addWeightedImpl<float, short, double>*/
},
{
0/*addWeightedImpl<float, int, uchar >*/,
0/*addWeightedImpl<float, int, schar >*/,
0/*addWeightedImpl<float, int, ushort>*/,
0/*addWeightedImpl<float, int, short >*/,
0/*addWeightedImpl<float, int, int >*/,
0/*addWeightedImpl<float, int, float >*/,
0/*addWeightedImpl<float, int, double>*/
},
{
addWeightedImpl<float, float, uchar >,
addWeightedImpl<float, float, schar >,
addWeightedImpl<float, float, ushort>,
addWeightedImpl<float, float, short >,
addWeightedImpl<float, float, int >,
addWeightedImpl<float, float, float >,
addWeightedImpl<float, float, double>
},
{
addWeightedImpl<float, double, uchar >,
addWeightedImpl<float, double, schar >,
addWeightedImpl<float, double, ushort>,
addWeightedImpl<float, double, short >,
addWeightedImpl<float, double, int >,
addWeightedImpl<float, double, float >,
addWeightedImpl<float, double, double>
}
},
{
{
0/*addWeightedImpl<double, uchar, uchar >*/,
0/*addWeightedImpl<double, uchar, schar >*/,
0/*addWeightedImpl<double, uchar, ushort>*/,
0/*addWeightedImpl<double, uchar, short >*/,
0/*addWeightedImpl<double, uchar, int >*/,
0/*addWeightedImpl<double, uchar, float >*/,
0/*addWeightedImpl<double, uchar, double>*/
},
{
0/*addWeightedImpl<double, schar, uchar >*/,
0/*addWeightedImpl<double, schar, schar >*/,
0/*addWeightedImpl<double, schar, ushort>*/,
0/*addWeightedImpl<double, schar, short >*/,
0/*addWeightedImpl<double, schar, int >*/,
0/*addWeightedImpl<double, schar, float >*/,
0/*addWeightedImpl<double, schar, double>*/
},
{
0/*addWeightedImpl<double, ushort, uchar >*/,
0/*addWeightedImpl<double, ushort, schar >*/,
0/*addWeightedImpl<double, ushort, ushort>*/,
0/*addWeightedImpl<double, ushort, short >*/,
0/*addWeightedImpl<double, ushort, int >*/,
0/*addWeightedImpl<double, ushort, float >*/,
0/*addWeightedImpl<double, ushort, double>*/
},
{
0/*addWeightedImpl<double, short, uchar >*/,
0/*addWeightedImpl<double, short, schar >*/,
0/*addWeightedImpl<double, short, ushort>*/,
0/*addWeightedImpl<double, short, short >*/,
0/*addWeightedImpl<double, short, int >*/,
0/*addWeightedImpl<double, short, float >*/,
0/*addWeightedImpl<double, short, double>*/
},
{
0/*addWeightedImpl<double, int, uchar >*/,
0/*addWeightedImpl<double, int, schar >*/,
0/*addWeightedImpl<double, int, ushort>*/,
0/*addWeightedImpl<double, int, short >*/,
0/*addWeightedImpl<double, int, int >*/,
0/*addWeightedImpl<double, int, float >*/,
0/*addWeightedImpl<double, int, double>*/
},
{
0/*addWeightedImpl<double, float, uchar >*/,
0/*addWeightedImpl<double, float, schar >*/,
0/*addWeightedImpl<double, float, ushort>*/,
0/*addWeightedImpl<double, float, short >*/,
0/*addWeightedImpl<double, float, int >*/,
0/*addWeightedImpl<double, float, float >*/,
0/*addWeightedImpl<double, float, double>*/
},
{
addWeightedImpl<double, double, uchar >,
addWeightedImpl<double, double, schar >,
addWeightedImpl<double, double, ushort>,
addWeightedImpl<double, double, short >,
addWeightedImpl<double, double, int >,
addWeightedImpl<double, double, float >,
addWeightedImpl<double, double, double>
}
}
};
template void addWeighted<int, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
template void addWeighted<int, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
int sdepth1 = src1.depth();
int sdepth2 = src2.depth();
template void addWeighted<int, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
ddepth = ddepth >= 0 ? CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2);
const int cn = src1.channels();
CV_DbgAssert( src2.size() == src1.size() && src2.channels() == cn );
CV_DbgAssert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F );
_dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
GpuMat dst = _dst.getGpuMat();
template void addWeighted<float, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);
template void addWeighted<float, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
if (sdepth1 > sdepth2)
{
src1_.swap(src2_);
std::swap(alpha, beta);
std::swap(sdepth1, sdepth2);
}
const func_t func = funcs[sdepth1][sdepth2][ddepth];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
template void addWeighted<double, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
func(src1_, alpha, src2_, beta, gamma, dst_, stream);
}
#endif /* CUDA_DISABLER */
#endif

@ -1,145 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __ARITHM_FUNC_TRAITS_HPP__
#define __ARITHM_FUNC_TRAITS_HPP__
#include <cstddef>
namespace arithm
{
template <size_t src_size, size_t dst_size> struct ArithmFuncTraits
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 1 };
};
template <> struct ArithmFuncTraits<1, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<1, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<1, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
}
#endif // __ARITHM_FUNC_TRAITS_HPP__

@ -40,87 +40,187 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< bit_not<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
template <typename T> struct TransformFunctorTraits< bit_and<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
using namespace cv::cudev;
template <typename T> struct TransformFunctorTraits< bit_or<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
template <typename T> struct TransformFunctorTraits< bit_xor<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
//////////////////////////////////////////////////////////////////////////////
/// bitwise_not
namespace arithm
void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
{
template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
const int depth = src.depth();
CV_DbgAssert( depth <= CV_32F );
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
if (mask.empty())
{
if (mask.data)
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
const int bcols = (int) (src.cols * src.elemSize());
if ((bcols & 3) == 0)
{
const int vcols = bcols >> 2;
GlobPtrSz<uint> vsrc = globPtr((uint*) src.data, src.step, src.rows, vcols);
GlobPtrSz<uint> vdst = globPtr((uint*) dst.data, dst.step, src.rows, vcols);
gridTransformUnary(vsrc, vdst, bit_not<uint>(), stream);
}
else if ((bcols & 1) == 0)
{
const int vcols = bcols >> 1;
GlobPtrSz<ushort> vsrc = globPtr((ushort*) src.data, src.step, src.rows, vcols);
GlobPtrSz<ushort> vdst = globPtr((ushort*) dst.data, dst.step, src.rows, vcols);
gridTransformUnary(vsrc, vdst, bit_not<ushort>(), stream);
}
else
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream);
}
{
GlobPtrSz<uchar> vsrc = globPtr((uchar*) src.data, src.step, src.rows, bcols);
GlobPtrSz<uchar> vdst = globPtr((uchar*) dst.data, dst.step, src.rows, bcols);
template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
gridTransformUnary(vsrc, vdst, bit_not<uchar>(), stream);
}
}
else
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
if (depth == CV_32F || depth == CV_32S)
{
GlobPtrSz<uint> vsrc = globPtr((uint*) src.data, src.step, src.rows, src.cols * src.channels());
GlobPtrSz<uint> vdst = globPtr((uint*) dst.data, dst.step, src.rows, src.cols * src.channels());
gridTransformUnary(vsrc, vdst, bit_not<uint>(), singleMaskChannels(globPtr<uchar>(mask), src.channels()), stream);
}
else if (depth == CV_16S || depth == CV_16U)
{
GlobPtrSz<ushort> vsrc = globPtr((ushort*) src.data, src.step, src.rows, src.cols * src.channels());
GlobPtrSz<ushort> vdst = globPtr((ushort*) dst.data, dst.step, src.rows, src.cols * src.channels());
gridTransformUnary(vsrc, vdst, bit_not<ushort>(), singleMaskChannels(globPtr<uchar>(mask), src.channels()), stream);
}
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
{
GlobPtrSz<uchar> vsrc = globPtr((uchar*) src.data, src.step, src.rows, src.cols * src.channels());
GlobPtrSz<uchar> vdst = globPtr((uchar*) dst.data, dst.step, src.rows, src.cols * src.channels());
gridTransformUnary(vsrc, vdst, bit_not<uchar>(), singleMaskChannels(globPtr<uchar>(mask), src.channels()), stream);
}
}
}
template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
//////////////////////////////////////////////////////////////////////////////
/// Binary bitwise logical operations
namespace
{
template <template <typename> class Op, typename T>
void bitMatOp(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
GlobPtrSz<T> vsrc1 = globPtr((T*) src1.data, src1.step, src1.rows, src1.cols * src1.channels());
GlobPtrSz<T> vsrc2 = globPtr((T*) src2.data, src2.step, src1.rows, src1.cols * src1.channels());
GlobPtrSz<T> vdst = globPtr((T*) dst.data, dst.step, src1.rows, src1.cols * src1.channels());
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream);
gridTransformBinary(vsrc1, vsrc2, vdst, Op<T>(), singleMaskChannels(globPtr<uchar>(mask), src1.channels()), stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
gridTransformBinary(vsrc1, vsrc2, vdst, Op<T>(), stream);
}
}
template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream);
static const func_t funcs32[] =
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
}
bitMatOp<bit_and, uint>,
bitMatOp<bit_or, uint>,
bitMatOp<bit_xor, uint>
};
static const func_t funcs16[] =
{
bitMatOp<bit_and, ushort>,
bitMatOp<bit_or, ushort>,
bitMatOp<bit_xor, ushort>
};
static const func_t funcs8[] =
{
bitMatOp<bit_and, uchar>,
bitMatOp<bit_or, uchar>,
bitMatOp<bit_xor, uchar>
};
const int depth = src1.depth();
CV_DbgAssert( depth <= CV_32F );
CV_DbgAssert( op >= 0 && op < 3 );
if (mask.empty())
{
const int bcols = (int) (src1.cols * src1.elemSize());
if ((bcols & 3) == 0)
{
const int vcols = bcols >> 2;
template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
GpuMat vsrc1(src1.rows, vcols, CV_32SC1, src1.data, src1.step);
GpuMat vsrc2(src1.rows, vcols, CV_32SC1, src2.data, src2.step);
GpuMat vdst(src1.rows, vcols, CV_32SC1, dst.data, dst.step);
template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
funcs32[op](vsrc1, vsrc2, vdst, GpuMat(), stream);
}
else if ((bcols & 1) == 0)
{
const int vcols = bcols >> 1;
template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
GpuMat vsrc1(src1.rows, vcols, CV_16UC1, src1.data, src1.step);
GpuMat vsrc2(src1.rows, vcols, CV_16UC1, src2.data, src2.step);
GpuMat vdst(src1.rows, vcols, CV_16UC1, dst.data, dst.step);
template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
funcs16[op](vsrc1, vsrc2, vdst, GpuMat(), stream);
}
else
{
GpuMat vsrc1(src1.rows, bcols, CV_8UC1, src1.data, src1.step);
GpuMat vsrc2(src1.rows, bcols, CV_8UC1, src2.data, src2.step);
GpuMat vdst(src1.rows, bcols, CV_8UC1, dst.data, dst.step);
funcs8[op](vsrc1, vsrc2, vdst, GpuMat(), stream);
}
}
else
{
if (depth == CV_32F || depth == CV_32S)
{
funcs32[op](src1, src2, dst, mask, stream);
}
else if (depth == CV_16S || depth == CV_16U)
{
funcs16[op](src1, src2, dst, mask, stream);
}
else
{
funcs8[op](src1, src2, dst, mask, stream);
}
}
}
#endif // CUDA_DISABLER
#endif

@ -40,65 +40,132 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace cv { namespace cuda { namespace device
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv::cudev;
void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
namespace
{
template <typename T> struct TransformFunctorTraits< binder2nd< bit_and<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <template <typename> class Op, typename T>
void bitScalarOp(const GpuMat& src, uint value, GpuMat& dst, Stream& stream)
{
};
gridTransformUnary(globPtr<T>(src), globPtr<T>(dst), bind2nd(Op<T>(), value), stream);
}
typedef void (*bit_scalar_func_t)(const GpuMat& src, uint value, GpuMat& dst, Stream& stream);
template <typename T> struct TransformFunctorTraits< binder2nd< bit_or<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename T, bit_scalar_func_t func> struct BitScalar
{
static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
{
func(src, cv::saturate_cast<T>(value[0]), dst, stream);
}
};
template <typename T> struct TransformFunctorTraits< binder2nd< bit_xor<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <bit_scalar_func_t func> struct BitScalar4
{
static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
{
uint packedVal = 0;
packedVal |= cv::saturate_cast<uchar>(value[0]);
packedVal |= cv::saturate_cast<uchar>(value[1]) << 8;
packedVal |= cv::saturate_cast<uchar>(value[2]) << 16;
packedVal |= cv::saturate_cast<uchar>(value[3]) << 24;
func(src, packedVal, dst, stream);
}
};
}}}
namespace arithm
{
template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
template <int DEPTH, int cn> struct NppBitwiseCFunc
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
}
typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
};
template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
template <int DEPTH, int cn, typename NppBitwiseCFunc<DEPTH, cn>::func_t func> struct NppBitwiseC
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
}
typedef typename NppBitwiseCFunc<DEPTH, cn>::npp_type npp_type;
static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& _stream)
{
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);
NppiSize oSizeROI;
oSizeROI.width = src.cols;
oSizeROI.height = src.rows;
const npp_type pConstants[] =
{
cv::saturate_cast<npp_type>(value[0]),
cv::saturate_cast<npp_type>(value[1]),
cv::saturate_cast<npp_type>(value[2]),
cv::saturate_cast<npp_type>(value[3])
};
nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
if (stream == 0)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
};
}
void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
{
(void) mask;
template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
static const func_t funcs[3][6][4] =
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
}
{
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
},
{
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
},
{
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
}
};
template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
const int depth = src.depth();
const int cn = src.channels();
template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
CV_DbgAssert( depth <= CV_32F );
CV_DbgAssert( cn == 1 || cn == 3 || cn == 4 );
CV_DbgAssert( mask.empty() );
CV_DbgAssert( op >= 0 && op < 3 );
template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
funcs[op][depth][cn - 1](src, value, dst, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,30 +40,54 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void cmpMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop);
namespace
{
template <class Op, typename T> struct CmpOp : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(T a, T b) const
{
Op op;
return -op(a, b);
}
};
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
template <template <typename> class Op, typename T>
void cmpMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
CmpOp<Op<T>, T> op;
gridTransformBinary_< TransformPolicy<T> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<uchar>(dst), op, stream);
}
struct VCmpEq4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpeq4(a, b);
}
__host__ __device__ __forceinline__ VCmpEq4() {}
__host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{
@ -71,9 +95,6 @@ namespace arithm
{
return vcmpne4(a, b);
}
__host__ __device__ __forceinline__ VCmpNe4() {}
__host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{
@ -81,9 +102,6 @@ namespace arithm
{
return vcmplt4(a, b);
}
__host__ __device__ __forceinline__ VCmpLt4() {}
__host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{
@ -91,116 +109,111 @@ namespace arithm
{
return vcmple4(a, b);
}
__host__ __device__ __forceinline__ VCmpLe4() {}
__host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
};
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(T a, T b) const
{
Op op;
return -op(a, b);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
void cmpMatEq_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
};
const int vcols = src1.cols >> 2;
template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
};
}}}
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
gridTransformBinary(src1_, src2_, dst_, VCmpEq4(), stream);
}
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
void cmpMatNe_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
device::transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
const int vcols = src1.cols >> 2;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, VCmpNe4(), stream);
}
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
void cmpMatLt_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
device::transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
const int vcols = src1.cols >> 2;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, VCmpLt4(), stream);
}
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
void cmpMatLe_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
device::transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
const int vcols = src1.cols >> 2;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, VCmpLe4(), stream);
}
}
template <template <typename> class Op, typename T>
void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void cmpMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
static const func_t funcs[7][4] =
{
{cmpMat_v1<equal_to, uchar> , cmpMat_v1<not_equal_to, uchar> , cmpMat_v1<less, uchar> , cmpMat_v1<less_equal, uchar> },
{cmpMat_v1<equal_to, schar> , cmpMat_v1<not_equal_to, schar> , cmpMat_v1<less, schar> , cmpMat_v1<less_equal, schar> },
{cmpMat_v1<equal_to, ushort>, cmpMat_v1<not_equal_to, ushort>, cmpMat_v1<less, ushort>, cmpMat_v1<less_equal, ushort>},
{cmpMat_v1<equal_to, short> , cmpMat_v1<not_equal_to, short> , cmpMat_v1<less, short> , cmpMat_v1<less_equal, short> },
{cmpMat_v1<equal_to, int> , cmpMat_v1<not_equal_to, int> , cmpMat_v1<less, int> , cmpMat_v1<less_equal, int> },
{cmpMat_v1<equal_to, float> , cmpMat_v1<not_equal_to, float> , cmpMat_v1<less, float> , cmpMat_v1<less_equal, float> },
{cmpMat_v1<equal_to, double>, cmpMat_v1<not_equal_to, double>, cmpMat_v1<less, double>, cmpMat_v1<less_equal, double>}
};
typedef void (*func_v4_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
static const func_v4_t funcs_v4[] =
{
Cmp<Op<T>, T> op;
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, dst, op, WithOutMask(), stream);
}
cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
};
const int depth = src1.depth();
template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
CV_DbgAssert( depth <= CV_64F );
static const int codes[] =
{
cmpMat<equal_to, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
0, 2, 3, 2, 3, 1
};
const GpuMat* psrc1[] =
{
cmpMat<not_equal_to, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
&src1, &src2, &src2, &src1, &src1, &src1
};
const GpuMat* psrc2[] =
{
cmpMat<less, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
&src2, &src1, &src1, &src2, &src2, &src2
};
const int code = codes[cmpop];
GpuMat src1_ = psrc1[cmpop]->reshape(1);
GpuMat src2_ = psrc2[cmpop]->reshape(1);
GpuMat dst_ = dst.reshape(1);
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
cmpMat<less_equal, T>(src1, src2, dst, stream);
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (isAllAligned)
{
funcs_v4[code](src1_, src2_, dst_, stream);
return;
}
}
template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
const func_t func = funcs[depth][code];
func(src1_, src2_, dst_, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,24 +40,23 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void cmpScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop);
namespace
{
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
template <class Op, typename T> struct CmpOp : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(T a, T b) const
{
@ -66,219 +65,161 @@ namespace arithm
}
};
#define TYPE_VEC(type, cn) typename TypeVec<type, cn>::vec_type
#define MAKE_VEC(_type, _cn) typename MakeVec<_type, _cn>::type
template <class Op, typename T, int cn> struct CmpScalarOp;
template <class Op, typename T, int cn> struct CmpScalar;
template <class Op, typename T>
struct CmpScalar<Op, T, 1> : unary_function<T, uchar>
struct CmpScalarOp<Op, T, 1> : unary_function<T, uchar>
{
T val;
__host__ explicit CmpScalar(T val_) : val(val_) {}
__device__ __forceinline__ uchar operator()(T src) const
{
Cmp<Op, T> op;
CmpOp<Op, T> op;
return op(src, val);
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 2> : unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
struct CmpScalarOp<Op, T, 2> : unary_function<MAKE_VEC(T, 2), MAKE_VEC(uchar, 2)>
{
TYPE_VEC(T, 2) val;
MAKE_VEC(T, 2) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
__device__ __forceinline__ MAKE_VEC(uchar, 2) operator()(const MAKE_VEC(T, 2) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 2)>::make(op(src.x, val.x), op(src.y, val.y));
CmpOp<Op, T> op;
return VecTraits<MAKE_VEC(uchar, 2)>::make(op(src.x, val.x), op(src.y, val.y));
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 3> : unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
struct CmpScalarOp<Op, T, 3> : unary_function<MAKE_VEC(T, 3), MAKE_VEC(uchar, 3)>
{
TYPE_VEC(T, 3) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
MAKE_VEC(T, 3) val;
__device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
__device__ __forceinline__ MAKE_VEC(uchar, 3) operator()(const MAKE_VEC(T, 3) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z));
CmpOp<Op, T> op;
return VecTraits<MAKE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z));
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 4> : unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
struct CmpScalarOp<Op, T, 4> : unary_function<MAKE_VEC(T, 4), MAKE_VEC(uchar, 4)>
{
TYPE_VEC(T, 4) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
MAKE_VEC(T, 4) val;
__device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
__device__ __forceinline__ MAKE_VEC(uchar, 4) operator()(const MAKE_VEC(T, 4) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w));
CmpOp<Op, T> op;
return VecTraits<MAKE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w));
}
};
#undef TYPE_VEC
}
namespace cv { namespace cuda { namespace device
{
template <class Op, typename T> struct TransformFunctorTraits< arithm::CmpScalar<Op, T, 1> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
}}}
namespace arithm
{
template <template <typename> class Op, typename T, int cn>
void cmpScalar(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream)
void cmpScalarImpl(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
{
typedef typename TypeVec<T, cn>::vec_type src_t;
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<uchar, cn>::type dst_type;
cv::Scalar_<T> value_ = value;
T sval[] = {static_cast<T>(val[0]), static_cast<T>(val[1]), static_cast<T>(val[2]), static_cast<T>(val[3])};
src_t val1 = VecTraits<src_t>::make(sval);
CmpScalarOp<Op<T>, T, cn> op;
op.val = VecTraits<src_type>::make(value_.val);
CmpScalar<Op<T>, T, cn> op(val1);
device::transform((PtrStepSz<src_t>) src, (PtrStepSz<dst_t>) dst, op, WithOutMask(), stream);
gridTransformUnary_< TransformPolicy<T> >(globPtr<src_type>(src), globPtr<dst_type>(dst), op, stream);
}
}
template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
void cmpScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop)
{
typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
static const func_t funcs[7][6][4] =
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<equal_to, T, 1>,
cmpScalar<equal_to, T, 2>,
cmpScalar<equal_to, T, 3>,
cmpScalar<equal_to, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{cmpScalarImpl<equal_to, uchar, 1>, cmpScalarImpl<equal_to, uchar, 2>, cmpScalarImpl<equal_to, uchar, 3>, cmpScalarImpl<equal_to, uchar, 4>},
{cmpScalarImpl<greater, uchar, 1>, cmpScalarImpl<greater, uchar, 2>, cmpScalarImpl<greater, uchar, 3>, cmpScalarImpl<greater, uchar, 4>},
{cmpScalarImpl<greater_equal, uchar, 1>, cmpScalarImpl<greater_equal, uchar, 2>, cmpScalarImpl<greater_equal, uchar, 3>, cmpScalarImpl<greater_equal, uchar, 4>},
{cmpScalarImpl<less, uchar, 1>, cmpScalarImpl<less, uchar, 2>, cmpScalarImpl<less, uchar, 3>, cmpScalarImpl<less, uchar, 4>},
{cmpScalarImpl<less_equal, uchar, 1>, cmpScalarImpl<less_equal, uchar, 2>, cmpScalarImpl<less_equal, uchar, 3>, cmpScalarImpl<less_equal, uchar, 4>},
{cmpScalarImpl<not_equal_to, uchar, 1>, cmpScalarImpl<not_equal_to, uchar, 2>, cmpScalarImpl<not_equal_to, uchar, 3>, cmpScalarImpl<not_equal_to, uchar, 4>}
},
{
0,
cmpScalar<not_equal_to, T, 1>,
cmpScalar<not_equal_to, T, 2>,
cmpScalar<not_equal_to, T, 3>,
cmpScalar<not_equal_to, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{cmpScalarImpl<equal_to, schar, 1>, cmpScalarImpl<equal_to, schar, 2>, cmpScalarImpl<equal_to, schar, 3>, cmpScalarImpl<equal_to, schar, 4>},
{cmpScalarImpl<greater, schar, 1>, cmpScalarImpl<greater, schar, 2>, cmpScalarImpl<greater, schar, 3>, cmpScalarImpl<greater, schar, 4>},
{cmpScalarImpl<greater_equal, schar, 1>, cmpScalarImpl<greater_equal, schar, 2>, cmpScalarImpl<greater_equal, schar, 3>, cmpScalarImpl<greater_equal, schar, 4>},
{cmpScalarImpl<less, schar, 1>, cmpScalarImpl<less, schar, 2>, cmpScalarImpl<less, schar, 3>, cmpScalarImpl<less, schar, 4>},
{cmpScalarImpl<less_equal, schar, 1>, cmpScalarImpl<less_equal, schar, 2>, cmpScalarImpl<less_equal, schar, 3>, cmpScalarImpl<less_equal, schar, 4>},
{cmpScalarImpl<not_equal_to, schar, 1>, cmpScalarImpl<not_equal_to, schar, 2>, cmpScalarImpl<not_equal_to, schar, 3>, cmpScalarImpl<not_equal_to, schar, 4>}
},
{
0,
cmpScalar<less, T, 1>,
cmpScalar<less, T, 2>,
cmpScalar<less, T, 3>,
cmpScalar<less, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{cmpScalarImpl<equal_to, ushort, 1>, cmpScalarImpl<equal_to, ushort, 2>, cmpScalarImpl<equal_to, ushort, 3>, cmpScalarImpl<equal_to, ushort, 4>},
{cmpScalarImpl<greater, ushort, 1>, cmpScalarImpl<greater, ushort, 2>, cmpScalarImpl<greater, ushort, 3>, cmpScalarImpl<greater, ushort, 4>},
{cmpScalarImpl<greater_equal, ushort, 1>, cmpScalarImpl<greater_equal, ushort, 2>, cmpScalarImpl<greater_equal, ushort, 3>, cmpScalarImpl<greater_equal, ushort, 4>},
{cmpScalarImpl<less, ushort, 1>, cmpScalarImpl<less, ushort, 2>, cmpScalarImpl<less, ushort, 3>, cmpScalarImpl<less, ushort, 4>},
{cmpScalarImpl<less_equal, ushort, 1>, cmpScalarImpl<less_equal, ushort, 2>, cmpScalarImpl<less_equal, ushort, 3>, cmpScalarImpl<less_equal, ushort, 4>},
{cmpScalarImpl<not_equal_to, ushort, 1>, cmpScalarImpl<not_equal_to, ushort, 2>, cmpScalarImpl<not_equal_to, ushort, 3>, cmpScalarImpl<not_equal_to, ushort, 4>}
},
{
0,
cmpScalar<less_equal, T, 1>,
cmpScalar<less_equal, T, 2>,
cmpScalar<less_equal, T, 3>,
cmpScalar<less_equal, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{cmpScalarImpl<equal_to, short, 1>, cmpScalarImpl<equal_to, short, 2>, cmpScalarImpl<equal_to, short, 3>, cmpScalarImpl<equal_to, short, 4>},
{cmpScalarImpl<greater, short, 1>, cmpScalarImpl<greater, short, 2>, cmpScalarImpl<greater, short, 3>, cmpScalarImpl<greater, short, 4>},
{cmpScalarImpl<greater_equal, short, 1>, cmpScalarImpl<greater_equal, short, 2>, cmpScalarImpl<greater_equal, short, 3>, cmpScalarImpl<greater_equal, short, 4>},
{cmpScalarImpl<less, short, 1>, cmpScalarImpl<less, short, 2>, cmpScalarImpl<less, short, 3>, cmpScalarImpl<less, short, 4>},
{cmpScalarImpl<less_equal, short, 1>, cmpScalarImpl<less_equal, short, 2>, cmpScalarImpl<less_equal, short, 3>, cmpScalarImpl<less_equal, short, 4>},
{cmpScalarImpl<not_equal_to, short, 1>, cmpScalarImpl<not_equal_to, short, 2>, cmpScalarImpl<not_equal_to, short, 3>, cmpScalarImpl<not_equal_to, short, 4>}
},
{
0,
cmpScalar<greater, T, 1>,
cmpScalar<greater, T, 2>,
cmpScalar<greater, T, 3>,
cmpScalar<greater, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{cmpScalarImpl<equal_to, int, 1>, cmpScalarImpl<equal_to, int, 2>, cmpScalarImpl<equal_to, int, 3>, cmpScalarImpl<equal_to, int, 4>},
{cmpScalarImpl<greater, int, 1>, cmpScalarImpl<greater, int, 2>, cmpScalarImpl<greater, int, 3>, cmpScalarImpl<greater, int, 4>},
{cmpScalarImpl<greater_equal, int, 1>, cmpScalarImpl<greater_equal, int, 2>, cmpScalarImpl<greater_equal, int, 3>, cmpScalarImpl<greater_equal, int, 4>},
{cmpScalarImpl<less, int, 1>, cmpScalarImpl<less, int, 2>, cmpScalarImpl<less, int, 3>, cmpScalarImpl<less, int, 4>},
{cmpScalarImpl<less_equal, int, 1>, cmpScalarImpl<less_equal, int, 2>, cmpScalarImpl<less_equal, int, 3>, cmpScalarImpl<less_equal, int, 4>},
{cmpScalarImpl<not_equal_to, int, 1>, cmpScalarImpl<not_equal_to, int, 2>, cmpScalarImpl<not_equal_to, int, 3>, cmpScalarImpl<not_equal_to, int, 4>}
},
{
0,
cmpScalar<greater_equal, T, 1>,
cmpScalar<greater_equal, T, 2>,
cmpScalar<greater_equal, T, 3>,
cmpScalar<greater_equal, T, 4>
};
{cmpScalarImpl<equal_to, float, 1>, cmpScalarImpl<equal_to, float, 2>, cmpScalarImpl<equal_to, float, 3>, cmpScalarImpl<equal_to, float, 4>},
{cmpScalarImpl<greater, float, 1>, cmpScalarImpl<greater, float, 2>, cmpScalarImpl<greater, float, 3>, cmpScalarImpl<greater, float, 4>},
{cmpScalarImpl<greater_equal, float, 1>, cmpScalarImpl<greater_equal, float, 2>, cmpScalarImpl<greater_equal, float, 3>, cmpScalarImpl<greater_equal, float, 4>},
{cmpScalarImpl<less, float, 1>, cmpScalarImpl<less, float, 2>, cmpScalarImpl<less, float, 3>, cmpScalarImpl<less, float, 4>},
{cmpScalarImpl<less_equal, float, 1>, cmpScalarImpl<less_equal, float, 2>, cmpScalarImpl<less_equal, float, 3>, cmpScalarImpl<less_equal, float, 4>},
{cmpScalarImpl<not_equal_to, float, 1>, cmpScalarImpl<not_equal_to, float, 2>, cmpScalarImpl<not_equal_to, float, 3>, cmpScalarImpl<not_equal_to, float, 4>}
},
{
{cmpScalarImpl<equal_to, double, 1>, cmpScalarImpl<equal_to, double, 2>, cmpScalarImpl<equal_to, double, 3>, cmpScalarImpl<equal_to, double, 4>},
{cmpScalarImpl<greater, double, 1>, cmpScalarImpl<greater, double, 2>, cmpScalarImpl<greater, double, 3>, cmpScalarImpl<greater, double, 4>},
{cmpScalarImpl<greater_equal, double, 1>, cmpScalarImpl<greater_equal, double, 2>, cmpScalarImpl<greater_equal, double, 3>, cmpScalarImpl<greater_equal, double, 4>},
{cmpScalarImpl<less, double, 1>, cmpScalarImpl<less, double, 2>, cmpScalarImpl<less, double, 3>, cmpScalarImpl<less, double, 4>},
{cmpScalarImpl<less_equal, double, 1>, cmpScalarImpl<less_equal, double, 2>, cmpScalarImpl<less_equal, double, 3>, cmpScalarImpl<less_equal, double, 4>},
{cmpScalarImpl<not_equal_to, double, 1>, cmpScalarImpl<not_equal_to, double, 2>, cmpScalarImpl<not_equal_to, double, 3>, cmpScalarImpl<not_equal_to, double, 4>}
}
};
funcs[cn](src, val, dst, stream);
if (inv)
{
// src1 is a scalar; swap it with src2
cmpop = cmpop == cv::CMP_LT ? cv::CMP_GT : cmpop == cv::CMP_LE ? cv::CMP_GE :
cmpop == cv::CMP_GE ? cv::CMP_LE : cmpop == cv::CMP_GT ? cv::CMP_LT : cmpop;
}
template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
const int depth = src.depth();
const int cn = src.channels();
template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
CV_DbgAssert( depth <= CV_64F && cn <= 4 );
template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
funcs[depth][cmpop][cn - 1](src, val, dst, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,92 +40,116 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#ifndef HAVE_OPENCV_CUDEV
namespace cv { namespace cuda { namespace device
#error "opencv_cudev is required"
#else
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
namespace
{
namespace imgproc
struct ShiftMap
{
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
typedef int2 value_type;
typedef int index_type;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left);
}
int top;
int left;
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
__device__ __forceinline__ int2 operator ()(int y, int x) const
{
static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
return make_int2(x - left, y - top);
}
};
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
struct ShiftMapSz : ShiftMap
{
int rows, cols;
};
}
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
cudaSafeCall( cudaGetLastError() );
namespace cv { namespace cudev {
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <> struct PtrTraits<ShiftMapSz> : PtrTraitsBase<ShiftMapSz, ShiftMap>
{
};
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
}}
namespace
{
template <typename T, int cn>
void copyMakeBorderImpl(const GpuMat& src, GpuMat& dst, int top, int left, int borderMode, cv::Scalar borderValue, Stream& stream)
{
typedef typename MakeVec<T, cn>::type src_type;
cv::Scalar_<T> borderValue_ = borderValue;
const src_type brdVal = VecTraits<src_type>::make(borderValue_.val);
ShiftMapSz map;
map.top = top;
map.left = left;
map.rows = dst.rows;
map.cols = dst.cols;
switch (borderMode)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
case cv::BORDER_CONSTANT:
gridCopy(remapPtr(brdConstant(globPtr<src_type>(src), brdVal), map), globPtr<src_type>(dst), stream);
break;
case cv::BORDER_REPLICATE:
gridCopy(remapPtr(brdReplicate(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
break;
case cv::BORDER_REFLECT:
gridCopy(remapPtr(brdReflect(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
break;
case cv::BORDER_WRAP:
gridCopy(remapPtr(brdWrap(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
break;
case cv::BORDER_REFLECT_101:
gridCopy(remapPtr(brdReflect101(globPtr<src_type>(src)), map), globPtr<src_type>(dst), stream);
break;
};
}
}
void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int top, int left, int borderMode, cv::Scalar borderValue, Stream& stream);
static const func_t funcs[6][4] =
{
{ copyMakeBorderImpl<uchar , 1> , copyMakeBorderImpl<uchar , 2> , copyMakeBorderImpl<uchar , 3> , copyMakeBorderImpl<uchar , 4> },
{0 /*copyMakeBorderImpl<schar , 1>*/, 0 /*copyMakeBorderImpl<schar , 2>*/, 0 /*copyMakeBorderImpl<schar , 3>*/, 0 /*copyMakeBorderImpl<schar , 4>*/},
{ copyMakeBorderImpl<ushort, 1> , 0 /*copyMakeBorderImpl<ushort, 2>*/, copyMakeBorderImpl<ushort, 3> , copyMakeBorderImpl<ushort, 4> },
{ copyMakeBorderImpl<short , 1> , 0 /*copyMakeBorderImpl<short , 2>*/, copyMakeBorderImpl<short , 3> , copyMakeBorderImpl<short , 4> },
{0 /*copyMakeBorderImpl<int , 1>*/, 0 /*copyMakeBorderImpl<int , 2>*/, 0 /*copyMakeBorderImpl<int , 3>*/, 0 /*copyMakeBorderImpl<int , 4>*/},
{ copyMakeBorderImpl<float , 1> , 0 /*copyMakeBorderImpl<float , 2>*/, copyMakeBorderImpl<float , 3> , copyMakeBorderImpl<float ,4> }
};
typedef void (*caller_t)(const PtrStepSz<vec_type>& src, const PtrStepSz<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
GpuMat src = _src.getGpuMat();
static const caller_t callers[5] =
{
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call
};
const int depth = src.depth();
const int cn = src.channels();
callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
}
CV_Assert( depth <= CV_32F && cn <= 4 );
CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
_dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
GpuMat dst = _dst.getGpuMat();
const func_t func = funcs[depth][cn - 1];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
func(src, dst, top, left, borderType, value, stream);
}
template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */
#endif

@ -40,137 +40,57 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#ifndef HAVE_OPENCV_CUDEV
using namespace cv::cuda;
using namespace cv::cuda::device;
#error "opencv_cudev is required"
namespace countNonZero
{
__device__ unsigned int blocks_finished = 0;
template <int BLOCK_SIZE, typename T>
__global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
{
__shared__ unsigned int scount[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int mycount = 0;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
const T srcVal = ptr[x];
mycount += (srcVal != 0);
}
}
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
#if __CUDA_ARCH__ >= 200
if (tid == 0)
::atomicAdd(count, mycount);
#else
__shared__ bool is_last;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
if (tid == 0)
{
count[bid] = mycount;
__threadfence();
unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;
#else
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
if (tid == 0)
{
count[0] = mycount;
using namespace cv::cudev;
blocks_finished = 0;
}
}
#endif
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
namespace
{
template <typename T>
int countNonZeroImpl(const GpuMat& _src, GpuMat& _buf)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
gridCountNonZero(src, buf);
void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
int data;
buf.download(cv::Mat(1, 1, buf.type(), &data));
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
return data;
}
}
template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
int cv::cuda::countNonZero(InputArray _src, GpuMat& buf)
{
typedef int (*func_t)(const GpuMat& _src, GpuMat& _buf);
static const func_t funcs[] =
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
countNonZeroImpl<uchar>,
countNonZeroImpl<schar>,
countNonZeroImpl<ushort>,
countNonZeroImpl<short>,
countNonZeroImpl<int>,
countNonZeroImpl<float>,
countNonZeroImpl<double>
};
unsigned int* count_buf = buf.ptr(0);
GpuMat src = _src.getGpuMat();
cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );
CV_Assert( src.channels() == 1 );
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));
return count;
}
const func_t func = funcs[src.depth()];
template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<int >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
return func(src, buf);
}
#endif // CUDA_DISABLER
#endif

@ -40,191 +40,203 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
{
struct Div_8uc4_32f : binary_function<uint, float, uint>
{
__device__ __forceinline__ uint operator ()(uint a, float b) const
{
uint res = 0;
#include "opencv2/cudev.hpp"
if (b != 0)
{
b = 1.0f / b;
res |= (saturate_cast<uchar>((0xffu & (a )) * b) );
res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
}
return res;
}
};
using namespace cv::cudev;
struct Div_16sc4_32f : binary_function<short4, float, short4>
{
__device__ __forceinline__ short4 operator ()(short4 a, float b) const
{
return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<short>(a.y / b),
saturate_cast<short>(a.z / b), saturate_cast<short>(a.w / b))
: make_short4(0,0,0,0);
}
};
void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int);
void divMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
void divMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
template <typename T, typename D> struct Div : binary_function<T, T, D>
namespace
{
template <typename T, typename D> struct DivOp : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return b != 0 ? saturate_cast<D>(a / b) : 0;
}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, float> : binary_function<T, T, float>
template <typename T> struct DivOp<T, float> : binary_function<T, T, float>
{
__device__ __forceinline__ float operator ()(T a, T b) const
{
return b != 0 ? static_cast<float>(a) / b : 0;
return b != 0 ? static_cast<float>(a) / b : 0.0f;
}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, double> : binary_function<T, T, double>
template <typename T> struct DivOp<T, double> : binary_function<T, T, double>
{
__device__ __forceinline__ double operator ()(T a, T b) const
{
return b != 0 ? static_cast<double>(a) / b : 0;
return b != 0 ? static_cast<double>(a) / b : 0.0;
}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
template <typename T, typename S, typename D> struct DivScaleOp : binary_function<T, T, D>
{
S scale;
__host__ explicit DivScale(S scale_) : scale(scale_) {}
__device__ __forceinline__ D operator ()(T a, T b) const
{
return b != 0 ? saturate_cast<D>(scale * a / b) : 0;
}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits<arithm::Div_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::Div<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScale<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void divMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream);
}
void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream);
}
template <typename T, typename S, typename D>
void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
void divMatImpl(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream)
{
if (scale == 1)
{
Div<T, D> op;
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
DivOp<T, D> op;
gridTransformBinary_< TransformPolicy<S> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), op, stream);
}
else
{
DivScale<T, S, D> op(static_cast<S>(scale));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
DivScaleOp<T, S, D> op;
op.scale = static_cast<S>(scale);
gridTransformBinary_< TransformPolicy<S> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), op, stream);
}
}
}
void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream);
static const func_t funcs[7][7] =
{
{
divMatImpl<uchar, float, uchar>,
divMatImpl<uchar, float, schar>,
divMatImpl<uchar, float, ushort>,
divMatImpl<uchar, float, short>,
divMatImpl<uchar, float, int>,
divMatImpl<uchar, float, float>,
divMatImpl<uchar, double, double>
},
{
divMatImpl<schar, float, uchar>,
divMatImpl<schar, float, schar>,
divMatImpl<schar, float, ushort>,
divMatImpl<schar, float, short>,
divMatImpl<schar, float, int>,
divMatImpl<schar, float, float>,
divMatImpl<schar, double, double>
},
{
0 /*divMatImpl<ushort, float, uchar>*/,
0 /*divMatImpl<ushort, float, schar>*/,
divMatImpl<ushort, float, ushort>,
divMatImpl<ushort, float, short>,
divMatImpl<ushort, float, int>,
divMatImpl<ushort, float, float>,
divMatImpl<ushort, double, double>
},
{
0 /*divMatImpl<short, float, uchar>*/,
0 /*divMatImpl<short, float, schar>*/,
divMatImpl<short, float, ushort>,
divMatImpl<short, float, short>,
divMatImpl<short, float, int>,
divMatImpl<short, float, float>,
divMatImpl<short, double, double>
},
{
0 /*divMatImpl<int, float, uchar>*/,
0 /*divMatImpl<int, float, schar>*/,
0 /*divMatImpl<int, float, ushort>*/,
0 /*divMatImpl<int, float, short>*/,
divMatImpl<int, float, int>,
divMatImpl<int, float, float>,
divMatImpl<int, double, double>
},
{
0 /*divMatImpl<float, float, uchar>*/,
0 /*divMatImpl<float, float, schar>*/,
0 /*divMatImpl<float, float, ushort>*/,
0 /*divMatImpl<float, float, short>*/,
0 /*divMatImpl<float, float, int>*/,
divMatImpl<float, float, float>,
divMatImpl<float, double, double>
},
{
0 /*divMatImpl<double, double, uchar>*/,
0 /*divMatImpl<double, double, schar>*/,
0 /*divMatImpl<double, double, ushort>*/,
0 /*divMatImpl<double, double, short>*/,
0 /*divMatImpl<double, double, int>*/,
0 /*divMatImpl<double, double, float>*/,
divMatImpl<double, double, double>
}
};
const int sdepth = src1.depth();
const int ddepth = dst.depth();
CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F );
GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);
const func_t func = funcs[sdepth][ddepth];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
func(src1_, src2_, dst_, scale, stream);
}
namespace
{
template <typename T>
struct DivOpSpecial : binary_function<T, float, T>
{
__device__ __forceinline__ T operator ()(const T& a, float b) const
{
typedef typename VecTraits<T>::elem_type elem_type;
T res = VecTraits<T>::all(0);
if (b != 0)
{
b = 1.0f / b;
res.x = saturate_cast<elem_type>(a.x * b);
res.y = saturate_cast<elem_type>(a.y * b);
res.z = saturate_cast<elem_type>(a.z * b);
res.w = saturate_cast<elem_type>(a.w * b);
}
template void divMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
return res;
}
};
}
void divMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
gridTransformBinary(globPtr<uchar4>(src1), globPtr<float>(src2), globPtr<uchar4>(dst), DivOpSpecial<uchar4>(), stream);
}
void divMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
gridTransformBinary(globPtr<short4>(src1), globPtr<float>(src2), globPtr<short4>(dst), DivOpSpecial<short4>(), stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,129 +40,221 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void divScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int);
namespace
{
template <typename T, typename S, typename D> struct DivScalar : unary_function<T, D>
template <typename T, int cn> struct SafeDiv;
template <typename T> struct SafeDiv<T, 1>
{
__device__ __forceinline__ static T op(T a, T b)
{
return b != 0 ? a / b : 0;
}
};
template <typename T> struct SafeDiv<T, 2>
{
S val;
__device__ __forceinline__ static T op(const T& a, const T& b)
{
T res;
__host__ explicit DivScalar(S val_) : val(val_) {}
res.x = b.x != 0 ? a.x / b.x : 0;
res.y = b.y != 0 ? a.y / b.y : 0;
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a / val);
return res;
}
};
template <typename T> struct SafeDiv<T, 3>
{
__device__ __forceinline__ static T op(const T& a, const T& b)
{
T res;
template <typename T, typename S, typename D> struct DivScalarInv : unary_function<T, D>
res.x = b.x != 0 ? a.x / b.x : 0;
res.y = b.y != 0 ? a.y / b.y : 0;
res.z = b.z != 0 ? a.z / b.z : 0;
return res;
}
};
template <typename T> struct SafeDiv<T, 4>
{
S val;
__device__ __forceinline__ static T op(const T& a, const T& b)
{
T res;
res.x = b.x != 0 ? a.x / b.x : 0;
res.y = b.y != 0 ? a.y / b.y : 0;
res.z = b.z != 0 ? a.z / b.z : 0;
res.w = b.w != 0 ? a.w / b.w : 0;
explicit DivScalarInv(S val_) : val(val_) {}
return res;
}
};
__device__ __forceinline__ D operator ()(T a) const
template <typename SrcType, typename ScalarType, typename DstType> struct DivScalarOp : unary_function<SrcType, DstType>
{
ScalarType val;
__device__ __forceinline__ DstType operator ()(SrcType a) const
{
return a != 0 ? saturate_cast<D>(val / a) : 0;
return saturate_cast<DstType>(SafeDiv<ScalarType, VecTraits<ScalarType>::cn>::op(saturate_cast<ScalarType>(a), val));
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <typename SrcType, typename ScalarType, typename DstType> struct DivScalarOpInv : unary_function<SrcType, DstType>
{
ScalarType val;
__device__ __forceinline__ DstType operator ()(SrcType a) const
{
return saturate_cast<DstType>(SafeDiv<ScalarType, VecTraits<ScalarType>::cn>::op(val, saturate_cast<ScalarType>(a)));
}
};
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalarInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream)
template <typename SrcType, typename ScalarDepth, typename DstType>
void divScalarImpl(const GpuMat& src, cv::Scalar value, bool inv, GpuMat& dst, Stream& stream)
{
typedef typename MakeVec<ScalarDepth, VecTraits<SrcType>::cn>::type ScalarType;
cv::Scalar_<ScalarDepth> value_ = value;
if (inv)
{
DivScalarInv<T, S, D> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
DivScalarOpInv<SrcType, ScalarType, DstType> op;
op.val = VecTraits<ScalarType>::make(value_.val);
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
}
else
{
DivScalar<T, S, D> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
DivScalarOp<SrcType, ScalarType, DstType> op;
op.val = VecTraits<ScalarType>::make(value_.val);
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
}
}
}
void divScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, Stream& stream);
static const func_t funcs[7][7][4] =
{
{
{divScalarImpl<uchar, float, uchar>, divScalarImpl<uchar2, float, uchar2>, divScalarImpl<uchar3, float, uchar3>, divScalarImpl<uchar4, float, uchar4>},
{divScalarImpl<uchar, float, schar>, divScalarImpl<uchar2, float, char2>, divScalarImpl<uchar3, float, char3>, divScalarImpl<uchar4, float, char4>},
{divScalarImpl<uchar, float, ushort>, divScalarImpl<uchar2, float, ushort2>, divScalarImpl<uchar3, float, ushort3>, divScalarImpl<uchar4, float, ushort4>},
{divScalarImpl<uchar, float, short>, divScalarImpl<uchar2, float, short2>, divScalarImpl<uchar3, float, short3>, divScalarImpl<uchar4, float, short4>},
{divScalarImpl<uchar, float, int>, divScalarImpl<uchar2, float, int2>, divScalarImpl<uchar3, float, int3>, divScalarImpl<uchar4, float, int4>},
{divScalarImpl<uchar, float, float>, divScalarImpl<uchar2, float, float2>, divScalarImpl<uchar3, float, float3>, divScalarImpl<uchar4, float, float4>},
{divScalarImpl<uchar, double, double>, divScalarImpl<uchar2, double, double2>, divScalarImpl<uchar3, double, double3>, divScalarImpl<uchar4, double, double4>}
},
{
{divScalarImpl<schar, float, uchar>, divScalarImpl<char2, float, uchar2>, divScalarImpl<char3, float, uchar3>, divScalarImpl<char4, float, uchar4>},
{divScalarImpl<schar, float, schar>, divScalarImpl<char2, float, char2>, divScalarImpl<char3, float, char3>, divScalarImpl<char4, float, char4>},
{divScalarImpl<schar, float, ushort>, divScalarImpl<char2, float, ushort2>, divScalarImpl<char3, float, ushort3>, divScalarImpl<char4, float, ushort4>},
{divScalarImpl<schar, float, short>, divScalarImpl<char2, float, short2>, divScalarImpl<char3, float, short3>, divScalarImpl<char4, float, short4>},
{divScalarImpl<schar, float, int>, divScalarImpl<char2, float, int2>, divScalarImpl<char3, float, int3>, divScalarImpl<char4, float, int4>},
{divScalarImpl<schar, float, float>, divScalarImpl<char2, float, float2>, divScalarImpl<char3, float, float3>, divScalarImpl<char4, float, float4>},
{divScalarImpl<schar, double, double>, divScalarImpl<char2, double, double2>, divScalarImpl<char3, double, double3>, divScalarImpl<char4, double, double4>}
},
{
{0 /*divScalarImpl<ushort, float, uchar>*/, 0 /*divScalarImpl<ushort2, float, uchar2>*/, 0 /*divScalarImpl<ushort3, float, uchar3>*/, 0 /*divScalarImpl<ushort4, float, uchar4>*/},
{0 /*divScalarImpl<ushort, float, schar>*/, 0 /*divScalarImpl<ushort2, float, char2>*/, 0 /*divScalarImpl<ushort3, float, char3>*/, 0 /*divScalarImpl<ushort4, float, char4>*/},
{divScalarImpl<ushort, float, ushort>, divScalarImpl<ushort2, float, ushort2>, divScalarImpl<ushort3, float, ushort3>, divScalarImpl<ushort4, float, ushort4>},
{divScalarImpl<ushort, float, short>, divScalarImpl<ushort2, float, short2>, divScalarImpl<ushort3, float, short3>, divScalarImpl<ushort4, float, short4>},
{divScalarImpl<ushort, float, int>, divScalarImpl<ushort2, float, int2>, divScalarImpl<ushort3, float, int3>, divScalarImpl<ushort4, float, int4>},
{divScalarImpl<ushort, float, float>, divScalarImpl<ushort2, float, float2>, divScalarImpl<ushort3, float, float3>, divScalarImpl<ushort4, float, float4>},
{divScalarImpl<ushort, double, double>, divScalarImpl<ushort2, double, double2>, divScalarImpl<ushort3, double, double3>, divScalarImpl<ushort4, double, double4>}
},
{
{0 /*divScalarImpl<short, float, uchar>*/, 0 /*divScalarImpl<short2, float, uchar2>*/, 0 /*divScalarImpl<short3, float, uchar3>*/, 0 /*divScalarImpl<short4, float, uchar4>*/},
{0 /*divScalarImpl<short, float, schar>*/, 0 /*divScalarImpl<short2, float, char2>*/, 0 /*divScalarImpl<short3, float, char3>*/, 0 /*divScalarImpl<short4, float, char4>*/},
{divScalarImpl<short, float, ushort>, divScalarImpl<short2, float, ushort2>, divScalarImpl<short3, float, ushort3>, divScalarImpl<short4, float, ushort4>},
{divScalarImpl<short, float, short>, divScalarImpl<short2, float, short2>, divScalarImpl<short3, float, short3>, divScalarImpl<short4, float, short4>},
{divScalarImpl<short, float, int>, divScalarImpl<short2, float, int2>, divScalarImpl<short3, float, int3>, divScalarImpl<short4, float, int4>},
{divScalarImpl<short, float, float>, divScalarImpl<short2, float, float2>, divScalarImpl<short3, float, float3>, divScalarImpl<short4, float, float4>},
{divScalarImpl<short, double, double>, divScalarImpl<short2, double, double2>, divScalarImpl<short3, double, double3>, divScalarImpl<short4, double, double4>}
},
{
{0 /*divScalarImpl<int, float, uchar>*/, 0 /*divScalarImpl<int2, float, uchar2>*/, 0 /*divScalarImpl<int3, float, uchar3>*/, 0 /*divScalarImpl<int4, float, uchar4>*/},
{0 /*divScalarImpl<int, float, schar>*/, 0 /*divScalarImpl<int2, float, char2>*/, 0 /*divScalarImpl<int3, float, char3>*/, 0 /*divScalarImpl<int4, float, char4>*/},
{0 /*divScalarImpl<int, float, ushort>*/, 0 /*divScalarImpl<int2, float, ushort2>*/, 0 /*divScalarImpl<int3, float, ushort3>*/, 0 /*divScalarImpl<int4, float, ushort4>*/},
{0 /*divScalarImpl<int, float, short>*/, 0 /*divScalarImpl<int2, float, short2>*/, 0 /*divScalarImpl<int3, float, short3>*/, 0 /*divScalarImpl<int4, float, short4>*/},
{divScalarImpl<int, float, int>, divScalarImpl<int2, float, int2>, divScalarImpl<int3, float, int3>, divScalarImpl<int4, float, int4>},
{divScalarImpl<int, float, float>, divScalarImpl<int2, float, float2>, divScalarImpl<int3, float, float3>, divScalarImpl<int4, float, float4>},
{divScalarImpl<int, double, double>, divScalarImpl<int2, double, double2>, divScalarImpl<int3, double, double3>, divScalarImpl<int4, double, double4>}
},
{
{0 /*divScalarImpl<float, float, uchar>*/, 0 /*divScalarImpl<float2, float, uchar2>*/, 0 /*divScalarImpl<float3, float, uchar3>*/, 0 /*divScalarImpl<float4, float, uchar4>*/},
{0 /*divScalarImpl<float, float, schar>*/, 0 /*divScalarImpl<float2, float, char2>*/, 0 /*divScalarImpl<float3, float, char3>*/, 0 /*divScalarImpl<float4, float, char4>*/},
{0 /*divScalarImpl<float, float, ushort>*/, 0 /*divScalarImpl<float2, float, ushort2>*/, 0 /*divScalarImpl<float3, float, ushort3>*/, 0 /*divScalarImpl<float4, float, ushort4>*/},
{0 /*divScalarImpl<float, float, short>*/, 0 /*divScalarImpl<float2, float, short2>*/, 0 /*divScalarImpl<float3, float, short3>*/, 0 /*divScalarImpl<float4, float, short4>*/},
{0 /*divScalarImpl<float, float, int>*/, 0 /*divScalarImpl<float2, float, int2>*/, 0 /*divScalarImpl<float3, float, int3>*/, 0 /*divScalarImpl<float4, float, int4>*/},
{divScalarImpl<float, float, float>, divScalarImpl<float2, float, float2>, divScalarImpl<float3, float, float3>, divScalarImpl<float4, float, float4>},
{divScalarImpl<float, double, double>, divScalarImpl<float2, double, double2>, divScalarImpl<float3, double, double3>, divScalarImpl<float4, double, double4>}
},
{
{0 /*divScalarImpl<double, double, uchar>*/, 0 /*divScalarImpl<double2, double, uchar2>*/, 0 /*divScalarImpl<double3, double, uchar3>*/, 0 /*divScalarImpl<double4, double, uchar4>*/},
{0 /*divScalarImpl<double, double, schar>*/, 0 /*divScalarImpl<double2, double, char2>*/, 0 /*divScalarImpl<double3, double, char3>*/, 0 /*divScalarImpl<double4, double, char4>*/},
{0 /*divScalarImpl<double, double, ushort>*/, 0 /*divScalarImpl<double2, double, ushort2>*/, 0 /*divScalarImpl<double3, double, ushort3>*/, 0 /*divScalarImpl<double4, double, ushort4>*/},
{0 /*divScalarImpl<double, double, short>*/, 0 /*divScalarImpl<double2, double, short2>*/, 0 /*divScalarImpl<double3, double, short3>*/, 0 /*divScalarImpl<double4, double, short4>*/},
{0 /*divScalarImpl<double, double, int>*/, 0 /*divScalarImpl<double2, double, int2>*/, 0 /*divScalarImpl<double3, double, int3>*/, 0 /*divScalarImpl<double4, double, int4>*/},
{0 /*divScalarImpl<double, double, float>*/, 0 /*divScalarImpl<double2, double, float2>*/, 0 /*divScalarImpl<double3, double, float3>*/, 0 /*divScalarImpl<double4, double, float4>*/},
{divScalarImpl<double, double, double>, divScalarImpl<double2, double, double2>, divScalarImpl<double3, double, double3>, divScalarImpl<double4, double, double4>}
}
};
const int sdepth = src.depth();
const int ddepth = dst.depth();
const int cn = src.channels();
CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F && cn <= 4 );
if (inv)
{
val[0] *= scale;
val[1] *= scale;
val[2] *= scale;
val[3] *= scale;
}
else
{
val[0] /= scale;
val[1] /= scale;
val[2] /= scale;
val[3] /= scale;
}
const func_t func = funcs[sdepth][ddepth][cn - 1];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
func(src, val, inv, dst, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,433 +40,61 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#ifndef HAVE_OPENCV_CUDEV
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
// Utility function to extract unsigned chars from an unsigned integer
__device__ uchar4 int_to_uchar4(unsigned int in)
{
uchar4 bytes;
bytes.x = (in & 0x000000ff) >> 0;
bytes.y = (in & 0x0000ff00) >> 8;
bytes.z = (in & 0x00ff0000) >> 16;
bytes.w = (in & 0xff000000) >> 24;
return bytes;
}
__global__ void shfl_integral_horizontal(const PtrStep<uint4> img, PtrStep<uint4> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ int sums[128];
const int id = threadIdx.x;
const int lane_id = id % warpSize;
const int warp_id = id / warpSize;
const uint4 data = img(blockIdx.x, id);
const uchar4 a = int_to_uchar4(data.x);
const uchar4 b = int_to_uchar4(data.y);
const uchar4 c = int_to_uchar4(data.z);
const uchar4 d = int_to_uchar4(data.w);
int result[16];
result[0] = a.x;
result[1] = result[0] + a.y;
result[2] = result[1] + a.z;
result[3] = result[2] + a.w;
result[4] = result[3] + b.x;
result[5] = result[4] + b.y;
result[6] = result[5] + b.z;
result[7] = result[6] + b.w;
result[8] = result[7] + c.x;
result[9] = result[8] + c.y;
result[10] = result[9] + c.z;
result[11] = result[10] + c.w;
result[12] = result[11] + d.x;
result[13] = result[12] + d.y;
result[14] = result[13] + d.z;
result[15] = result[14] + d.w;
int sum = result[15];
// the prefix sum for each thread's 16 value is computed,
// now the final sums (result[15]) need to be shared
// with the other threads and add. To do this,
// the __shfl_up() instruction is used and a shuffle scan
// operation is performed to distribute the sums to the correct
// threads
#pragma unroll
for (int i = 1; i < 32; i *= 2)
{
const int n = __shfl_up(sum, i, 32);
if (lane_id >= i)
{
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += n;
sum += n;
}
}
// Now the final sum for the warp must be shared
// between warps. This is done by each warp
// having a thread store to shared memory, then
// having some other warp load the values and
// compute a prefix sum, again by using __shfl_up.
// The results are uniformly added back to the warps.
// last thread in the warp holding sum of the warp
// places that in shared
if (threadIdx.x % warpSize == warpSize - 1)
sums[warp_id] = result[15];
__syncthreads();
if (warp_id == 0)
{
int warp_sum = sums[lane_id];
#pragma unroll
for (int i = 1; i <= 32; i *= 2)
{
const int n = __shfl_up(warp_sum, i, 32);
if (lane_id >= i)
warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
int blockSum = 0;
// fold in unused warp
if (warp_id > 0)
{
blockSum = sums[warp_id - 1];
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += blockSum;
}
// assemble result
// Each thread has 16 values to write, which are
// now integer data (to avoid overflow). Instead of
// each thread writing consecutive uint4s, the
// approach shown here experiments using
// the shuffle command to reformat the data
// inside the registers so that each thread holds
// consecutive data to be written so larger contiguous
// segments can be assembled for writing.
/*
For example data that needs to be written as
GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
but is stored in registers (r0..r3), in four threads (0..3) as:
threadId 0 1 2 3
r0 x0 y0 z0 w0
r1 x1 y1 z1 w1
r2 x2 y2 z2 w2
r3 x3 y3 z3 w3
after apply __shfl_xor operations to move data between registers r1..r3:
threadId 00 01 10 11
x0 y0 z0 w0
xor(01)->y1 x1 w1 z1
xor(10)->z2 w2 x2 y2
xor(11)->w3 z3 y3 x3
and now x0..x3, and z0..z3 can be written out in order by all threads.
In the current code, each register above is actually representing
four integers to be written as uint4's to GMEM.
*/
result[4] = __shfl_xor(result[4] , 1, 32);
result[5] = __shfl_xor(result[5] , 1, 32);
result[6] = __shfl_xor(result[6] , 1, 32);
result[7] = __shfl_xor(result[7] , 1, 32);
result[8] = __shfl_xor(result[8] , 2, 32);
result[9] = __shfl_xor(result[9] , 2, 32);
result[10] = __shfl_xor(result[10], 2, 32);
result[11] = __shfl_xor(result[11], 2, 32);
result[12] = __shfl_xor(result[12], 3, 32);
result[13] = __shfl_xor(result[13], 3, 32);
result[14] = __shfl_xor(result[14], 3, 32);
result[15] = __shfl_xor(result[15], 3, 32);
uint4* integral_row = integral.ptr(blockIdx.x);
uint4 output;
///////
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
#error "opencv_cudev is required"
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
#else
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
using namespace cv::cudev;
// continuning from the above example,
// this use of __shfl_xor() places the y0..y3 and w0..w3 data
// in order.
////////////////////////////////////////////////////////////////////////
// integral
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] = __shfl_xor(result[i], 1, 32);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
#endif
}
// This kernel computes columnwise prefix sums. When the data input is
// the row sums from above, this completes the integral image.
// The approach here is to have each block compute a local set of sums.
// First , the data covered by the block is loaded into shared memory,
// then instead of performing a sum in shared memory using __syncthreads
// between stages, the data is reformatted so that the necessary sums
// occur inside warps and the shuffle scan operation is used.
// The final set of sums from the block is then propgated, with the block
// computing "down" the image and adding the running sum to the local
// block sums.
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = integral.ptr(y) + tidx;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM
// shfl scan reduce the SMEM, reformating so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*p = sum;
}
#endif
}
void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
{
{
// each thread handles 16 values, use 1 block/row
// save, becouse step is actually can't be less 512 bytes
int block = integral.cols / 16;
// launch 1 block / row
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = buffer.ptr(y) + tidx;
unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& stream)
{
GpuMat src = _src.getGpuMat();
// place into SMEM
// shfl scan reduce the SMEM, reformating so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
CV_Assert( src.type() == CV_8UC1 );
int partial_sum = sums[k][j];
GpuMat_<int>& res = (GpuMat_<int>&) buffer;
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
gridIntegral(globPtr<uchar>(src), res, stream);
if (lane_id >= i)
partial_sum += n;
}
_dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
GpuMat dst = _dst.getGpuMat();
sums[k][j] = partial_sum;
__syncthreads();
dst.setTo(Scalar::all(0), stream);
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
res.copyTo(inner, stream);
}
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
//////////////////////////////////////////////////////////////////////////////
// sqrIntegral
__syncthreads();
void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& stream)
{
GpuMat src = _src.getGpuMat();
*dst = sum;
}
#endif
}
CV_Assert( src.type() == CV_8UC1 );
// used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
int blockStep, cudaStream_t stream)
{
{
const int block = blockStep;
const int grid = img.rows;
GpuMat_<double>& res = (GpuMat_<double>&) buf;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
gridIntegral(sqr_(cvt_<int>(globPtr<uchar>(src))), res, stream);
shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
cudaSafeCall( cudaGetLastError() );
}
_dst.create(src.rows + 1, src.cols + 1, CV_64FC1);
GpuMat dst = _dst.getGpuMat();
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
dst.setTo(Scalar::all(0), stream);
shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
cudaSafeCall( cudaGetLastError() );
}
}
}
}}}
GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
res.copyTo(inner, stream);
}
#endif /* CUDA_DISABLER */
#endif

@ -0,0 +1,207 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/opencv_modules.hpp"
#ifndef HAVE_OPENCV_CUDEV
#error "opencv_cudev is required"
#else
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
using namespace cv;
using namespace cv::cudev;
namespace
{
texture<uchar, cudaTextureType1D, cudaReadModeElementType> texLutTable;
class LookUpTableImpl : public LookUpTable
{
public:
LookUpTableImpl(InputArray lut);
~LookUpTableImpl();
void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
private:
GpuMat d_lut;
cudaTextureObject_t texLutTableObj;
bool cc30;
};
LookUpTableImpl::LookUpTableImpl(InputArray _lut)
{
if (_lut.kind() == _InputArray::GPU_MAT)
{
d_lut = _lut.getGpuMat();
}
else
{
Mat h_lut = _lut.getMat();
d_lut.upload(Mat(1, 256, h_lut.type(), h_lut.data));
}
CV_Assert( d_lut.depth() == CV_8U );
CV_Assert( d_lut.rows == 1 && d_lut.cols == 256 );
cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
if (cc30)
{
// Use the texture object
cudaResourceDesc texRes;
std::memset(&texRes, 0, sizeof(texRes));
texRes.resType = cudaResourceTypeLinear;
texRes.res.linear.devPtr = d_lut.data;
texRes.res.linear.desc = cudaCreateChannelDesc<uchar>();
texRes.res.linear.sizeInBytes = 256 * d_lut.channels() * sizeof(uchar);
cudaTextureDesc texDescr;
std::memset(&texDescr, 0, sizeof(texDescr));
CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&texLutTableObj, &texRes, &texDescr, 0) );
}
else
{
// Use the texture reference
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();
CV_CUDEV_SAFE_CALL( cudaBindTexture(0, &texLutTable, d_lut.data, &desc) );
}
}
LookUpTableImpl::~LookUpTableImpl()
{
if (cc30)
{
// Use the texture object
cudaDestroyTextureObject(texLutTableObj);
}
else
{
// Use the texture reference
cudaUnbindTexture(texLutTable);
}
}
struct LutTablePtrC1
{
typedef uchar value_type;
typedef uchar index_type;
cudaTextureObject_t texLutTableObj;
__device__ __forceinline__ uchar operator ()(uchar, uchar x) const
{
#if CV_CUDEV_ARCH < 300
// Use the texture reference
return tex1Dfetch(texLutTable, x);
#else
// Use the texture object
return tex1Dfetch<uchar>(texLutTableObj, x);
#endif
}
};
struct LutTablePtrC3
{
typedef uchar3 value_type;
typedef uchar3 index_type;
cudaTextureObject_t texLutTableObj;
__device__ __forceinline__ uchar3 operator ()(const uchar3&, const uchar3& x) const
{
#if CV_CUDEV_ARCH < 300
// Use the texture reference
return make_uchar3(tex1Dfetch(texLutTable, x.x * 3), tex1Dfetch(texLutTable, x.y * 3 + 1), tex1Dfetch(texLutTable, x.z * 3 + 2));
#else
// Use the texture object
return make_uchar3(tex1Dfetch<uchar>(texLutTableObj, x.x * 3), tex1Dfetch<uchar>(texLutTableObj, x.y * 3 + 1), tex1Dfetch<uchar>(texLutTableObj, x.z * 3 + 2));
#endif
}
};
void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
const int cn = src.channels();
const int lut_cn = d_lut.channels();
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut_cn == 1 || lut_cn == cn );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
if (lut_cn == 1)
{
GpuMat_<uchar> src1(src.reshape(1));
GpuMat_<uchar> dst1(dst.reshape(1));
LutTablePtrC1 tbl;
tbl.texLutTableObj = texLutTableObj;
dst1.assign(lut_(src1, tbl), stream);
}
else if (lut_cn == 3)
{
GpuMat_<uchar3>& src3 = (GpuMat_<uchar3>&) src;
GpuMat_<uchar3>& dst3 = (GpuMat_<uchar3>&) dst;
LutTablePtrC3 tbl;
tbl.texLutTableObj = texLutTableObj;
dst3.assign(lut_(src3, tbl), stream);
}
}
}
Ptr<LookUpTable> cv::cuda::createLookUpTable(InputArray lut)
{
return makePtr<LookUpTableImpl>(lut);
}
#endif

@ -40,196 +40,248 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/type_traits.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
//////////////////////////////////////////////////////////////////////////
// absMat
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
namespace cv { namespace cuda { namespace device
using namespace cv::cudev;
namespace
{
template <typename T> struct TransformFunctorTraits< abs_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
}}}
}
//////////////////////////////////////////////////////////////////////////////
/// abs
namespace arithm
namespace
{
template <typename T>
void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
void absMat(const GpuMat& src, const GpuMat& dst, Stream& stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream);
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), abs_func<T>(), stream);
}
}
void cv::cuda::abs(InputArray _src, OutputArray _dst, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
absMat<uchar>,
absMat<schar>,
absMat<ushort>,
absMat<short>,
absMat<int>,
absMat<float>,
absMat<double>
};
template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
GpuMat src = _src.getGpuMat();
const int depth = src.depth();
CV_DbgAssert( depth <= CV_64F );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[depth](src.reshape(1), dst.reshape(1), stream);
}
//////////////////////////////////////////////////////////////////////////
// sqrMat
//////////////////////////////////////////////////////////////////////////////
/// sqr
namespace arithm
namespace
{
template <typename T> struct Sqr : unary_function<T, T>
template <typename T> struct SqrOp : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
return saturate_cast<T>(x * x);
}
__host__ __device__ __forceinline__ Sqr() {}
__host__ __device__ __forceinline__ Sqr(const Sqr&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::Sqr<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
void sqrMat(const GpuMat& src, const GpuMat& dst, Stream& stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream);
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), SqrOp<T>(), stream);
}
template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// sqrtMat
namespace cv { namespace cuda { namespace device
void cv::cuda::sqr(InputArray _src, OutputArray _dst, Stream& stream)
{
template <typename T> struct TransformFunctorTraits< sqrt_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
sqrMat<uchar>,
sqrMat<schar>,
sqrMat<ushort>,
sqrMat<short>,
sqrMat<int>,
sqrMat<float>,
sqrMat<double>
};
}}}
namespace arithm
GpuMat src = _src.getGpuMat();
const int depth = src.depth();
CV_DbgAssert( depth <= CV_64F );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[depth](src.reshape(1), dst.reshape(1), stream);
}
//////////////////////////////////////////////////////////////////////////////
/// sqrt
namespace
{
template <typename T>
void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
void sqrtMat(const GpuMat& src, const GpuMat& dst, Stream& stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream);
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), sqrt_func<T>(), stream);
}
template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// logMat
namespace cv { namespace cuda { namespace device
void cv::cuda::sqrt(InputArray _src, OutputArray _dst, Stream& stream)
{
template <typename T> struct TransformFunctorTraits< log_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
sqrtMat<uchar>,
sqrtMat<schar>,
sqrtMat<ushort>,
sqrtMat<short>,
sqrtMat<int>,
sqrtMat<float>,
sqrtMat<double>
};
}}}
namespace arithm
{
template <typename T>
void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream);
}
GpuMat src = _src.getGpuMat();
const int depth = src.depth();
CV_DbgAssert( depth <= CV_64F );
template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[depth](src.reshape(1), dst.reshape(1), stream);
}
//////////////////////////////////////////////////////////////////////////
// expMat
////////////////////////////////////////////////////////////////////////
/// exp
namespace arithm
namespace
{
template <typename T> struct Exp : unary_function<T, T>
template <typename T> struct ExpOp : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
exp_func<T> f;
return saturate_cast<T>(f(x));
}
__host__ __device__ __forceinline__ Exp() {}
__host__ __device__ __forceinline__ Exp(const Exp&) {}
};
template <typename T>
void expMat(const GpuMat& src, const GpuMat& dst, Stream& stream)
{
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), ExpOp<T>(), stream);
}
}
namespace cv { namespace cuda { namespace device
void cv::cuda::exp(InputArray _src, OutputArray _dst, Stream& stream)
{
template <typename T> struct TransformFunctorTraits< arithm::Exp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
expMat<uchar>,
expMat<schar>,
expMat<ushort>,
expMat<short>,
expMat<int>,
expMat<float>,
expMat<double>
};
}}}
namespace arithm
GpuMat src = _src.getGpuMat();
const int depth = src.depth();
CV_DbgAssert( depth <= CV_64F );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[depth](src.reshape(1), dst.reshape(1), stream);
}
////////////////////////////////////////////////////////////////////////
// log
namespace
{
template <typename T>
void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
void logMat(const GpuMat& src, const GpuMat& dst, Stream& stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream);
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), log_func<T>(), stream);
}
}
void cv::cuda::log(InputArray _src, OutputArray _dst, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
logMat<uchar>,
logMat<schar>,
logMat<ushort>,
logMat<short>,
logMat<int>,
logMat<float>,
logMat<double>
};
GpuMat src = _src.getGpuMat();
template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
const int depth = src.depth();
CV_DbgAssert( depth <= CV_64F );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[depth](src.reshape(1), dst.reshape(1), stream);
}
//////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// pow
namespace arithm
namespace
{
template<typename T, bool Signed = numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
{
float power;
__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ T operator()(T e) const
{
return saturate_cast<T>(__powf((float)e, power));
@ -239,8 +291,6 @@ namespace arithm
{
float power;
__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ T operator()(T e) const
{
T res = saturate_cast<T>(__powf((float)e, power));
@ -255,8 +305,6 @@ namespace arithm
{
float power;
__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ float operator()(float e) const
{
return __powf(::fabs(e), power);
@ -266,37 +314,46 @@ namespace arithm
{
double power;
__host__ explicit PowOp(double power_) : power(power_) {}
__device__ __forceinline__ double operator()(double e) const
{
return ::pow(::fabs(e), power);
}
};
template<typename T>
void powMat(const GpuMat& src, double power, const GpuMat& dst, Stream& stream)
{
PowOp<T> op;
op.power = static_cast<typename LargerType<T, float>::type>(power);
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), op, stream);
}
}
namespace cv { namespace cuda { namespace device
void cv::cuda::pow(InputArray _src, double power, OutputArray _dst, Stream& stream)
{
template <typename T> struct TransformFunctorTraits< arithm::PowOp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
typedef void (*func_t)(const GpuMat& src, double power, const GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
powMat<uchar>,
powMat<schar>,
powMat<ushort>,
powMat<short>,
powMat<int>,
powMat<float>,
powMat<double>
};
}}}
namespace arithm
{
template<typename T>
void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream);
}
GpuMat src = _src.getGpuMat();
const int depth = src.depth();
CV_DbgAssert(depth <= CV_64F);
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
funcs[depth](src.reshape(1), power, dst.reshape(1), stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,208 +40,72 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
#ifndef HAVE_OPENCV_CUDEV
using namespace cv::cuda;
using namespace cv::cuda::device;
#error "opencv_cudev is required"
namespace minMax
{
__device__ unsigned int blocks_finished = 0;
// To avoid shared bank conflicts we convert each value into value of
// appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits;
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
template <int BLOCK_SIZE, typename R>
struct GlobalReduce
{
static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval)
{
#if __CUDA_ARCH__ >= 200
if (tid == 0)
{
Emulation::glob::atomicMin(minval, mymin);
Emulation::glob::atomicMax(maxval, mymax);
}
#else
__shared__ bool is_last;
if (tid == 0)
{
minval[bid] = mymin;
maxval[bid] = mymax;
__threadfence();
unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
int idx = ::min(tid, gridDim.x * gridDim.y - 1);
mymin = minval[idx];
mymax = maxval[idx];
const minimum<R> minOp;
const maximum<R> maxOp;
device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
if (tid == 0)
{
minval[0] = mymin;
maxval[0] = mymax;
blocks_finished = 0;
}
}
#endif
}
};
template <int BLOCK_SIZE, typename T, typename R, class Mask>
__global__ void kernel(const PtrStepSz<T> src, const Mask mask, R* minval, R* maxval, const int twidth, const int theight)
{
__shared__ R sminval[BLOCK_SIZE];
__shared__ R smaxval[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
R mymin = numeric_limits<R>::max();
R mymax = -numeric_limits<R>::max();
const minimum<R> minOp;
const maximum<R> maxOp;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
#else
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const R srcVal = ptr[x];
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
mymin = minOp(mymin, srcVal);
mymax = maxOp(mymax, srcVal);
}
}
}
using namespace cv::cudev;
device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
GlobalReduce<BLOCK_SIZE, R>::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval);
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
namespace
{
template <typename T>
void minMaxImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal)
{
block = dim3(threads_x, threads_y);
typedef typename SelectIf<
TypesEquals<T, double>::value,
double,
typename SelectIf<TypesEquals<T, float>::value, float, int>::type
>::type work_type;
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<work_type>& buf = (GpuMat_<work_type>&) _buf;
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
if (mask.empty())
gridFindMinMaxVal(src, buf);
else
gridFindMinMaxVal(src, buf, globPtr<uchar>(mask));
void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
work_type data[2];
buf.download(cv::Mat(1, 2, buf.type(), data));
bufcols = grid.x * grid.y * sizeof(double);
bufrows = 2;
}
if (minVal)
*minVal = data[0];
__global__ void setDefaultKernel(int* minval_buf, int* maxval_buf)
{
*minval_buf = numeric_limits<int>::max();
*maxval_buf = numeric_limits<int>::min();
}
__global__ void setDefaultKernel(float* minval_buf, float* maxval_buf)
{
*minval_buf = numeric_limits<float>::max();
*maxval_buf = -numeric_limits<float>::max();
}
__global__ void setDefaultKernel(double* minval_buf, double* maxval_buf)
{
*minval_buf = numeric_limits<double>::max();
*maxval_buf = -numeric_limits<double>::max();
}
template <typename R>
void setDefault(R* minval_buf, R* maxval_buf)
{
setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf);
if (maxVal)
*maxVal = data[1];
}
}
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
{
typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal);
static const func_t funcs[] =
{
typedef typename MinMaxTypeTraits<T>::best_type R;
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
R* minval_buf = (R*) buf.ptr(0);
R* maxval_buf = (R*) buf.ptr(1);
setDefault(minval_buf, maxval_buf);
if (mask.data)
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, twidth, theight);
else
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, twidth, theight);
minMaxImpl<uchar>,
minMaxImpl<schar>,
minMaxImpl<ushort>,
minMaxImpl<short>,
minMaxImpl<int>,
minMaxImpl<float>,
minMaxImpl<double>
};
cudaSafeCall( cudaGetLastError() );
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
cudaSafeCall( cudaDeviceSynchronize() );
CV_Assert( src.channels() == 1 );
CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
R minval_, maxval_;
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
const func_t func = funcs[src.depth()];
template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
func(src, mask, buf, minVal, maxVal);
}
#endif // CUDA_DISABLER
#endif

@ -40,189 +40,204 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
//////////////////////////////////////////////////////////////////////////
// min
#include "opencv2/cudev.hpp"
namespace arithm
using namespace cv::cudev;
void minMaxMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int op);
void minMaxScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int op);
///////////////////////////////////////////////////////////////////////
/// minMaxMat
namespace
{
struct VMin4 : binary_function<uint, uint, uint>
template <template <typename> class Op, typename T>
void minMaxMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<T>(dst), Op<T>(), stream);
}
struct MinOp2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmin4(a, b);
return vmin2(a, b);
}
__host__ __device__ __forceinline__ VMin4() {}
__host__ __device__ __forceinline__ VMin4(const VMin4&) {}
};
struct VMin2 : binary_function<uint, uint, uint>
struct MaxOp2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmin2(a, b);
return vmax2(a, b);
}
__host__ __device__ __forceinline__ VMin2() {}
__host__ __device__ __forceinline__ VMin2(const VMin2&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VMin4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
template <class Op2>
void minMaxMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
};
const int vcols = src1.cols >> 1;
template <> struct TransformFunctorTraits< arithm::VMin2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
template <typename T> struct TransformFunctorTraits< minimum<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
gridTransformBinary(src1_, src2_, dst_, Op2(), stream);
}
template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
struct MinOp4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmin4(a, b);
}
};
}}}
namespace arithm
{
void minMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
}
void minMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
struct MaxOp4 : binary_function<uint, uint, uint>
{
device::transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
}
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmax4(a, b);
}
};
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
template <class Op4>
void minMaxMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
}
const int vcols = src1.cols >> 2;
template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
gridTransformBinary(src1_, src2_, dst_, Op4(), stream);
}
template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// max
namespace arithm
void minMaxMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int op)
{
struct VMax4 : binary_function<uint, uint, uint>
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
static const func_t funcs_v1[2][7] =
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmax4(a, b);
}
__host__ __device__ __forceinline__ VMax4() {}
__host__ __device__ __forceinline__ VMax4(const VMax4&) {}
};
struct VMax2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
minMaxMat_v1<minimum, uchar>,
minMaxMat_v1<minimum, schar>,
minMaxMat_v1<minimum, ushort>,
minMaxMat_v1<minimum, short>,
minMaxMat_v1<minimum, int>,
minMaxMat_v1<minimum, float>,
minMaxMat_v1<minimum, double>
},
{
return vmax2(a, b);
minMaxMat_v1<maximum, uchar>,
minMaxMat_v1<maximum, schar>,
minMaxMat_v1<maximum, ushort>,
minMaxMat_v1<maximum, short>,
minMaxMat_v1<maximum, int>,
minMaxMat_v1<maximum, float>,
minMaxMat_v1<maximum, double>
}
__host__ __device__ __forceinline__ VMax2() {}
__host__ __device__ __forceinline__ VMax2(const VMax2&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VMax4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
static const func_t funcs_v2[2] =
{
minMaxMat_v2<MinOp2>, minMaxMat_v2<MaxOp2>
};
template <> struct TransformFunctorTraits< arithm::VMax2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
static const func_t funcs_v4[2] =
{
minMaxMat_v4<MinOp4>, minMaxMat_v4<MaxOp4>
};
template <typename T> struct TransformFunctorTraits< maximum<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
const int depth = src1.depth();
template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
CV_DbgAssert( depth <= CV_64F );
namespace arithm
{
void maxMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);
if (depth == CV_8U || depth == CV_16U)
{
device::transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (isAllAligned)
{
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
funcs_v4[op](src1_, src2_, dst_, stream);
return;
}
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
funcs_v2[op](src1_, src2_, dst_, stream);
return;
}
}
}
void maxMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
const func_t func = funcs_v1[op][depth];
func(src1_, src2_, dst_, stream);
}
///////////////////////////////////////////////////////////////////////
/// minMaxScalar
namespace
{
template <template <typename> class Op, typename T>
void minMaxScalar(const GpuMat& src, double value, GpuMat& dst, Stream& stream)
{
device::transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
gridTransformUnary(globPtr<T>(src), globPtr<T>(dst), bind2nd(Op<T>(), cv::saturate_cast<T>(value)), stream);
}
}
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void minMaxScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int op)
{
typedef void (*func_t)(const GpuMat& src, double value, GpuMat& dst, Stream& stream);
static const func_t funcs[2][7] =
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
}
{
minMaxScalar<minimum, uchar>,
minMaxScalar<minimum, schar>,
minMaxScalar<minimum, ushort>,
minMaxScalar<minimum, short>,
minMaxScalar<minimum, int>,
minMaxScalar<minimum, float>,
minMaxScalar<minimum, double>
},
{
minMaxScalar<maximum, uchar>,
minMaxScalar<maximum, schar>,
minMaxScalar<maximum, ushort>,
minMaxScalar<maximum, short>,
minMaxScalar<maximum, int>,
minMaxScalar<maximum, float>,
minMaxScalar<maximum, double>
}
};
template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
const int depth = src.depth();
template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
}
CV_DbgAssert( depth <= CV_64F );
CV_DbgAssert( src.channels() == 1 );
template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
funcs[op][depth](src, value[0], dst, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,197 +40,88 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
#ifndef HAVE_OPENCV_CUDEV
using namespace cv::cuda;
using namespace cv::cuda::device;
#error "opencv_cudev is required"
namespace minMaxLoc
{
// To avoid shared bank conflicts we convert each value into value of
// appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits;
template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<signed char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
template <int BLOCK_SIZE, typename T, class Mask>
__global__ void kernel_pass_1(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
__shared__ work_type sminval[BLOCK_SIZE];
__shared__ work_type smaxval[BLOCK_SIZE];
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
#else
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
using namespace cv::cudev;
work_type mymin = numeric_limits<work_type>::max();
work_type mymax = -numeric_limits<work_type>::max();
unsigned int myminloc = 0;
unsigned int mymaxloc = 0;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const work_type srcVal = ptr[x];
if (srcVal < mymin)
{
mymin = srcVal;
myminloc = y * src.cols + x;
}
if (srcVal > mymax)
{
mymax = srcVal;
mymaxloc = y * src.cols + x;
}
}
}
}
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
tid,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
if (tid == 0)
{
minval[bid] = (T) mymin;
maxval[bid] = (T) mymax;
minloc[bid] = myminloc;
maxloc[bid] = mymaxloc;
}
}
template <int BLOCK_SIZE, typename T>
__global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count)
namespace
{
template <typename T>
void minMaxLocImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc)
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
typedef typename SelectIf<
TypesEquals<T, double>::value,
double,
typename SelectIf<TypesEquals<T, float>::value, float, int>::type
>::type work_type;
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<work_type>& valBuf = (GpuMat_<work_type>&) _valBuf;
GpuMat_<int>& locBuf = (GpuMat_<int>&) _locBuf;
if (mask.empty())
gridMinMaxLoc(src, valBuf, locBuf);
else
gridMinMaxLoc(src, valBuf, locBuf, globPtr<uchar>(mask));
__shared__ work_type sminval[BLOCK_SIZE];
__shared__ work_type smaxval[BLOCK_SIZE];
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
cv::Mat_<work_type> h_valBuf;
cv::Mat_<int> h_locBuf;
unsigned int idx = ::min(threadIdx.x, count - 1);
valBuf.download(h_valBuf);
locBuf.download(h_locBuf);
work_type mymin = minval[idx];
work_type mymax = maxval[idx];
unsigned int myminloc = minloc[idx];
unsigned int mymaxloc = maxloc[idx];
if (minVal)
*minVal = h_valBuf(0, 0);
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
threadIdx.x,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
if (maxVal)
*maxVal = h_valBuf(1, 0);
if (threadIdx.x == 0)
if (minLoc)
{
minval[0] = (T) mymin;
maxval[0] = (T) mymax;
minloc[0] = myminloc;
maxloc[0] = mymaxloc;
const int idx = h_locBuf(0, 0);
*minLoc = cv::Point(idx % src.cols, idx / src.cols);
}
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
// For values
b1cols = (int)(grid.x * grid.y * elem_size);
b1rows = 2;
// For locations
b2cols = grid.x * grid.y * sizeof(int);
b2rows = 2;
if (maxLoc)
{
const int idx = h_locBuf(1, 0);
*maxLoc = cv::Point(idx % src.cols, idx / src.cols);
}
}
}
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf)
void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
{
typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc);
static const func_t funcs[] =
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
minMaxLocImpl<uchar>,
minMaxLocImpl<schar>,
minMaxLocImpl<ushort>,
minMaxLocImpl<short>,
minMaxLocImpl<int>,
minMaxLocImpl<float>,
minMaxLocImpl<double>
};
T* minval_buf = (T*) valbuf.ptr(0);
T* maxval_buf = (T*) valbuf.ptr(1);
unsigned int* minloc_buf = locbuf.ptr(0);
unsigned int* maxloc_buf = locbuf.ptr(1);
if (mask.data)
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
else
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
cudaSafeCall( cudaGetLastError() );
CV_Assert( src.channels() == 1 );
CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
kernel_pass_2<threads_x * threads_y><<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
unsigned int minloc_, maxloc_;
cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
const func_t func = funcs[src.depth()];
template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
func(src, mask, valBuf, locBuf, minVal, maxVal, minLoc, maxLoc);
}
#endif // CUDA_DISABLER
#endif

@ -40,172 +40,185 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
{
struct Mul_8uc4_32f : binary_function<uint, float, uint>
{
__device__ __forceinline__ uint operator ()(uint a, float b) const
{
uint res = 0;
res |= (saturate_cast<uchar>((0xffu & (a )) * b) );
res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
return res;
}
__host__ __device__ __forceinline__ Mul_8uc4_32f() {}
__host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {}
};
#include "opencv2/cudev.hpp"
struct Mul_16sc4_32f : binary_function<short4, float, short4>
{
__device__ __forceinline__ short4 operator ()(short4 a, float b) const
{
return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),
saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
}
using namespace cv::cudev;
__host__ __device__ __forceinline__ Mul_16sc4_32f() {}
__host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {}
};
void mulMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int);
void mulMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
void mulMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
template <typename T, typename D> struct Mul : binary_function<T, T, D>
namespace
{
template <typename T, typename D> struct MulOp : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a * b);
}
__host__ __device__ __forceinline__ Mul() {}
__host__ __device__ __forceinline__ Mul(const Mul&) {}
};
template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
template <typename T, typename S, typename D> struct MulScaleOp : binary_function<T, T, D>
{
S scale;
__host__ explicit MulScale(S scale_) : scale(scale_) {}
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(scale * a * b);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits<arithm::Mul_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::Mul<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::MulScale<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
}}}
namespace arithm
{
void mulMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream);
}
void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream);
}
template <typename T, typename S, typename D>
void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
void mulMatImpl(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream)
{
if (scale == 1)
{
Mul<T, D> op;
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
MulOp<T, D> op;
gridTransformBinary_< TransformPolicy<S> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), op, stream);
}
else
{
MulScale<T, S, D> op(static_cast<S>(scale));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
MulScaleOp<T, S, D> op;
op.scale = static_cast<S>(scale);
gridTransformBinary_< TransformPolicy<S> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), op, stream);
}
}
}
void mulMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream);
static const func_t funcs[7][7] =
{
{
mulMatImpl<uchar, float, uchar>,
mulMatImpl<uchar, float, schar>,
mulMatImpl<uchar, float, ushort>,
mulMatImpl<uchar, float, short>,
mulMatImpl<uchar, float, int>,
mulMatImpl<uchar, float, float>,
mulMatImpl<uchar, double, double>
},
{
mulMatImpl<schar, float, uchar>,
mulMatImpl<schar, float, schar>,
mulMatImpl<schar, float, ushort>,
mulMatImpl<schar, float, short>,
mulMatImpl<schar, float, int>,
mulMatImpl<schar, float, float>,
mulMatImpl<schar, double, double>
},
{
0 /*mulMatImpl<ushort, float, uchar>*/,
0 /*mulMatImpl<ushort, float, schar>*/,
mulMatImpl<ushort, float, ushort>,
mulMatImpl<ushort, float, short>,
mulMatImpl<ushort, float, int>,
mulMatImpl<ushort, float, float>,
mulMatImpl<ushort, double, double>
},
{
0 /*mulMatImpl<short, float, uchar>*/,
0 /*mulMatImpl<short, float, schar>*/,
mulMatImpl<short, float, ushort>,
mulMatImpl<short, float, short>,
mulMatImpl<short, float, int>,
mulMatImpl<short, float, float>,
mulMatImpl<short, double, double>
},
{
0 /*mulMatImpl<int, float, uchar>*/,
0 /*mulMatImpl<int, float, schar>*/,
0 /*mulMatImpl<int, float, ushort>*/,
0 /*mulMatImpl<int, float, short>*/,
mulMatImpl<int, float, int>,
mulMatImpl<int, float, float>,
mulMatImpl<int, double, double>
},
{
0 /*mulMatImpl<float, float, uchar>*/,
0 /*mulMatImpl<float, float, schar>*/,
0 /*mulMatImpl<float, float, ushort>*/,
0 /*mulMatImpl<float, float, short>*/,
0 /*mulMatImpl<float, float, int>*/,
mulMatImpl<float, float, float>,
mulMatImpl<float, double, double>
},
{
0 /*mulMatImpl<double, double, uchar>*/,
0 /*mulMatImpl<double, double, schar>*/,
0 /*mulMatImpl<double, double, ushort>*/,
0 /*mulMatImpl<double, double, short>*/,
0 /*mulMatImpl<double, double, int>*/,
0 /*mulMatImpl<double, double, float>*/,
mulMatImpl<double, double, double>
}
};
const int sdepth = src1.depth();
const int ddepth = dst.depth();
CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F );
GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);
const func_t func = funcs[sdepth][ddepth];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
func(src1_, src2_, dst_, scale, stream);
}
namespace
{
template <typename T>
struct MulOpSpecial : binary_function<T, float, T>
{
__device__ __forceinline__ T operator ()(const T& a, float b) const
{
typedef typename VecTraits<T>::elem_type elem_type;
T res;
res.x = saturate_cast<elem_type>(a.x * b);
res.y = saturate_cast<elem_type>(a.y * b);
res.z = saturate_cast<elem_type>(a.z * b);
res.w = saturate_cast<elem_type>(a.w * b);
return res;
}
};
}
template void mulMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
void mulMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
gridTransformBinary(globPtr<uchar4>(src1), globPtr<float>(src2), globPtr<uchar4>(dst), MulOpSpecial<uchar4>(), stream);
}
void mulMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
gridTransformBinary(globPtr<short4>(src1), globPtr<float>(src2), globPtr<short4>(dst), MulOpSpecial<short4>(), stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,105 +40,143 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void mulScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int);
namespace
{
template <typename T, typename S, typename D> struct MulScalar : unary_function<T, D>
template <typename SrcType, typename ScalarType, typename DstType> struct MulScalarOp : unary_function<SrcType, DstType>
{
S val;
__host__ explicit MulScalar(S val_) : val(val_) {}
ScalarType val;
__device__ __forceinline__ D operator ()(T a) const
__device__ __forceinline__ DstType operator ()(SrcType a) const
{
return saturate_cast<D>(a * val);
return saturate_cast<DstType>(saturate_cast<ScalarType>(a) * val);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::MulScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
template <typename SrcType, typename ScalarDepth, typename DstType>
void mulScalarImpl(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
{
MulScalar<T, S, D> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
typedef typename MakeVec<ScalarDepth, VecTraits<SrcType>::cn>::type ScalarType;
cv::Scalar_<ScalarDepth> value_ = value;
MulScalarOp<SrcType, ScalarType, DstType> op;
op.val = VecTraits<ScalarType>::make(value_.val);
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
}
}
void mulScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src, cv::Scalar val, GpuMat& dst, Stream& stream);
static const func_t funcs[7][7][4] =
{
{
{mulScalarImpl<uchar, float, uchar>, mulScalarImpl<uchar2, float, uchar2>, mulScalarImpl<uchar3, float, uchar3>, mulScalarImpl<uchar4, float, uchar4>},
{mulScalarImpl<uchar, float, schar>, mulScalarImpl<uchar2, float, char2>, mulScalarImpl<uchar3, float, char3>, mulScalarImpl<uchar4, float, char4>},
{mulScalarImpl<uchar, float, ushort>, mulScalarImpl<uchar2, float, ushort2>, mulScalarImpl<uchar3, float, ushort3>, mulScalarImpl<uchar4, float, ushort4>},
{mulScalarImpl<uchar, float, short>, mulScalarImpl<uchar2, float, short2>, mulScalarImpl<uchar3, float, short3>, mulScalarImpl<uchar4, float, short4>},
{mulScalarImpl<uchar, float, int>, mulScalarImpl<uchar2, float, int2>, mulScalarImpl<uchar3, float, int3>, mulScalarImpl<uchar4, float, int4>},
{mulScalarImpl<uchar, float, float>, mulScalarImpl<uchar2, float, float2>, mulScalarImpl<uchar3, float, float3>, mulScalarImpl<uchar4, float, float4>},
{mulScalarImpl<uchar, double, double>, mulScalarImpl<uchar2, double, double2>, mulScalarImpl<uchar3, double, double3>, mulScalarImpl<uchar4, double, double4>}
},
{
{mulScalarImpl<schar, float, uchar>, mulScalarImpl<char2, float, uchar2>, mulScalarImpl<char3, float, uchar3>, mulScalarImpl<char4, float, uchar4>},
{mulScalarImpl<schar, float, schar>, mulScalarImpl<char2, float, char2>, mulScalarImpl<char3, float, char3>, mulScalarImpl<char4, float, char4>},
{mulScalarImpl<schar, float, ushort>, mulScalarImpl<char2, float, ushort2>, mulScalarImpl<char3, float, ushort3>, mulScalarImpl<char4, float, ushort4>},
{mulScalarImpl<schar, float, short>, mulScalarImpl<char2, float, short2>, mulScalarImpl<char3, float, short3>, mulScalarImpl<char4, float, short4>},
{mulScalarImpl<schar, float, int>, mulScalarImpl<char2, float, int2>, mulScalarImpl<char3, float, int3>, mulScalarImpl<char4, float, int4>},
{mulScalarImpl<schar, float, float>, mulScalarImpl<char2, float, float2>, mulScalarImpl<char3, float, float3>, mulScalarImpl<char4, float, float4>},
{mulScalarImpl<schar, double, double>, mulScalarImpl<char2, double, double2>, mulScalarImpl<char3, double, double3>, mulScalarImpl<char4, double, double4>}
},
{
{0 /*mulScalarImpl<ushort, float, uchar>*/, 0 /*mulScalarImpl<ushort2, float, uchar2>*/, 0 /*mulScalarImpl<ushort3, float, uchar3>*/, 0 /*mulScalarImpl<ushort4, float, uchar4>*/},
{0 /*mulScalarImpl<ushort, float, schar>*/, 0 /*mulScalarImpl<ushort2, float, char2>*/, 0 /*mulScalarImpl<ushort3, float, char3>*/, 0 /*mulScalarImpl<ushort4, float, char4>*/},
{mulScalarImpl<ushort, float, ushort>, mulScalarImpl<ushort2, float, ushort2>, mulScalarImpl<ushort3, float, ushort3>, mulScalarImpl<ushort4, float, ushort4>},
{mulScalarImpl<ushort, float, short>, mulScalarImpl<ushort2, float, short2>, mulScalarImpl<ushort3, float, short3>, mulScalarImpl<ushort4, float, short4>},
{mulScalarImpl<ushort, float, int>, mulScalarImpl<ushort2, float, int2>, mulScalarImpl<ushort3, float, int3>, mulScalarImpl<ushort4, float, int4>},
{mulScalarImpl<ushort, float, float>, mulScalarImpl<ushort2, float, float2>, mulScalarImpl<ushort3, float, float3>, mulScalarImpl<ushort4, float, float4>},
{mulScalarImpl<ushort, double, double>, mulScalarImpl<ushort2, double, double2>, mulScalarImpl<ushort3, double, double3>, mulScalarImpl<ushort4, double, double4>}
},
{
{0 /*mulScalarImpl<short, float, uchar>*/, 0 /*mulScalarImpl<short2, float, uchar2>*/, 0 /*mulScalarImpl<short3, float, uchar3>*/, 0 /*mulScalarImpl<short4, float, uchar4>*/},
{0 /*mulScalarImpl<short, float, schar>*/, 0 /*mulScalarImpl<short2, float, char2>*/, 0 /*mulScalarImpl<short3, float, char3>*/, 0 /*mulScalarImpl<short4, float, char4>*/},
{mulScalarImpl<short, float, ushort>, mulScalarImpl<short2, float, ushort2>, mulScalarImpl<short3, float, ushort3>, mulScalarImpl<short4, float, ushort4>},
{mulScalarImpl<short, float, short>, mulScalarImpl<short2, float, short2>, mulScalarImpl<short3, float, short3>, mulScalarImpl<short4, float, short4>},
{mulScalarImpl<short, float, int>, mulScalarImpl<short2, float, int2>, mulScalarImpl<short3, float, int3>, mulScalarImpl<short4, float, int4>},
{mulScalarImpl<short, float, float>, mulScalarImpl<short2, float, float2>, mulScalarImpl<short3, float, float3>, mulScalarImpl<short4, float, float4>},
{mulScalarImpl<short, double, double>, mulScalarImpl<short2, double, double2>, mulScalarImpl<short3, double, double3>, mulScalarImpl<short4, double, double4>}
},
{
{0 /*mulScalarImpl<int, float, uchar>*/, 0 /*mulScalarImpl<int2, float, uchar2>*/, 0 /*mulScalarImpl<int3, float, uchar3>*/, 0 /*mulScalarImpl<int4, float, uchar4>*/},
{0 /*mulScalarImpl<int, float, schar>*/, 0 /*mulScalarImpl<int2, float, char2>*/, 0 /*mulScalarImpl<int3, float, char3>*/, 0 /*mulScalarImpl<int4, float, char4>*/},
{0 /*mulScalarImpl<int, float, ushort>*/, 0 /*mulScalarImpl<int2, float, ushort2>*/, 0 /*mulScalarImpl<int3, float, ushort3>*/, 0 /*mulScalarImpl<int4, float, ushort4>*/},
{0 /*mulScalarImpl<int, float, short>*/, 0 /*mulScalarImpl<int2, float, short2>*/, 0 /*mulScalarImpl<int3, float, short3>*/, 0 /*mulScalarImpl<int4, float, short4>*/},
{mulScalarImpl<int, float, int>, mulScalarImpl<int2, float, int2>, mulScalarImpl<int3, float, int3>, mulScalarImpl<int4, float, int4>},
{mulScalarImpl<int, float, float>, mulScalarImpl<int2, float, float2>, mulScalarImpl<int3, float, float3>, mulScalarImpl<int4, float, float4>},
{mulScalarImpl<int, double, double>, mulScalarImpl<int2, double, double2>, mulScalarImpl<int3, double, double3>, mulScalarImpl<int4, double, double4>}
},
{
{0 /*mulScalarImpl<float, float, uchar>*/, 0 /*mulScalarImpl<float2, float, uchar2>*/, 0 /*mulScalarImpl<float3, float, uchar3>*/, 0 /*mulScalarImpl<float4, float, uchar4>*/},
{0 /*mulScalarImpl<float, float, schar>*/, 0 /*mulScalarImpl<float2, float, char2>*/, 0 /*mulScalarImpl<float3, float, char3>*/, 0 /*mulScalarImpl<float4, float, char4>*/},
{0 /*mulScalarImpl<float, float, ushort>*/, 0 /*mulScalarImpl<float2, float, ushort2>*/, 0 /*mulScalarImpl<float3, float, ushort3>*/, 0 /*mulScalarImpl<float4, float, ushort4>*/},
{0 /*mulScalarImpl<float, float, short>*/, 0 /*mulScalarImpl<float2, float, short2>*/, 0 /*mulScalarImpl<float3, float, short3>*/, 0 /*mulScalarImpl<float4, float, short4>*/},
{0 /*mulScalarImpl<float, float, int>*/, 0 /*mulScalarImpl<float2, float, int2>*/, 0 /*mulScalarImpl<float3, float, int3>*/, 0 /*mulScalarImpl<float4, float, int4>*/},
{mulScalarImpl<float, float, float>, mulScalarImpl<float2, float, float2>, mulScalarImpl<float3, float, float3>, mulScalarImpl<float4, float, float4>},
{mulScalarImpl<float, double, double>, mulScalarImpl<float2, double, double2>, mulScalarImpl<float3, double, double3>, mulScalarImpl<float4, double, double4>}
},
{
{0 /*mulScalarImpl<double, double, uchar>*/, 0 /*mulScalarImpl<double2, double, uchar2>*/, 0 /*mulScalarImpl<double3, double, uchar3>*/, 0 /*mulScalarImpl<double4, double, uchar4>*/},
{0 /*mulScalarImpl<double, double, schar>*/, 0 /*mulScalarImpl<double2, double, char2>*/, 0 /*mulScalarImpl<double3, double, char3>*/, 0 /*mulScalarImpl<double4, double, char4>*/},
{0 /*mulScalarImpl<double, double, ushort>*/, 0 /*mulScalarImpl<double2, double, ushort2>*/, 0 /*mulScalarImpl<double3, double, ushort3>*/, 0 /*mulScalarImpl<double4, double, ushort4>*/},
{0 /*mulScalarImpl<double, double, short>*/, 0 /*mulScalarImpl<double2, double, short2>*/, 0 /*mulScalarImpl<double3, double, short3>*/, 0 /*mulScalarImpl<double4, double, short4>*/},
{0 /*mulScalarImpl<double, double, int>*/, 0 /*mulScalarImpl<double2, double, int2>*/, 0 /*mulScalarImpl<double3, double, int3>*/, 0 /*mulScalarImpl<double4, double, int4>*/},
{0 /*mulScalarImpl<double, double, float>*/, 0 /*mulScalarImpl<double2, double, float2>*/, 0 /*mulScalarImpl<double3, double, float3>*/, 0 /*mulScalarImpl<double4, double, float4>*/},
{mulScalarImpl<double, double, double>, mulScalarImpl<double2, double, double2>, mulScalarImpl<double3, double, double3>, mulScalarImpl<double4, double, double4>}
}
};
const int sdepth = src.depth();
const int ddepth = dst.depth();
const int cn = src.channels();
CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F && cn <= 4 );
val[0] *= scale;
val[1] *= scale;
val[2] *= scale;
val[3] *= scale;
const func_t func = funcs[sdepth][ddepth][cn - 1];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
func(src, val, dst, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,132 +40,126 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "cvconfig.h"
#ifndef HAVE_OPENCV_CUDEV
#ifdef HAVE_CUFFT
#error "opencv_cudev is required"
#include <cufft.h>
#else
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
namespace cv { namespace cuda { namespace device
{
//////////////////////////////////////////////////////////////////////////
// mulSpectrums
using namespace cv::cudev;
__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
//////////////////////////////////////////////////////////////////////////////
// mulSpectrums
if (x < c.cols && y < c.rows)
{
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
}
namespace
{
__device__ __forceinline__ float real(const float2& val)
{
return val.x;
}
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
__device__ __forceinline__ float imag(const float2& val)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
return val.y;
}
__device__ __forceinline__ float2 cmul(const float2& a, const float2& b)
{
return make_float2((real(a) * real(b)) - (imag(a) * imag(b)),
(real(a) * imag(b)) + (imag(a) * real(b)));
}
//////////////////////////////////////////////////////////////////////////
// mulSpectrums_CONJ
__global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
__device__ __forceinline__ float2 conj(const float2& a)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
return make_float2(real(a), -imag(a));
}
if (x < c.cols && y < c.rows)
struct comlex_mul : binary_function<float2, float2, float2>
{
__device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
{
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
return cmul(a, b);
}
}
};
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
struct comlex_mul_conj : binary_function<float2, float2, float2>
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
{
return cmul(a, conj(b));
}
};
struct comlex_mul_scale : binary_function<float2, float2, float2>
{
float scale;
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
__device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
{
return scale * cmul(a, b);
}
};
__global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
struct comlex_mul_conj_scale : binary_function<float2, float2, float2>
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
float scale;
if (x < c.cols && y < c.rows)
__device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
{
cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
return scale * cmul(a, conj(b));
}
}
};
}
void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
{
(void) flags;
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
CV_Assert( src1.size() == src2.size() );
if (stream)
cudaSafeCall( cudaDeviceSynchronize() );
}
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
if (conjB)
gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), comlex_mul_conj(), stream);
else
gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), comlex_mul(), stream);
}
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums_CONJ
void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
{
(void) flags;
__global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
if (x < c.cols && y < c.rows)
{
cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
}
}
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
if (conjB)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
comlex_mul_conj_scale op;
op.scale = scale;
gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), op, stream);
}
}}} // namespace cv { namespace cuda { namespace cudev
#endif // HAVE_CUFFT
else
{
comlex_mul_scale op;
op.scale = scale;
gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), op, stream);
}
}
#endif /* CUDA_DISABLER */
#endif

@ -0,0 +1,119 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/opencv_modules.hpp"
#ifndef HAVE_OPENCV_CUDEV
#error "opencv_cudev is required"
#else
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
namespace
{
double normDiffInf(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
{
const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
gridFindMinMaxVal(abs_(cvt_<int>(src1) - cvt_<int>(src2)), buf);
int data[2];
buf.download(cv::Mat(1, 2, buf.type(), data));
return data[1];
}
double normDiffL1(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
{
const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
gridCalcSum(abs_(cvt_<int>(src1) - cvt_<int>(src2)), buf);
int data;
buf.download(cv::Mat(1, 1, buf.type(), &data));
return data;
}
double normDiffL2(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
{
const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
GpuMat_<double>& buf = (GpuMat_<double>&) _buf;
gridCalcSum(sqr_(cvt_<double>(src1) - cvt_<double>(src2)), buf);
double data;
buf.download(cv::Mat(1, 1, buf.type(), &data));
return std::sqrt(data);
}
}
double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normType)
{
typedef double (*func_t)(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf);
static const func_t funcs[] =
{
0, normDiffInf, normDiffL1, 0, normDiffL2
};
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == CV_8UC1 );
CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() );
CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
return funcs[normType](src1, src2, buf);
}
#endif

@ -40,178 +40,172 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#ifndef HAVE_OPENCV_CUDEV
namespace cv { namespace cuda { namespace device
#error "opencv_cudev is required"
#else
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void cv::cuda::magnitude(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream)
{
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();
CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
_dst.create(x.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> magc(dst.reshape(1));
gridTransformBinary(xc, yc, magc, magnitude_func<float>(), stream);
}
void cv::cuda::magnitudeSqr(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream)
{
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();
CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
_dst.create(x.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> magc(dst.reshape(1));
gridTransformBinary(xc, yc, magc, magnitude_sqr_func<float>(), stream);
}
void cv::cuda::phase(InputArray _x, InputArray _y, OutputArray _dst, bool angleInDegrees, Stream& stream)
{
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();
CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
_dst.create(x.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> anglec(dst.reshape(1));
if (angleInDegrees)
gridTransformBinary(xc, yc, anglec, direction_func<float, true>(), stream);
else
gridTransformBinary(xc, yc, anglec, direction_func<float, false>(), stream);
}
void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, OutputArray _angle, bool angleInDegrees, Stream& stream)
{
namespace mathfunc
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();
CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
_mag.create(x.size(), CV_32FC1);
GpuMat mag = _mag.getGpuMat();
_angle.create(x.size(), CV_32FC1);
GpuMat angle = _angle.getGpuMat();
GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> magc(mag.reshape(1));
GpuMat_<float> anglec(angle.reshape(1));
if (angleInDegrees)
{
gridTransformTuple(zipPtr(xc, yc),
tie(magc, anglec),
make_tuple(
binaryTupleAdapter<0, 1>(magnitude_func<float>()),
binaryTupleAdapter<0, 1>(direction_func<float, true>())),
stream);
}
else
{
gridTransformTuple(zipPtr(xc, yc),
tie(magc, anglec),
make_tuple(
binaryTupleAdapter<0, 1>(magnitude_func<float>()),
binaryTupleAdapter<0, 1>(direction_func<float, false>())),
stream);
}
}
namespace
{
template <bool useMag>
__global__ void polarToCartImpl(const GlobPtr<float> mag, const GlobPtr<float> angle, GlobPtr<float> xmat, GlobPtr<float> ymat, const float scale, const int rows, const int cols)
{
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
struct Nothing
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
{
}
};
struct Magnitude
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
}
};
struct MagnitudeSqr
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
}
};
struct Atan2
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0f * CV_PI_F;
dst[y * dst_step + x] = scale * angle;
}
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float x_data = xptr[y * x_step + x];
float y_data = yptr[y * y_step + x];
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
}
}
struct NonEmptyMag
{
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{
return mag[y * mag_step + x];
}
};
struct EmptyMag
{
static __device__ __forceinline__ float get(const float*, size_t, int, int)
{
return 1.0f;
}
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float mag_data = Mag::get(mag, mag_step, x, y);
float angle_data = angle[y * angle_step + x];
float sin_a, cos_a;
::sincosf(scale * angle_data, &sin_a, &cos_a);
xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a;
}
}
template <typename Mag, typename Angle>
void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);
const float scale = angleInDegrees ? (180.0f / CV_PI_F) : 1.f;
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
{
{
cartToPolar_caller<Magnitude, Atan2>,
cartToPolar_caller<Magnitude, Nothing>
},
{
cartToPolar_caller<MagnitudeSqr, Atan2>,
cartToPolar_caller<MagnitudeSqr, Nothing>,
}
},
{
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>
},
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>,
}
}
};
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}
template <typename Mag>
void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);
const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<NonEmptyMag>,
polarToCart_caller<EmptyMag>
};
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
} // namespace mathfunc
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x >= cols || y >= rows)
return;
const float mag_val = useMag ? mag(y, x) : 1.0f;
const float angle_val = angle(y, x);
float sin_a, cos_a;
::sincosf(scale * angle_val, &sin_a, &cos_a);
xmat(y, x) = mag_val * cos_a;
ymat(y, x) = mag_val * sin_a;
}
}
void cv::cuda::polarToCart(InputArray _mag, InputArray _angle, OutputArray _x, OutputArray _y, bool angleInDegrees, Stream& _stream)
{
GpuMat mag = _mag.getGpuMat();
GpuMat angle = _angle.getGpuMat();
CV_DbgAssert( angle.depth() == CV_32F );
CV_DbgAssert( mag.empty() || (mag.type() == angle.type() && mag.size() == angle.size()) );
_x.create(angle.size(), CV_32FC1);
GpuMat x = _x.getGpuMat();
_y.create(angle.size(), CV_32FC1);
GpuMat y = _y.getGpuMat();
GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> magc(mag.reshape(1));
GpuMat_<float> anglec(angle.reshape(1));
const dim3 block(32, 8);
const dim3 grid(divUp(anglec.cols, block.x), divUp(anglec.rows, block.y));
const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;
cudaStream_t stream = StreamAccessor::getStream(_stream);
if (magc.empty())
polarToCartImpl<false><<<grid, block, 0, stream>>>(shrinkPtr(magc), shrinkPtr(anglec), shrinkPtr(xc), shrinkPtr(yc), scale, anglec.rows, anglec.cols);
else
polarToCartImpl<true><<<grid, block, 0, stream>>>(shrinkPtr(magc), shrinkPtr(anglec), shrinkPtr(xc), shrinkPtr(yc), scale, anglec.rows, anglec.cols);
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
if (stream == 0)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
#endif

@ -40,301 +40,258 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "unroll_detail.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace reduce
{
struct Sum
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(0);
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
return a + b;
}
template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}
__host__ __device__ __forceinline__ Sum() {}
__host__ __device__ __forceinline__ Sum(const Sum&) {}
};
template <typename T> struct OutputType
{
typedef float type;
};
template <> struct OutputType<double>
{
typedef double type;
};
struct Avg
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(0);
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
return a + b;
}
template <typename T>
__device__ __forceinline__ typename TypeVec<typename OutputType<typename VecTraits<T>::elem_type>::type, VecTraits<T>::cn>::vec_type result(T r, float sz) const
{
return r / sz;
}
__host__ __device__ __forceinline__ Avg() {}
__host__ __device__ __forceinline__ Avg(const Avg&) {}
};
struct Min
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
}
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
minimum<T> minOp;
return minOp(a, b);
}
template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}
using namespace cv::cudev;
__host__ __device__ __forceinline__ Min() {}
__host__ __device__ __forceinline__ Min(const Min&) {}
};
struct Max
namespace
{
template <typename T, typename S, typename D>
void reduceToRowImpl(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream)
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
}
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<D>& dst = (GpuMat_<D>&) _dst;
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
switch (reduceOp)
{
maximum<T> maxOp;
return maxOp(a, b);
}
case cv::REDUCE_SUM:
gridReduceToRow< Sum<S> >(src, dst, stream);
break;
template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}
case cv::REDUCE_AVG:
gridReduceToRow< Avg<S> >(src, dst, stream);
break;
__host__ __device__ __forceinline__ Max() {}
__host__ __device__ __forceinline__ Max(const Max&) {}
};
case cv::REDUCE_MIN:
gridReduceToRow< Min<S> >(src, dst, stream);
break;
///////////////////////////////////////////////////////////
case cv::REDUCE_MAX:
gridReduceToRow< Max<S> >(src, dst, stream);
break;
};
}
template <typename T, typename S, typename D, class Op>
__global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
template <typename T, typename S, typename D>
void reduceToColumnImpl_(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream)
{
__shared__ S smem[16 * 16];
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<D>& dst = (GpuMat_<D>&) _dst;
const int x = blockIdx.x * 16 + threadIdx.x;
S myVal = op.template startValue<S>();
if (x < src.cols)
switch (reduceOp)
{
for (int y = threadIdx.y; y < src.rows; y += 16)
{
S srcVal = src(y, x);
myVal = op(myVal, srcVal);
}
}
smem[threadIdx.x * 16 + threadIdx.y] = myVal;
__syncthreads();
volatile S* srow = smem + threadIdx.y * 16;
myVal = srow[threadIdx.x];
device::reduce<16>(srow, myVal, threadIdx.x, op);
if (threadIdx.x == 0)
srow[0] = myVal;
__syncthreads();
if (threadIdx.y == 0 && x < src.cols)
dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
}
case cv::REDUCE_SUM:
gridReduceToColumn< Sum<S> >(src, dst, stream);
break;
template <typename T, typename S, typename D, class Op>
void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(src.cols, block.x));
case cv::REDUCE_AVG:
gridReduceToColumn< Avg<S> >(src, dst, stream);
break;
Op op;
rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
cudaSafeCall( cudaGetLastError() );
case cv::REDUCE_MIN:
gridReduceToColumn< Min<S> >(src, dst, stream);
break;
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
case cv::REDUCE_MAX:
gridReduceToColumn< Max<S> >(src, dst, stream);
break;
};
}
template <typename T, typename S, typename D>
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
void reduceToColumnImpl(const GpuMat& src, GpuMat& dst, int reduceOp, Stream& stream)
{
typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
static const func_t funcs[] =
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int reduceOp, Stream& stream);
static const func_t funcs[4] =
{
rowsCaller<T, S, D, Sum>,
rowsCaller<T, S, D, Avg>,
rowsCaller<T, S, D, Max>,
rowsCaller<T, S, D, Min>
reduceToColumnImpl_<T, S, D>,
reduceToColumnImpl_<typename MakeVec<T, 2>::type, typename MakeVec<S, 2>::type, typename MakeVec<D, 2>::type>,
reduceToColumnImpl_<typename MakeVec<T, 3>::type, typename MakeVec<S, 3>::type, typename MakeVec<D, 3>::type>,
reduceToColumnImpl_<typename MakeVec<T, 4>::type, typename MakeVec<S, 4>::type, typename MakeVec<D, 4>::type>
};
funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
funcs[src.channels() - 1](src, dst, reduceOp, stream);
}
}
template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
///////////////////////////////////////////////////////////
template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
__global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
{
typedef typename TypeVec<T, cn>::vec_type src_type;
typedef typename TypeVec<S, cn>::vec_type work_type;
typedef typename TypeVec<D, cn>::vec_type dst_type;
__shared__ S smem[BLOCK_SIZE * cn];
const int y = blockIdx.x;
const src_type* srcRow = src.ptr(y);
void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
{
GpuMat src = _src.getGpuMat();
work_type myVal = op.template startValue<work_type>();
CV_Assert( src.channels() <= 4 );
CV_Assert( dim == 0 || dim == 1 );
CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));
if (dtype < 0)
dtype = src.depth();
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));
_dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
GpuMat dst = _dst.getGpuMat();
if (threadIdx.x == 0)
dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
}
template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
if (dim == 0)
{
const int BLOCK_SIZE = 256;
const dim3 block(BLOCK_SIZE);
const dim3 grid(src.rows);
typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream);
static const func_t funcs[7][7] =
{
{
reduceToRowImpl<uchar, int, uchar>,
0 /*reduceToRowImpl<uchar, int, schar>*/,
0 /*reduceToRowImpl<uchar, int, ushort>*/,
0 /*reduceToRowImpl<uchar, int, short>*/,
reduceToRowImpl<uchar, int, int>,
reduceToRowImpl<uchar, float, float>,
reduceToRowImpl<uchar, double, double>
},
{
0 /*reduceToRowImpl<schar, int, uchar>*/,
0 /*reduceToRowImpl<schar, int, schar>*/,
0 /*reduceToRowImpl<schar, int, ushort>*/,
0 /*reduceToRowImpl<schar, int, short>*/,
0 /*reduceToRowImpl<schar, int, int>*/,
0 /*reduceToRowImpl<schar, float, float>*/,
0 /*reduceToRowImpl<schar, double, double>*/
},
{
0 /*reduceToRowImpl<ushort, int, uchar>*/,
0 /*reduceToRowImpl<ushort, int, schar>*/,
reduceToRowImpl<ushort, int, ushort>,
0 /*reduceToRowImpl<ushort, int, short>*/,
reduceToRowImpl<ushort, int, int>,
reduceToRowImpl<ushort, float, float>,
reduceToRowImpl<ushort, double, double>
},
{
0 /*reduceToRowImpl<short, int, uchar>*/,
0 /*reduceToRowImpl<short, int, schar>*/,
0 /*reduceToRowImpl<short, int, ushort>*/,
reduceToRowImpl<short, int, short>,
reduceToRowImpl<short, int, int>,
reduceToRowImpl<short, float, float>,
reduceToRowImpl<short, double, double>
},
{
0 /*reduceToRowImpl<int, int, uchar>*/,
0 /*reduceToRowImpl<int, int, schar>*/,
0 /*reduceToRowImpl<int, int, ushort>*/,
0 /*reduceToRowImpl<int, int, short>*/,
reduceToRowImpl<int, int, int>,
reduceToRowImpl<int, float, float>,
reduceToRowImpl<int, double, double>
},
{
0 /*reduceToRowImpl<float, float, uchar>*/,
0 /*reduceToRowImpl<float, float, schar>*/,
0 /*reduceToRowImpl<float, float, ushort>*/,
0 /*reduceToRowImpl<float, float, short>*/,
0 /*reduceToRowImpl<float, float, int>*/,
reduceToRowImpl<float, float, float>,
reduceToRowImpl<float, double, double>
},
{
0 /*reduceToRowImpl<double, double, uchar>*/,
0 /*reduceToRowImpl<double, double, schar>*/,
0 /*reduceToRowImpl<double, double, ushort>*/,
0 /*reduceToRowImpl<double, double, short>*/,
0 /*reduceToRowImpl<double, double, int>*/,
0 /*reduceToRowImpl<double, double, float>*/,
reduceToRowImpl<double, double, double>
}
};
Op op;
colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
cudaSafeCall( cudaGetLastError() );
const func_t func = funcs[src.depth()][dst.depth()];
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
GpuMat dst_cont = dst.reshape(1);
func(src.reshape(1), dst_cont, reduceOp, stream);
}
template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
else
{
typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
static const func_t funcs[5][4] =
typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream);
static const func_t funcs[7][7] =
{
{0,0,0,0},
{colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
{colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
{colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
{colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
{
reduceToColumnImpl<uchar, int, uchar>,
0 /*reduceToColumnImpl<uchar, int, schar>*/,
0 /*reduceToColumnImpl<uchar, int, ushort>*/,
0 /*reduceToColumnImpl<uchar, int, short>*/,
reduceToColumnImpl<uchar, int, int>,
reduceToColumnImpl<uchar, float, float>,
reduceToColumnImpl<uchar, double, double>
},
{
0 /*reduceToColumnImpl<schar, int, uchar>*/,
0 /*reduceToColumnImpl<schar, int, schar>*/,
0 /*reduceToColumnImpl<schar, int, ushort>*/,
0 /*reduceToColumnImpl<schar, int, short>*/,
0 /*reduceToColumnImpl<schar, int, int>*/,
0 /*reduceToColumnImpl<schar, float, float>*/,
0 /*reduceToColumnImpl<schar, double, double>*/
},
{
0 /*reduceToColumnImpl<ushort, int, uchar>*/,
0 /*reduceToColumnImpl<ushort, int, schar>*/,
reduceToColumnImpl<ushort, int, ushort>,
0 /*reduceToColumnImpl<ushort, int, short>*/,
reduceToColumnImpl<ushort, int, int>,
reduceToColumnImpl<ushort, float, float>,
reduceToColumnImpl<ushort, double, double>
},
{
0 /*reduceToColumnImpl<short, int, uchar>*/,
0 /*reduceToColumnImpl<short, int, schar>*/,
0 /*reduceToColumnImpl<short, int, ushort>*/,
reduceToColumnImpl<short, int, short>,
reduceToColumnImpl<short, int, int>,
reduceToColumnImpl<short, float, float>,
reduceToColumnImpl<short, double, double>
},
{
0 /*reduceToColumnImpl<int, int, uchar>*/,
0 /*reduceToColumnImpl<int, int, schar>*/,
0 /*reduceToColumnImpl<int, int, ushort>*/,
0 /*reduceToColumnImpl<int, int, short>*/,
reduceToColumnImpl<int, int, int>,
reduceToColumnImpl<int, float, float>,
reduceToColumnImpl<int, double, double>
},
{
0 /*reduceToColumnImpl<float, float, uchar>*/,
0 /*reduceToColumnImpl<float, float, schar>*/,
0 /*reduceToColumnImpl<float, float, ushort>*/,
0 /*reduceToColumnImpl<float, float, short>*/,
0 /*reduceToColumnImpl<float, float, int>*/,
reduceToColumnImpl<float, float, float>,
reduceToColumnImpl<float, double, double>
},
{
0 /*reduceToColumnImpl<double, double, uchar>*/,
0 /*reduceToColumnImpl<double, double, schar>*/,
0 /*reduceToColumnImpl<double, double, ushort>*/,
0 /*reduceToColumnImpl<double, double, short>*/,
0 /*reduceToColumnImpl<double, double, int>*/,
0 /*reduceToColumnImpl<double, double, float>*/,
reduceToColumnImpl<double, double, double>
}
};
funcs[cn][op](src, dst, stream);
}
const func_t func = funcs[src.depth()][dst.depth()];
template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
func(src, dst, reduceOp, stream);
}
}
#endif /* CUDA_DISABLER */
#endif

@ -40,472 +40,209 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#ifndef HAVE_OPENCV_CUDEV
namespace cv { namespace cuda { namespace device
{
namespace split_merge
{
template <typename T, size_t elem_size = sizeof(T)>
struct TypeTraits
{
typedef T type;
typedef T type2;
typedef T type3;
typedef T type4;
};
#error "opencv_cudev is required"
template <typename T>
struct TypeTraits<T, 1>
{
typedef char type;
typedef char2 type2;
typedef char3 type3;
typedef char4 type4;
};
#else
template <typename T>
struct TypeTraits<T, 2>
{
typedef short type;
typedef short2 type2;
typedef short3 type3;
typedef short4 type4;
};
template <typename T>
struct TypeTraits<T, 4>
{
typedef int type;
typedef int2 type2;
typedef int3 type3;
typedef int4 type4;
};
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
template <typename T>
struct TypeTraits<T, 8>
{
typedef double type;
typedef double2 type2;
//typedef double3 type3;
//typedef double4 type3;
};
using namespace cv::cudev;
typedef void (*MergeFunction)(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream);
typedef void (*SplitFunction)(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream);
////////////////////////////////////////////////////////////////////////
/// merge
//------------------------------------------------------------
// Merge
namespace
{
template <int cn, typename T> struct MergeFunc;
template <typename T>
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
int rows, int cols, uchar* dst, size_t dst_step)
template <typename T> struct MergeFunc<2, T>
{
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
typedef typename TypeTraits<T>::type2 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_y[x] = dst_elem;
}
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1])),
globPtr<typename MakeVec<T, 2>::type>(dst),
stream);
}
};
template <typename T>
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type3 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_y[x] = dst_elem;
}
}
template <>
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
template <typename T> struct MergeFunc<3, T>
{
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
double* dst_y = (double*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[3 * x] = src0_y[x];
dst_y[3 * x + 1] = src1_y[x];
dst_y[3 * x + 2] = src2_y[x];
}
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])),
globPtr<typename MakeVec<T, 3>::type>(dst),
stream);
}
};
template <typename T>
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
template <typename T> struct MergeFunc<4, T>
{
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
typedef typename TypeTraits<T>::type4 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
const T* src3_y = (const T*)(src3 + y * src3_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_elem.w = src3_y[x];
dst_y[x] = dst_elem;
}
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])),
globPtr<typename MakeVec<T, 4>::type>(dst),
stream);
}
};
void mergeImpl(const GpuMat* src, size_t n, cv::OutputArray _dst, Stream& stream)
{
CV_DbgAssert( src != 0 );
CV_DbgAssert( n > 0 && n <= 4 );
template <>
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
const double* src3_y = (const double*)(src3 + y * src3_step);
double2* dst_y = (double2*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
}
}
const int depth = src[0].depth();
const cv::Size size = src[0].size();
template <typename T>
static void mergeC2_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
#ifdef _DEBUG
for (size_t i = 0; i < n; ++i)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC2_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
CV_Assert( src[i].size() == size );
CV_Assert( src[i].depth() == depth );
CV_Assert( src[i].channels() == 1 );
}
#endif
template <typename T>
static void mergeC3_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
if (n == 1)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC3_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
src[0].copyTo(_dst, stream);
}
template <typename T>
static void mergeC4_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
else
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC4_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
src[3].data, src[3].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void merge(const PtrStepSzb* src, PtrStepSzb& dst,
int total_channels, size_t elem_size,
const cudaStream_t& stream)
{
static MergeFunction merge_func_tbl[] =
typedef void (*func_t)(const GpuMat* src, GpuMat& dst, Stream& stream);
static const func_t funcs[3][5] =
{
mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
{MergeFunc<2, uchar>::call, MergeFunc<2, ushort>::call, MergeFunc<2, int>::call, 0, MergeFunc<2, double>::call},
{MergeFunc<3, uchar>::call, MergeFunc<3, ushort>::call, MergeFunc<3, int>::call, 0, MergeFunc<3, double>::call},
{MergeFunc<4, uchar>::call, MergeFunc<4, ushort>::call, MergeFunc<4, int>::call, 0, MergeFunc<4, double>::call}
};
size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
MergeFunction merge_func = merge_func_tbl[merge_func_id];
if (merge_func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
merge_func(src, dst, stream);
}
const int channels = static_cast<int>(n);
//------------------------------------------------------------
// Split
_dst.create(size, CV_MAKE_TYPE(depth, channels));
GpuMat dst = _dst.getGpuMat();
const func_t func = funcs[channels - 2][CV_ELEM_SIZE(depth) / 2];
template <typename T>
__global__ void splitC2_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step)
{
typedef typename TypeTraits<T>::type2 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
if (func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
}
func(src, dst, stream);
}
}
}
void cv::cuda::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream)
{
mergeImpl(src, n, dst, stream);
}
template <typename T>
__global__ void splitC3_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
typedef typename TypeTraits<T>::type3 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
void cv::cuda::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream)
{
mergeImpl(&src[0], src.size(), dst, stream);
}
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
}
}
////////////////////////////////////////////////////////////////////////
/// split
namespace
{
template <int cn, typename T> struct SplitFunc;
template <>
__global__ void splitC3_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
template <typename T> struct SplitFunc<2, T>
{
static void call(const GpuMat& src, GpuMat* dst, Stream& stream)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src_y = (const double*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
if (x < cols && y < rows)
GlobPtrSz<T> dstarr[2] =
{
dst0_y[x] = src_y[3 * x];
dst1_y[x] = src_y[3 * x + 1];
dst2_y[x] = src_y[3 * x + 2];
}
}
globPtr<T>(dst[0]), globPtr<T>(dst[1])
};
gridSplit(globPtr<typename MakeVec<T, 2>::type>(src), dstarr, stream);
}
};
template <typename T>
__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
template <typename T> struct SplitFunc<3, T>
{
static void call(const GpuMat& src, GpuMat* dst, Stream& stream)
{
typedef typename TypeTraits<T>::type4 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
T* dst3_y = (T*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
GlobPtrSz<T> dstarr[3] =
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
dst3_y[x] = src_elem.w;
}
}
globPtr<T>(dst[0]), globPtr<T>(dst[1]), globPtr<T>(dst[2])
};
gridSplit(globPtr<typename MakeVec<T, 3>::type>(src), dstarr, stream);
}
};
template <>
__global__ void splitC4_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
template <typename T> struct SplitFunc<4, T>
{
static void call(const GpuMat& src, GpuMat* dst, Stream& stream)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double2* src_y = (const double2*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
double* dst3_y = (double*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
GlobPtrSz<T> dstarr[4] =
{
double2 src_elem1 = src_y[2 * x];
double2 src_elem2 = src_y[2 * x + 1];
dst0_y[x] = src_elem1.x;
dst1_y[x] = src_elem1.y;
dst2_y[x] = src_elem2.x;
dst3_y[x] = src_elem2.y;
}
globPtr<T>(dst[0]), globPtr<T>(dst[1]), globPtr<T>(dst[2]), globPtr<T>(dst[3])
};
gridSplit(globPtr<typename MakeVec<T, 4>::type>(src), dstarr, stream);
}
};
template <typename T>
static void splitC2_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
void splitImpl(const GpuMat& src, GpuMat* dst, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat* dst, Stream& stream);
static const func_t funcs[3][5] =
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC2_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
{SplitFunc<2, uchar>::call, SplitFunc<2, ushort>::call, SplitFunc<2, int>::call, 0, SplitFunc<2, double>::call},
{SplitFunc<3, uchar>::call, SplitFunc<3, ushort>::call, SplitFunc<3, int>::call, 0, SplitFunc<3, double>::call},
{SplitFunc<4, uchar>::call, SplitFunc<4, ushort>::call, SplitFunc<4, int>::call, 0, SplitFunc<4, double>::call}
};
CV_DbgAssert( dst != 0 );
template <typename T>
static void splitC3_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC3_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
const int depth = src.depth();
const int channels = src.channels();
CV_DbgAssert( channels <= 4 );
if (channels == 0)
return;
template <typename T>
static void splitC4_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
if (channels == 1)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC4_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step,
dst[3].data, dst[3].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
src.copyTo(dst[0], stream);
return;
}
for (int i = 0; i < channels; ++i)
dst[i].create(src.size(), depth);
void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
{
static SplitFunction split_func_tbl[] =
{
splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
};
const func_t func = funcs[channels - 2][CV_ELEM_SIZE(depth) / 2];
size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
SplitFunction split_func = split_func_tbl[split_func_id];
if (func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
if (split_func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
func(src, dst, stream);
}
}
split_func(src, dst, stream);
}
} // namespace split_merge
}}} // namespace cv { namespace cuda { namespace cudev
void cv::cuda::split(InputArray _src, GpuMat* dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
splitImpl(src, dst, stream);
}
void cv::cuda::split(InputArray _src, std::vector<GpuMat>& dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
dst.resize(src.channels());
if (src.channels() > 0)
splitImpl(src, &dst[0], stream);
}
#endif /* CUDA_DISABLER */
#endif

@ -40,146 +40,186 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void subMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int);
namespace
{
struct VSub4 : binary_function<uint, uint, uint>
template <typename T, typename D> struct SubOp1 : binary_function<T, T, D>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
__device__ __forceinline__ D operator ()(T a, T b) const
{
return vsub4(a, b);
return saturate_cast<D>(a - b);
}
__host__ __device__ __forceinline__ VSub4() {}
__host__ __device__ __forceinline__ VSub4(const VSub4&) {}
};
struct VSub2 : binary_function<uint, uint, uint>
template <typename T, typename D>
void subMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
if (mask.data)
gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), SubOp1<T, D>(), globPtr<uchar>(mask), stream);
else
gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), SubOp1<T, D>(), stream);
}
struct SubOp2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vsub2(a, b);
}
__host__ __device__ __forceinline__ VSub2() {}
__host__ __device__ __forceinline__ VSub2(const VSub2&) {}
};
template <typename T, typename D> struct SubMat : binary_function<T, T, D>
void subMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
__device__ __forceinline__ D operator ()(T a, T b) const
const int vcols = src1.cols >> 1;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, SubOp2(), stream);
}
struct SubOp4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return saturate_cast<D>(a - b);
return vsub4(a, b);
}
__host__ __device__ __forceinline__ SubMat() {}
__host__ __device__ __forceinline__ SubMat(const SubMat&) {}
};
void subMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
const int vcols = src1.cols >> 2;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, SubOp4(), stream);
}
}
namespace cv { namespace cuda { namespace device
void subMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int)
{
template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream);
static const func_t funcs[7][7] =
{
{
subMat_v1<uchar, uchar>,
subMat_v1<uchar, schar>,
subMat_v1<uchar, ushort>,
subMat_v1<uchar, short>,
subMat_v1<uchar, int>,
subMat_v1<uchar, float>,
subMat_v1<uchar, double>
},
{
subMat_v1<schar, uchar>,
subMat_v1<schar, schar>,
subMat_v1<schar, ushort>,
subMat_v1<schar, short>,
subMat_v1<schar, int>,
subMat_v1<schar, float>,
subMat_v1<schar, double>
},
{
0 /*subMat_v1<ushort, uchar>*/,
0 /*subMat_v1<ushort, schar>*/,
subMat_v1<ushort, ushort>,
subMat_v1<ushort, short>,
subMat_v1<ushort, int>,
subMat_v1<ushort, float>,
subMat_v1<ushort, double>
},
{
0 /*subMat_v1<short, uchar>*/,
0 /*subMat_v1<short, schar>*/,
subMat_v1<short, ushort>,
subMat_v1<short, short>,
subMat_v1<short, int>,
subMat_v1<short, float>,
subMat_v1<short, double>
},
{
0 /*subMat_v1<int, uchar>*/,
0 /*subMat_v1<int, schar>*/,
0 /*subMat_v1<int, ushort>*/,
0 /*subMat_v1<int, short>*/,
subMat_v1<int, int>,
subMat_v1<int, float>,
subMat_v1<int, double>
},
{
0 /*subMat_v1<float, uchar>*/,
0 /*subMat_v1<float, schar>*/,
0 /*subMat_v1<float, ushort>*/,
0 /*subMat_v1<float, short>*/,
0 /*subMat_v1<float, int>*/,
subMat_v1<float, float>,
subMat_v1<float, double>
},
{
0 /*subMat_v1<double, uchar>*/,
0 /*subMat_v1<double, schar>*/,
0 /*subMat_v1<double, ushort>*/,
0 /*subMat_v1<double, short>*/,
0 /*subMat_v1<double, int>*/,
0 /*subMat_v1<double, float>*/,
subMat_v1<double, double>
}
};
template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
const int sdepth = src1.depth();
const int ddepth = dst.depth();
template <typename T, typename D> struct TransformFunctorTraits< arithm::SubMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F );
namespace arithm
{
void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
}
GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);
void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
{
device::transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
}
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), WithOutMask(), stream);
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (isAllAligned)
{
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
{
subMat_v4(src1_, src2_, dst_, stream);
return;
}
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
{
subMat_v2(src1_, src2_, dst_, stream);
return;
}
}
}
template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
const func_t func = funcs[sdepth][ddepth];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
func(src1_, src2_, dst_, mask, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,110 +40,164 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void subScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int);
namespace
{
template <typename T, typename S, typename D> struct SubScalar : unary_function<T, D>
template <typename SrcType, typename ScalarType, typename DstType> struct SubScalarOp : unary_function<SrcType, DstType>
{
S val;
int scale;
ScalarType val;
__device__ __forceinline__ DstType operator ()(SrcType a) const
{
return saturate_cast<DstType>(saturate_cast<ScalarType>(a) - val);
}
};
__host__ SubScalar(S val_, int scale_) : val(val_), scale(scale_) {}
template <typename SrcType, typename ScalarType, typename DstType> struct SubScalarOpInv : unary_function<SrcType, DstType>
{
ScalarType val;
__device__ __forceinline__ D operator ()(T a) const
__device__ __forceinline__ DstType operator ()(SrcType a) const
{
return saturate_cast<D>(scale * (a - val));
return saturate_cast<DstType>(val - saturate_cast<ScalarType>(a));
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::SubScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
}}}
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
namespace arithm
{
template <typename T, typename S, typename D>
void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
template <typename SrcType, typename ScalarDepth, typename DstType>
void subScalarImpl(const GpuMat& src, cv::Scalar value, bool inv, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
SubScalar<T, S, D> op(static_cast<S>(val), inv ? -1 : 1);
typedef typename MakeVec<ScalarDepth, VecTraits<SrcType>::cn>::type ScalarType;
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
cv::Scalar_<ScalarDepth> value_ = value;
if (inv)
{
SubScalarOpInv<SrcType, ScalarType, DstType> op;
op.val = VecTraits<ScalarType>::make(value_.val);
if (mask.data)
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, globPtr<uchar>(mask), stream);
else
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
}
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
{
SubScalarOp<SrcType, ScalarType, DstType> op;
op.val = VecTraits<ScalarType>::make(value_.val);
if (mask.data)
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, globPtr<uchar>(mask), stream);
else
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
}
}
}
void subScalar(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src, cv::Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, Stream& stream);
static const func_t funcs[7][7][4] =
{
{
{subScalarImpl<uchar, float, uchar>, subScalarImpl<uchar2, float, uchar2>, subScalarImpl<uchar3, float, uchar3>, subScalarImpl<uchar4, float, uchar4>},
{subScalarImpl<uchar, float, schar>, subScalarImpl<uchar2, float, char2>, subScalarImpl<uchar3, float, char3>, subScalarImpl<uchar4, float, char4>},
{subScalarImpl<uchar, float, ushort>, subScalarImpl<uchar2, float, ushort2>, subScalarImpl<uchar3, float, ushort3>, subScalarImpl<uchar4, float, ushort4>},
{subScalarImpl<uchar, float, short>, subScalarImpl<uchar2, float, short2>, subScalarImpl<uchar3, float, short3>, subScalarImpl<uchar4, float, short4>},
{subScalarImpl<uchar, float, int>, subScalarImpl<uchar2, float, int2>, subScalarImpl<uchar3, float, int3>, subScalarImpl<uchar4, float, int4>},
{subScalarImpl<uchar, float, float>, subScalarImpl<uchar2, float, float2>, subScalarImpl<uchar3, float, float3>, subScalarImpl<uchar4, float, float4>},
{subScalarImpl<uchar, double, double>, subScalarImpl<uchar2, double, double2>, subScalarImpl<uchar3, double, double3>, subScalarImpl<uchar4, double, double4>}
},
{
{subScalarImpl<schar, float, uchar>, subScalarImpl<char2, float, uchar2>, subScalarImpl<char3, float, uchar3>, subScalarImpl<char4, float, uchar4>},
{subScalarImpl<schar, float, schar>, subScalarImpl<char2, float, char2>, subScalarImpl<char3, float, char3>, subScalarImpl<char4, float, char4>},
{subScalarImpl<schar, float, ushort>, subScalarImpl<char2, float, ushort2>, subScalarImpl<char3, float, ushort3>, subScalarImpl<char4, float, ushort4>},
{subScalarImpl<schar, float, short>, subScalarImpl<char2, float, short2>, subScalarImpl<char3, float, short3>, subScalarImpl<char4, float, short4>},
{subScalarImpl<schar, float, int>, subScalarImpl<char2, float, int2>, subScalarImpl<char3, float, int3>, subScalarImpl<char4, float, int4>},
{subScalarImpl<schar, float, float>, subScalarImpl<char2, float, float2>, subScalarImpl<char3, float, float3>, subScalarImpl<char4, float, float4>},
{subScalarImpl<schar, double, double>, subScalarImpl<char2, double, double2>, subScalarImpl<char3, double, double3>, subScalarImpl<char4, double, double4>}
},
{
{0 /*subScalarImpl<ushort, float, uchar>*/, 0 /*subScalarImpl<ushort2, float, uchar2>*/, 0 /*subScalarImpl<ushort3, float, uchar3>*/, 0 /*subScalarImpl<ushort4, float, uchar4>*/},
{0 /*subScalarImpl<ushort, float, schar>*/, 0 /*subScalarImpl<ushort2, float, char2>*/, 0 /*subScalarImpl<ushort3, float, char3>*/, 0 /*subScalarImpl<ushort4, float, char4>*/},
{subScalarImpl<ushort, float, ushort>, subScalarImpl<ushort2, float, ushort2>, subScalarImpl<ushort3, float, ushort3>, subScalarImpl<ushort4, float, ushort4>},
{subScalarImpl<ushort, float, short>, subScalarImpl<ushort2, float, short2>, subScalarImpl<ushort3, float, short3>, subScalarImpl<ushort4, float, short4>},
{subScalarImpl<ushort, float, int>, subScalarImpl<ushort2, float, int2>, subScalarImpl<ushort3, float, int3>, subScalarImpl<ushort4, float, int4>},
{subScalarImpl<ushort, float, float>, subScalarImpl<ushort2, float, float2>, subScalarImpl<ushort3, float, float3>, subScalarImpl<ushort4, float, float4>},
{subScalarImpl<ushort, double, double>, subScalarImpl<ushort2, double, double2>, subScalarImpl<ushort3, double, double3>, subScalarImpl<ushort4, double, double4>}
},
{
{0 /*subScalarImpl<short, float, uchar>*/, 0 /*subScalarImpl<short2, float, uchar2>*/, 0 /*subScalarImpl<short3, float, uchar3>*/, 0 /*subScalarImpl<short4, float, uchar4>*/},
{0 /*subScalarImpl<short, float, schar>*/, 0 /*subScalarImpl<short2, float, char2>*/, 0 /*subScalarImpl<short3, float, char3>*/, 0 /*subScalarImpl<short4, float, char4>*/},
{subScalarImpl<short, float, ushort>, subScalarImpl<short2, float, ushort2>, subScalarImpl<short3, float, ushort3>, subScalarImpl<short4, float, ushort4>},
{subScalarImpl<short, float, short>, subScalarImpl<short2, float, short2>, subScalarImpl<short3, float, short3>, subScalarImpl<short4, float, short4>},
{subScalarImpl<short, float, int>, subScalarImpl<short2, float, int2>, subScalarImpl<short3, float, int3>, subScalarImpl<short4, float, int4>},
{subScalarImpl<short, float, float>, subScalarImpl<short2, float, float2>, subScalarImpl<short3, float, float3>, subScalarImpl<short4, float, float4>},
{subScalarImpl<short, double, double>, subScalarImpl<short2, double, double2>, subScalarImpl<short3, double, double3>, subScalarImpl<short4, double, double4>}
},
{
{0 /*subScalarImpl<int, float, uchar>*/, 0 /*subScalarImpl<int2, float, uchar2>*/, 0 /*subScalarImpl<int3, float, uchar3>*/, 0 /*subScalarImpl<int4, float, uchar4>*/},
{0 /*subScalarImpl<int, float, schar>*/, 0 /*subScalarImpl<int2, float, char2>*/, 0 /*subScalarImpl<int3, float, char3>*/, 0 /*subScalarImpl<int4, float, char4>*/},
{0 /*subScalarImpl<int, float, ushort>*/, 0 /*subScalarImpl<int2, float, ushort2>*/, 0 /*subScalarImpl<int3, float, ushort3>*/, 0 /*subScalarImpl<int4, float, ushort4>*/},
{0 /*subScalarImpl<int, float, short>*/, 0 /*subScalarImpl<int2, float, short2>*/, 0 /*subScalarImpl<int3, float, short3>*/, 0 /*subScalarImpl<int4, float, short4>*/},
{subScalarImpl<int, float, int>, subScalarImpl<int2, float, int2>, subScalarImpl<int3, float, int3>, subScalarImpl<int4, float, int4>},
{subScalarImpl<int, float, float>, subScalarImpl<int2, float, float2>, subScalarImpl<int3, float, float3>, subScalarImpl<int4, float, float4>},
{subScalarImpl<int, double, double>, subScalarImpl<int2, double, double2>, subScalarImpl<int3, double, double3>, subScalarImpl<int4, double, double4>}
},
{
{0 /*subScalarImpl<float, float, uchar>*/, 0 /*subScalarImpl<float2, float, uchar2>*/, 0 /*subScalarImpl<float3, float, uchar3>*/, 0 /*subScalarImpl<float4, float, uchar4>*/},
{0 /*subScalarImpl<float, float, schar>*/, 0 /*subScalarImpl<float2, float, char2>*/, 0 /*subScalarImpl<float3, float, char3>*/, 0 /*subScalarImpl<float4, float, char4>*/},
{0 /*subScalarImpl<float, float, ushort>*/, 0 /*subScalarImpl<float2, float, ushort2>*/, 0 /*subScalarImpl<float3, float, ushort3>*/, 0 /*subScalarImpl<float4, float, ushort4>*/},
{0 /*subScalarImpl<float, float, short>*/, 0 /*subScalarImpl<float2, float, short2>*/, 0 /*subScalarImpl<float3, float, short3>*/, 0 /*subScalarImpl<float4, float, short4>*/},
{0 /*subScalarImpl<float, float, int>*/, 0 /*subScalarImpl<float2, float, int2>*/, 0 /*subScalarImpl<float3, float, int3>*/, 0 /*subScalarImpl<float4, float, int4>*/},
{subScalarImpl<float, float, float>, subScalarImpl<float2, float, float2>, subScalarImpl<float3, float, float3>, subScalarImpl<float4, float, float4>},
{subScalarImpl<float, double, double>, subScalarImpl<float2, double, double2>, subScalarImpl<float3, double, double3>, subScalarImpl<float4, double, double4>}
},
{
{0 /*subScalarImpl<double, double, uchar>*/, 0 /*subScalarImpl<double2, double, uchar2>*/, 0 /*subScalarImpl<double3, double, uchar3>*/, 0 /*subScalarImpl<double4, double, uchar4>*/},
{0 /*subScalarImpl<double, double, schar>*/, 0 /*subScalarImpl<double2, double, char2>*/, 0 /*subScalarImpl<double3, double, char3>*/, 0 /*subScalarImpl<double4, double, char4>*/},
{0 /*subScalarImpl<double, double, ushort>*/, 0 /*subScalarImpl<double2, double, ushort2>*/, 0 /*subScalarImpl<double3, double, ushort3>*/, 0 /*subScalarImpl<double4, double, ushort4>*/},
{0 /*subScalarImpl<double, double, short>*/, 0 /*subScalarImpl<double2, double, short2>*/, 0 /*subScalarImpl<double3, double, short3>*/, 0 /*subScalarImpl<double4, double, short4>*/},
{0 /*subScalarImpl<double, double, int>*/, 0 /*subScalarImpl<double2, double, int2>*/, 0 /*subScalarImpl<double3, double, int3>*/, 0 /*subScalarImpl<double4, double, int4>*/},
{0 /*subScalarImpl<double, double, float>*/, 0 /*subScalarImpl<double2, double, float2>*/, 0 /*subScalarImpl<double3, double, float3>*/, 0 /*subScalarImpl<double4, double, float4>*/},
{subScalarImpl<double, double, double>, subScalarImpl<double2, double, double2>, subScalarImpl<double3, double, double3>, subScalarImpl<double4, double, double4>}
}
};
const int sdepth = src.depth();
const int ddepth = dst.depth();
const int cn = src.channels();
CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F && cn <= 4 );
const func_t func = funcs[sdepth][ddepth][cn - 1];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
func(src, val, inv, dst, mask, stream);
}
#endif // CUDA_DISABLER
#endif

@ -40,342 +40,155 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/utility.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "unroll_detail.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace sum
{
__device__ unsigned int blocks_finished = 0;
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
template <typename R, int cn> struct AtomicAdd;
template <typename R> struct AtomicAdd<R, 1>
{
static __device__ void run(R* ptr, R val)
{
Emulation::glob::atomicAdd(ptr, val);
}
};
template <typename R> struct AtomicAdd<R, 2>
{
typedef typename TypeVec<R, 2>::vec_type val_type;
using namespace cv::cudev;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
}
};
template <typename R> struct AtomicAdd<R, 3>
{
typedef typename TypeVec<R, 3>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
}
};
template <typename R> struct AtomicAdd<R, 4>
namespace
{
template <typename T, typename R, int cn>
cv::Scalar sumImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
{
typedef typename TypeVec<R, 4>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
Emulation::glob::atomicAdd(ptr + 3, val.w);
}
};
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;
template <int BLOCK_SIZE, typename R, int cn>
struct GlobalReduce
{
typedef typename TypeVec<R, cn>::vec_type result_type;
static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
{
#if __CUDA_ARCH__ >= 200
if (tid == 0)
AtomicAdd<R, cn>::run((R*) result, sum);
#else
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = sum;
__threadfence();
unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
if (tid == 0)
{
result[0] = sum;
blocks_finished = 0;
}
}
#endif
}
};
const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;
template <int BLOCK_SIZE, typename src_type, typename result_type, class Mask, class Op>
__global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight)
{
typedef typename VecTraits<src_type>::elem_type T;
typedef typename VecTraits<result_type>::elem_type R;
const int cn = VecTraits<src_type>::cn;
__shared__ R smem[BLOCK_SIZE * cn];
if (mask.empty())
gridCalcSum(src, buf);
else
gridCalcSum(src, buf, globPtr<uchar>(mask));
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
cv::Scalar_<R> res;
cv::Mat res_mat(buf.size(), buf.type(), res.val);
buf.download(res_mat);
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
return res;
}
result_type sum = VecTraits<result_type>::all(0);
template <typename T, typename R, int cn>
cv::Scalar sumAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
{
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const src_type* ptr = src.ptr(y);
const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const src_type srcVal = ptr[x];
sum = sum + op(saturate_cast<result_type>(srcVal));
}
}
}
if (mask.empty())
gridCalcSum(abs_(cvt_<res_type>(src)), buf);
else
gridCalcSum(abs_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
cv::Scalar_<R> res;
cv::Mat res_mat(buf.size(), buf.type(), res.val);
buf.download(res_mat);
GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
return res;
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
template <typename T, typename R, int cn>
cv::Scalar sumSqrImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
{
block = dim3(threads_x, threads_y);
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
if (mask.empty())
gridCalcSum(sqr_(cvt_<res_type>(src)), buf);
else
gridCalcSum(sqr_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
cv::Scalar_<R> res;
cv::Mat res_mat(buf.size(), buf.type(), res.val);
buf.download(res_mat);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
return res;
}
}
template <typename T, typename R, int cn, template <typename> class Op>
void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask)
cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
{
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
static const func_t funcs[7][4] =
{
typedef typename TypeVec<T, cn>::vec_type src_type;
typedef typename TypeVec<R, cn>::vec_type result_type;
PtrStepSz<src_type> src(src_);
result_type* buf = (result_type*) buf_;
{sumImpl<uchar , uint , 1>, sumImpl<uchar , uint , 2>, sumImpl<uchar , uint , 3>, sumImpl<uchar , uint , 4>},
{sumImpl<schar , int , 1>, sumImpl<schar , int , 2>, sumImpl<schar , int , 3>, sumImpl<schar , int , 4>},
{sumImpl<ushort, uint , 1>, sumImpl<ushort, uint , 2>, sumImpl<ushort, uint , 3>, sumImpl<ushort, uint , 4>},
{sumImpl<short , int , 1>, sumImpl<short , int , 2>, sumImpl<short , int , 3>, sumImpl<short , int , 4>},
{sumImpl<int , int , 1>, sumImpl<int , int , 2>, sumImpl<int , int , 3>, sumImpl<int , int , 4>},
{sumImpl<float , float , 1>, sumImpl<float , float , 2>, sumImpl<float , float , 3>, sumImpl<float , float , 4>},
{sumImpl<double, double, 1>, sumImpl<double, double, 2>, sumImpl<double, double, 3>, sumImpl<double, double, 4>}
};
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
Op<result_type> op;
const func_t func = funcs[src.depth()][src.channels() - 1];
if (mask.data)
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, SingleMask(mask), op, twidth, theight);
else
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, WithOutMask(), op, twidth, theight);
cudaSafeCall( cudaGetLastError() );
return func(src, mask, buf);
}
cudaSafeCall( cudaDeviceSynchronize() );
cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
static const func_t funcs[7][4] =
{
{sumAbsImpl<uchar , uint , 1>, sumAbsImpl<uchar , uint , 2>, sumAbsImpl<uchar , uint , 3>, sumAbsImpl<uchar , uint , 4>},
{sumAbsImpl<schar , int , 1>, sumAbsImpl<schar , int , 2>, sumAbsImpl<schar , int , 3>, sumAbsImpl<schar , int , 4>},
{sumAbsImpl<ushort, uint , 1>, sumAbsImpl<ushort, uint , 2>, sumAbsImpl<ushort, uint , 3>, sumAbsImpl<ushort, uint , 4>},
{sumAbsImpl<short , int , 1>, sumAbsImpl<short , int , 2>, sumAbsImpl<short , int , 3>, sumAbsImpl<short , int , 4>},
{sumAbsImpl<int , int , 1>, sumAbsImpl<int , int , 2>, sumAbsImpl<int , int , 3>, sumAbsImpl<int , int , 4>},
{sumAbsImpl<float , float , 1>, sumAbsImpl<float , float , 2>, sumAbsImpl<float , float , 3>, sumAbsImpl<float , float , 4>},
{sumAbsImpl<double, double, 1>, sumAbsImpl<double, double, 2>, sumAbsImpl<double, double, 3>, sumAbsImpl<double, double, 4>}
};
R result[4] = {0, 0, 0, 0};
cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
out[0] = result[0];
out[1] = result[1];
out[2] = result[2];
out[3] = result[3];
}
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
template <typename T> struct SumType;
template <> struct SumType<uchar> { typedef unsigned int R; };
template <> struct SumType<schar> { typedef int R; };
template <> struct SumType<ushort> { typedef unsigned int R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
template <typename T, int cn>
void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, identity>(src, buf, out, mask);
}
const func_t func = funcs[src.depth()][src.channels() - 1];
template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template <typename T, int cn>
void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, abs_func>(src, buf, out, mask);
}
return func(src, mask, buf);
}
template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template <typename T> struct Sqr : unary_function<T, T>
cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
static const func_t funcs[7][4] =
{
__device__ __forceinline__ T operator ()(T x) const
{
return x * x;
}
{sumSqrImpl<uchar , double, 1>, sumSqrImpl<uchar , double, 2>, sumSqrImpl<uchar , double, 3>, sumSqrImpl<uchar , double, 4>},
{sumSqrImpl<schar , double, 1>, sumSqrImpl<schar , double, 2>, sumSqrImpl<schar , double, 3>, sumSqrImpl<schar , double, 4>},
{sumSqrImpl<ushort, double, 1>, sumSqrImpl<ushort, double, 2>, sumSqrImpl<ushort, double, 3>, sumSqrImpl<ushort, double, 4>},
{sumSqrImpl<short , double, 1>, sumSqrImpl<short , double, 2>, sumSqrImpl<short , double, 3>, sumSqrImpl<short , double, 4>},
{sumSqrImpl<int , double, 1>, sumSqrImpl<int , double, 2>, sumSqrImpl<int , double, 3>, sumSqrImpl<int , double, 4>},
{sumSqrImpl<float , double, 1>, sumSqrImpl<float , double, 2>, sumSqrImpl<float , double, 3>, sumSqrImpl<float , double, 4>},
{sumSqrImpl<double, double, 1>, sumSqrImpl<double, double, 2>, sumSqrImpl<double, double, 3>, sumSqrImpl<double, double, 4>}
};
template <typename T, int cn>
void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
caller<T, double, cn, Sqr>(src, buf, out, mask);
}
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
const func_t func = funcs[src.depth()][src.channels() - 1];
template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
return func(src, mask, buf);
}
#endif // CUDA_DISABLER
#endif

@ -40,75 +40,109 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
using namespace cv::cudev;
template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
namespace
{
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename T>
void thresholdImpl(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& stream)
{
};
}}}
const T thresh_ = static_cast<T>(thresh);
const T maxVal_ = static_cast<T>(maxVal);
namespace arithm
{
template <template <typename> class Op, typename T>
void threshold_caller(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream)
{
Op<T> op(thresh, maxVal);
device::transform(src, dst, op, WithOutMask(), stream);
switch (type)
{
case 0:
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), thresh_binary_func(thresh_, maxVal_), stream);
break;
case 1:
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), thresh_binary_inv_func(thresh_, maxVal_), stream);
break;
case 2:
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), thresh_trunc_func(thresh_), stream);
break;
case 3:
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), thresh_to_zero_func(thresh_), stream);
break;
case 4:
gridTransformUnary_< TransformPolicy<T> >(globPtr<T>(src), globPtr<T>(dst), thresh_to_zero_inv_func(thresh_), stream);
break;
};
}
}
template <typename T>
void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream)
double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream)
{
GpuMat src = _src.getGpuMat();
const int depth = src.depth();
CV_DbgAssert( src.channels() == 1 && depth <= CV_64F );
CV_DbgAssert( type <= 4 /*THRESH_TOZERO_INV*/ );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
if (depth == CV_32F && type == 2 /*THRESH_TRUNC*/)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream);
NppStreamHandler h(StreamAccessor::getStream(stream));
static const caller_t callers[] =
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );
if (!stream)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
else
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& stream);
static const func_t funcs[] =
{
threshold_caller<thresh_binary_func, T>,
threshold_caller<thresh_binary_inv_func, T>,
threshold_caller<thresh_trunc_func, T>,
threshold_caller<thresh_to_zero_func, T>,
threshold_caller<thresh_to_zero_inv_func, T>
thresholdImpl<uchar>,
thresholdImpl<schar>,
thresholdImpl<ushort>,
thresholdImpl<short>,
thresholdImpl<int>,
thresholdImpl<float>,
thresholdImpl<double>
};
callers[type]((PtrStepSz<T>) src, (PtrStepSz<T>) dst, static_cast<T>(thresh), static_cast<T>(maxVal), stream);
if (depth != CV_32F && depth != CV_64F)
{
thresh = cvFloor(thresh);
maxVal = cvRound(maxVal);
}
funcs[depth](src, dst, thresh, maxVal, type, stream);
}
template void threshold<uchar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<schar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<ushort>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<short>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<int>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<float>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<double>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
return thresh;
}
#endif // CUDA_DISABLER
#endif

@ -40,83 +40,53 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#ifndef HAVE_OPENCV_CUDEV
using namespace cv::cuda;
using namespace cv::cuda::device;
#error "opencv_cudev is required"
namespace arithm
{
const int TRANSPOSE_TILE_DIM = 16;
const int TRANSPOSE_BLOCK_ROWS = 16;
template <typename T>
__global__ void transposeKernel(const PtrStepSz<T> src, PtrStep<T> dst)
{
__shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
#else
int blockIdx_x, blockIdx_y;
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
// do diagonal reordering
if (gridDim.x == gridDim.y)
{
blockIdx_y = blockIdx.x;
blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
}
else
{
int bid = blockIdx.x + gridDim.x * blockIdx.y;
blockIdx_y = bid % gridDim.y;
blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
}
using namespace cv::cudev;
int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
if (xIndex < src.cols)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.rows)
{
tile[threadIdx.y + i][threadIdx.x] = src(yIndex + i, xIndex);
}
}
}
void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
__syncthreads();
const size_t elemSize = src.elemSize();
xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );
if (xIndex < src.rows)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.cols)
{
dst(yIndex + i, xIndex) = tile[threadIdx.x][threadIdx.y + i];
}
}
}
}
_dst.create( src.cols, src.rows, src.type() );
GpuMat dst = _dst.getGpuMat();
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
if (elemSize == 1)
{
const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
NppStreamHandler h(StreamAccessor::getStream(stream));
transposeKernel<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
template void transpose<int>(PtrStepSz<int> src, PtrStepSz<int> dst, cudaStream_t stream);
template void transpose<double>(PtrStepSz<double> src, PtrStepSz<double> dst, cudaStream_t stream);
if (!stream)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
else if (elemSize == 4)
{
gridTranspose(globPtr<int>(src), globPtr<int>(dst), stream);
}
else // if (elemSize == 8)
{
gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
}
}
#endif // CUDA_DISABLER
#endif

@ -1,135 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __UNROLL_DETAIL_HPP__
#define __UNROLL_DETAIL_HPP__
#include <thrust/tuple.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
namespace detail
{
template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ volatile R* smem_tuple(R* smem)
{
return smem;
}
template <typename R>
static __device__ __forceinline__ R& tie(R& val)
{
return val;
}
template <class Op>
static __device__ __forceinline__ const Op& op(const Op& op)
{
return op;
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y, val.z);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y, val.z, val.w);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op, Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op, op, op);
}
};
}
#endif // __UNROLL_DETAIL_HPP__

File diff suppressed because it is too large Load Diff

@ -52,13 +52,6 @@
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
# include "opencv2/cudalegacy.hpp"
# include "opencv2/cudalegacy/private.hpp"
#endif
#ifdef HAVE_CUBLAS
# include <cublas.h>
#endif

@ -133,513 +133,6 @@ double cv::cuda::norm(InputArray _src, int normType, InputArray _mask, GpuMat& b
return std::max(std::abs(min_val), std::abs(max_val));
}
double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normType)
{
#if CUDA_VERSION < 5050
(void) buf;
typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);
static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
#else
typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
NppiSize oSizeROI, Npp64f* pRetVal, Npp8u * pDeviceBuffer);
typedef NppStatus (*buf_size_func_t)(NppiSize oSizeROI, int* hpBufferSize);
static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
#endif
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == CV_8UC1 );
CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() );
CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
NppiSize sz;
sz.width = src1.cols;
sz.height = src1.rows;
const int funcIdx = normType >> 1;
DeviceBuffer dbuf;
#if CUDA_VERSION < 5050
nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
#else
int bufSize;
buf_size_funcs[funcIdx](sz, &bufSize);
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
#endif
cudaSafeCall( cudaDeviceSynchronize() );
double retVal;
dbuf.download(&retVal);
return retVal;
}
////////////////////////////////////////////////////////////////////////
// Sum
namespace sum
{
void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);
template <typename T, int cn>
void run(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
template <typename T, int cn>
void runAbs(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
template <typename T, int cn>
void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
}
Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
static const func_t funcs[7][5] =
{
{0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
{0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
{0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
{0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
{0, ::sum::run<int , 1>, ::sum::run<int , 2>, ::sum::run<int , 3>, ::sum::run<int , 4>},
{0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
{0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
};
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
func(src, buf.data, result, mask);
return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
static const func_t funcs[7][5] =
{
{0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
{0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
{0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
{0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
{0, ::sum::runAbs<int , 1>, ::sum::runAbs<int , 2>, ::sum::runAbs<int , 3>, ::sum::runAbs<int , 4>},
{0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
{0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
};
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
func(src, buf.data, result, mask);
return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
static const func_t funcs[7][5] =
{
{0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
{0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
{0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
{0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
{0, ::sum::runSqr<int , 1>, ::sum::runSqr<int , 2>, ::sum::runSqr<int , 3>, ::sum::runSqr<int , 4>},
{0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
{0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
};
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
func(src, buf.data, result, mask);
return Scalar(result[0], result[1], result[2], result[3]);
}
////////////////////////////////////////////////////////////////////////
// minMax
namespace minMax
{
void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}
void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
static const func_t funcs[] =
{
::minMax::run<uchar>,
::minMax::run<schar>,
::minMax::run<ushort>,
::minMax::run<short>,
::minMax::run<int>,
::minMax::run<float>,
::minMax::run<double>
};
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::minMax::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
const func_t func = funcs[src.depth()];
double temp1, temp2;
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
}
////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minMaxLoc
{
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows);
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
}
void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
static const func_t funcs[] =
{
::minMaxLoc::run<uchar>,
::minMaxLoc::run<schar>,
::minMaxLoc::run<ushort>,
::minMaxLoc::run<short>,
::minMaxLoc::run<int>,
::minMaxLoc::run<float>,
::minMaxLoc::run<double>
};
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size valbuf_size, locbuf_size;
::minMaxLoc::getBufSize(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height);
ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
const func_t func = funcs[src.depth()];
double temp1, temp2;
Point temp3, temp4;
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, minLoc ? &minLoc->x : &temp3.x, maxLoc ? &maxLoc->x : &temp4.x, valBuf, locBuf);
}
//////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countNonZero
{
void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
}
int cv::cuda::countNonZero(InputArray _src, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
static const func_t funcs[] =
{
::countNonZero::run<uchar>,
::countNonZero::run<schar>,
::countNonZero::run<ushort>,
::countNonZero::run<short>,
::countNonZero::run<int>,
::countNonZero::run<float>,
::countNonZero::run<double>
};
CV_Assert(src.channels() == 1);
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::countNonZero::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
const func_t func = funcs[src.depth()];
return func(src, buf);
}
//////////////////////////////////////////////////////////////////////////////
// reduce
namespace reduce
{
template <typename T, typename S, typename D>
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template <typename T, typename S, typename D>
void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}
void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.channels() <= 4 );
CV_Assert( dim == 0 || dim == 1 );
CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
if (dtype < 0)
dtype = src.depth();
_dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
GpuMat dst = _dst.getGpuMat();
if (dim == 0)
{
typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
static const func_t funcs[7][7] =
{
{
::reduce::rows<unsigned char, int, unsigned char>,
0/*::reduce::rows<unsigned char, int, signed char>*/,
0/*::reduce::rows<unsigned char, int, unsigned short>*/,
0/*::reduce::rows<unsigned char, int, short>*/,
::reduce::rows<unsigned char, int, int>,
::reduce::rows<unsigned char, float, float>,
::reduce::rows<unsigned char, double, double>
},
{
0/*::reduce::rows<signed char, int, unsigned char>*/,
0/*::reduce::rows<signed char, int, signed char>*/,
0/*::reduce::rows<signed char, int, unsigned short>*/,
0/*::reduce::rows<signed char, int, short>*/,
0/*::reduce::rows<signed char, int, int>*/,
0/*::reduce::rows<signed char, float, float>*/,
0/*::reduce::rows<signed char, double, double>*/
},
{
0/*::reduce::rows<unsigned short, int, unsigned char>*/,
0/*::reduce::rows<unsigned short, int, signed char>*/,
::reduce::rows<unsigned short, int, unsigned short>,
0/*::reduce::rows<unsigned short, int, short>*/,
::reduce::rows<unsigned short, int, int>,
::reduce::rows<unsigned short, float, float>,
::reduce::rows<unsigned short, double, double>
},
{
0/*::reduce::rows<short, int, unsigned char>*/,
0/*::reduce::rows<short, int, signed char>*/,
0/*::reduce::rows<short, int, unsigned short>*/,
::reduce::rows<short, int, short>,
::reduce::rows<short, int, int>,
::reduce::rows<short, float, float>,
::reduce::rows<short, double, double>
},
{
0/*::reduce::rows<int, int, unsigned char>*/,
0/*::reduce::rows<int, int, signed char>*/,
0/*::reduce::rows<int, int, unsigned short>*/,
0/*::reduce::rows<int, int, short>*/,
::reduce::rows<int, int, int>,
::reduce::rows<int, float, float>,
::reduce::rows<int, double, double>
},
{
0/*::reduce::rows<float, float, unsigned char>*/,
0/*::reduce::rows<float, float, signed char>*/,
0/*::reduce::rows<float, float, unsigned short>*/,
0/*::reduce::rows<float, float, short>*/,
0/*::reduce::rows<float, float, int>*/,
::reduce::rows<float, float, float>,
::reduce::rows<float, double, double>
},
{
0/*::reduce::rows<double, double, unsigned char>*/,
0/*::reduce::rows<double, double, signed char>*/,
0/*::reduce::rows<double, double, unsigned short>*/,
0/*::reduce::rows<double, double, short>*/,
0/*::reduce::rows<double, double, int>*/,
0/*::reduce::rows<double, double, float>*/,
::reduce::rows<double, double, double>
}
};
const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
}
else
{
typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
static const func_t funcs[7][7] =
{
{
::reduce::cols<unsigned char, int, unsigned char>,
0/*::reduce::cols<unsigned char, int, signed char>*/,
0/*::reduce::cols<unsigned char, int, unsigned short>*/,
0/*::reduce::cols<unsigned char, int, short>*/,
::reduce::cols<unsigned char, int, int>,
::reduce::cols<unsigned char, float, float>,
::reduce::cols<unsigned char, double, double>
},
{
0/*::reduce::cols<signed char, int, unsigned char>*/,
0/*::reduce::cols<signed char, int, signed char>*/,
0/*::reduce::cols<signed char, int, unsigned short>*/,
0/*::reduce::cols<signed char, int, short>*/,
0/*::reduce::cols<signed char, int, int>*/,
0/*::reduce::cols<signed char, float, float>*/,
0/*::reduce::cols<signed char, double, double>*/
},
{
0/*::reduce::cols<unsigned short, int, unsigned char>*/,
0/*::reduce::cols<unsigned short, int, signed char>*/,
::reduce::cols<unsigned short, int, unsigned short>,
0/*::reduce::cols<unsigned short, int, short>*/,
::reduce::cols<unsigned short, int, int>,
::reduce::cols<unsigned short, float, float>,
::reduce::cols<unsigned short, double, double>
},
{
0/*::reduce::cols<short, int, unsigned char>*/,
0/*::reduce::cols<short, int, signed char>*/,
0/*::reduce::cols<short, int, unsigned short>*/,
::reduce::cols<short, int, short>,
::reduce::cols<short, int, int>,
::reduce::cols<short, float, float>,
::reduce::cols<short, double, double>
},
{
0/*::reduce::cols<int, int, unsigned char>*/,
0/*::reduce::cols<int, int, signed char>*/,
0/*::reduce::cols<int, int, unsigned short>*/,
0/*::reduce::cols<int, int, short>*/,
::reduce::cols<int, int, int>,
::reduce::cols<int, float, float>,
::reduce::cols<int, double, double>
},
{
0/*::reduce::cols<float, float, unsigned char>*/,
0/*::reduce::cols<float, float, signed char>*/,
0/*::reduce::cols<float, float, unsigned short>*/,
0/*::reduce::cols<float, float, short>*/,
0/*::reduce::cols<float, float, int>*/,
::reduce::cols<float, float, float>,
::reduce::cols<float, double, double>
},
{
0/*::reduce::cols<double, double, unsigned char>*/,
0/*::reduce::cols<double, double, signed char>*/,
0/*::reduce::cols<double, double, unsigned short>*/,
0/*::reduce::cols<double, double, short>*/,
0/*::reduce::cols<double, double, int>*/,
0/*::reduce::cols<double, double, float>*/,
::reduce::cols<double, double, double>
}
};
const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
}
}
////////////////////////////////////////////////////////////////////////
// meanStdDev
@ -748,116 +241,4 @@ void cv::cuda::normalize(InputArray _src, OutputArray dst, double a, double b, i
}
}
////////////////////////////////////////////////////////////////////////
// integral
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
}
}}}
void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
cudaStream_t stream = StreamAccessor::getStream(_stream);
cv::Size whole;
cv::Point offset;
src.locateROI(whole, offset);
if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
&& offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
{
ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
cv::cuda::device::imgproc::shfl_integral_gpu(src, buffer, stream);
_dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
GpuMat dst = _dst.getGpuMat();
dst.setTo(Scalar::all(0), _stream);
GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
res.copyTo(inner, _stream);
}
else
{
#ifndef HAVE_OPENCV_CUDALEGACY
throw_no_cuda();
#else
_dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
GpuMat dst = _dst.getGpuMat();
NcvSize32u roiSize;
roiSize.width = src.cols;
roiSize.height = src.rows;
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
Ncv32u bufSize;
ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
NppStStreamHandler h(stream);
ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
dst.ptr<Ncv32u>(), static_cast<int>(dst.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
#endif
}
}
//////////////////////////////////////////////////////////////////////////////
// sqrIntegral
void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& _stream)
{
#ifndef HAVE_OPENCV_CUDALEGACY
(void) _src;
(void) _dst;
(void) _stream;
throw_no_cuda();
#else
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8U );
NcvSize32u roiSize;
roiSize.width = src.cols;
roiSize.height = src.rows;
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
Ncv32u bufSize;
ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
ensureSizeIsEnough(1, bufSize, CV_8U, buf);
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStStreamHandler h(stream);
_dst.create(src.rows + 1, src.cols + 1, CV_64F);
GpuMat dst = _dst.getGpuMat();
ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
dst.ptr<Ncv64u>(0), static_cast<int>(dst.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
#endif
}
#endif

@ -125,43 +125,6 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, GEMM, testing::Combine(
ALL_GEMM_FLAGS,
WHOLE_SUBMAT));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Integral
PARAM_TEST_CASE(Integral, cv::cuda::DeviceInfo, cv::Size, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
useRoi = GET_PARAM(2);
cv::cuda::setDevice(devInfo.deviceID());
}
};
CUDA_TEST_P(Integral, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_32SC1, useRoi);
cv::cuda::integral(loadMat(src, useRoi), dst);
cv::Mat dst_gold;
cv::integral(src, dst_gold, CV_32S);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Integral, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////
// MulSpectrums

@ -816,4 +816,78 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, MeanStdDev, testing::Combine(
DIFFERENT_SIZES,
WHOLE_SUBMAT));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Integral
PARAM_TEST_CASE(Integral, cv::cuda::DeviceInfo, cv::Size, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
useRoi = GET_PARAM(2);
cv::cuda::setDevice(devInfo.deviceID());
}
};
CUDA_TEST_P(Integral, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_32SC1, useRoi);
cv::cuda::integral(loadMat(src, useRoi), dst);
cv::Mat dst_gold;
cv::integral(src, dst_gold, CV_32S);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Integral, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
WHOLE_SUBMAT));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// IntegralSqr
PARAM_TEST_CASE(IntegralSqr, cv::cuda::DeviceInfo, cv::Size, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
useRoi = GET_PARAM(2);
cv::cuda::setDevice(devInfo.deviceID());
}
};
CUDA_TEST_P(IntegralSqr, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_64FC1, useRoi);
cv::cuda::sqrIntegral(loadMat(src, useRoi), dst);
cv::Mat dst_gold, temp;
cv::integral(src, temp, dst_gold);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(CUDA_Arithm, IntegralSqr, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
WHOLE_SUBMAT));
#endif // HAVE_CUDA

@ -4,7 +4,7 @@ endif()
set(the_description "CUDA device layer")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable -Wenum-compare)
ocv_add_module(cudev)

@ -73,7 +73,7 @@
#include "cudev/block/vec_distance.hpp"
#include "cudev/grid/copy.hpp"
#include "cudev/grid/glob_reduce.hpp"
#include "cudev/grid/reduce.hpp"
#include "cudev/grid/histogram.hpp"
#include "cudev/grid/integral.hpp"
#include "cudev/grid/pyramids.hpp"

@ -47,7 +47,7 @@
#define __OPENCV_CUDEV_EXPR_REDUCTION_HPP__
#include "../common.hpp"
#include "../grid/glob_reduce.hpp"
#include "../grid/reduce.hpp"
#include "../grid/histogram.hpp"
#include "../grid/integral.hpp"
#include "../grid/reduce_to_vec.hpp"

@ -616,6 +616,30 @@ template <typename T> struct magnitude_func : binary_function<T, T, typename fun
}
};
template <typename T> struct magnitude_sqr_func : binary_function<T, T, typename functional_detail::FloatType<T>::type>
{
__device__ __forceinline__ typename functional_detail::FloatType<T>::type operator ()(typename TypeTraits<T>::parameter_type a, typename TypeTraits<T>::parameter_type b) const
{
return a * a + b * b;
}
};
template <typename T, bool angleInDegrees> struct direction_func : binary_function<T, T, T>
{
__device__ T operator ()(T x, T y) const
{
atan2_func<T> f;
typename atan2_func<T>::result_type angle = f(y, x);
angle += (angle < 0) * (2.0f * CV_PI_F);
if (angleInDegrees)
angle *= (180.0f / CV_PI_F);
return saturate_cast<T>(angle);
}
};
template <typename T> struct pow_func : binary_function<T, float, float>
{
__device__ __forceinline__ float operator ()(T val, float power) const

@ -594,7 +594,7 @@ namespace integral_detail
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
__host__ static void integral(const GlobPtr<uchar> src, GlobPtr<uint> dst, int rows, int cols, cudaStream_t stream)
__host__ static void integral(const GlobPtr<uchar>& src, const GlobPtr<uint>& dst, int rows, int cols, cudaStream_t stream)
{
if (deviceSupports(FEATURE_SET_COMPUTE_30)
&& (cols % 16 == 0)
@ -614,7 +614,7 @@ namespace integral_detail
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
__host__ static void integral(const GlobPtr<uchar> src, GlobPtr<int> dst, int rows, int cols, cudaStream_t stream)
__host__ __forceinline__ void integral(const GlobPtr<uchar>& src, const GlobPtr<int>& dst, int rows, int cols, cudaStream_t stream)
{
GlobPtr<uint> dstui = globPtr((uint*) dst.data, dst.step);
integral(src, dstui, rows, cols, stream);

@ -0,0 +1,177 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma once
#ifndef __OPENCV_CUDEV_GRID_MINMAXLOC_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_MINMAXLOC_DETAIL_HPP__
#include "../../common.hpp"
#include "../../util/vec_traits.hpp"
#include "../../util/type_traits.hpp"
#include "../../util/limits.hpp"
#include "../../block/reduce.hpp"
namespace cv { namespace cudev {
namespace grid_minmaxloc_detail
{
template <int BLOCK_SIZE, class SrcPtr, typename ResType, class MaskPtr>
__global__ void minMaxLoc_pass_1(const SrcPtr src, ResType* minVal, ResType* maxVal, int* minLoc, int* maxLoc, const MaskPtr mask, const int rows, const int cols, const int patch_y, const int patch_x)
{
__shared__ ResType sMinVal[BLOCK_SIZE];
__shared__ ResType sMaxVal[BLOCK_SIZE];
__shared__ uint sMinLoc[BLOCK_SIZE];
__shared__ uint sMaxLoc[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * patch_x + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * patch_y + threadIdx.y;
ResType myMin = numeric_limits<ResType>::max();
ResType myMax = -numeric_limits<ResType>::max();
int myMinLoc = -1;
int myMaxLoc = -1;
for (int i = 0, y = y0; i < patch_y && y < rows; ++i, y += blockDim.y)
{
for (int j = 0, x = x0; j < patch_x && x < cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const ResType srcVal = src(y, x);
if (srcVal < myMin)
{
myMin = srcVal;
myMinLoc = y * cols + x;
}
if (srcVal > myMax)
{
myMax = srcVal;
myMaxLoc = y * cols + x;
}
}
}
}
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
blockReduceKeyVal<BLOCK_SIZE>(smem_tuple(sMinVal, sMaxVal), tie(myMin, myMax),
smem_tuple(sMinLoc, sMaxLoc), tie(myMinLoc, myMaxLoc),
tid,
make_tuple(less<ResType>(), greater<ResType>()));
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
if (tid == 0)
{
minVal[bid] = myMin;
maxVal[bid] = myMax;
minLoc[bid] = myMinLoc;
maxLoc[bid] = myMaxLoc;
}
}
template <int BLOCK_SIZE, typename T>
__global__ void minMaxLoc_pass_2(T* minMal, T* maxVal, int* minLoc, int* maxLoc, int count)
{
__shared__ T sMinVal[BLOCK_SIZE];
__shared__ T sMaxVal[BLOCK_SIZE];
__shared__ int sMinLoc[BLOCK_SIZE];
__shared__ int sMaxLoc[BLOCK_SIZE];
const int idx = ::min(threadIdx.x, count - 1);
T myMin = minMal[idx];
T myMax = maxVal[idx];
int myMinLoc = minLoc[idx];
int myMaxLoc = maxLoc[idx];
blockReduceKeyVal<BLOCK_SIZE>(smem_tuple(sMinVal, sMaxVal), tie(myMin, myMax),
smem_tuple(sMinLoc, sMaxLoc), tie(myMinLoc, myMaxLoc),
threadIdx.x,
make_tuple(less<T>(), greater<T>()));
if (threadIdx.x == 0)
{
minMal[0] = myMin;
maxVal[0] = myMax;
minLoc[0] = myMinLoc;
maxLoc[0] = myMaxLoc;
}
}
template <class Policy>
void getLaunchCfg(int rows, int cols, dim3& block, dim3& grid)
{
block = dim3(Policy::block_size_x, Policy::block_size_y);
grid = dim3(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
__host__ void minMaxLoc(const SrcPtr& src, ResType* minVal, ResType* maxVal, int* minLoc, int* maxLoc, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
dim3 block, grid;
getLaunchCfg<Policy>(cols, rows, block, grid);
const int patch_x = divUp(divUp(cols, grid.x), block.x);
const int patch_y = divUp(divUp(rows, grid.y), block.y);
minMaxLoc_pass_1<Policy::block_size_x * Policy::block_size_y><<<grid, block, 0, stream>>>(src, minVal, maxVal, minLoc, maxLoc, mask, rows, cols, patch_y, patch_x);
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
minMaxLoc_pass_2<Policy::block_size_x * Policy::block_size_y><<<1, Policy::block_size_x * Policy::block_size_y, 0, stream>>>(minVal, maxVal, minLoc, maxLoc, grid.x * grid.y);
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
if (stream == 0)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
}
}}
#endif

@ -43,8 +43,8 @@
#pragma once
#ifndef __OPENCV_CUDEV_GRID_GLOB_REDUCE_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_GLOB_REDUCE_DETAIL_HPP__
#ifndef __OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP__
#include "../../common.hpp"
#include "../../util/tuple.hpp"
@ -59,7 +59,7 @@
namespace cv { namespace cudev {
namespace grid_glob_reduce_detail
namespace grid_reduce_detail
{
// Unroll
@ -389,7 +389,7 @@ namespace grid_glob_reduce_detail
// glob_reduce
template <class Reductor, int BLOCK_SIZE, int PATCH_X, int PATCH_Y, class SrcPtr, typename ResType, class MaskPtr>
__global__ void glob_reduce(const SrcPtr src, ResType* result, const MaskPtr mask, const int rows, const int cols)
__global__ void reduce(const SrcPtr src, ResType* result, const MaskPtr mask, const int rows, const int cols)
{
const int x0 = blockIdx.x * blockDim.x * PATCH_X + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * PATCH_Y + threadIdx.y;
@ -413,14 +413,12 @@ namespace grid_glob_reduce_detail
}
template <class Reductor, class Policy, class SrcPtr, typename ResType, class MaskPtr>
__host__ void glob_reduce(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
__host__ void reduce(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
const dim3 block(Policy::block_size_x, Policy::block_size_y);
const dim3 grid(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));
const int BLOCK_SIZE = Policy::block_size_x * Policy::block_size_y;
glob_reduce<Reductor, BLOCK_SIZE, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
reduce<Reductor, Policy::block_size_x * Policy::block_size_y, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
if (stream == 0)
@ -433,40 +431,33 @@ namespace grid_glob_reduce_detail
__host__ void sum(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
typedef typename PtrTraits<SrcPtr>::value_type src_type;
const int cn = VecTraits<src_type>::cn;
typedef typename MakeVec<ResType, cn>::type work_type;
typedef typename VecTraits<ResType>::elem_type res_elem_type;
glob_reduce<SumReductor<src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
reduce<SumReductor<src_type, ResType>, Policy>(src, (res_elem_type*) result, mask, rows, cols, stream);
}
template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
__host__ void minVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
typedef typename PtrTraits<SrcPtr>::value_type src_type;
const int cn = VecTraits<src_type>::cn;
typedef typename MakeVec<ResType, cn>::type work_type;
glob_reduce<MinMaxReductor<minop<work_type>, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
reduce<MinMaxReductor<minop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
}
template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
__host__ void maxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
typedef typename PtrTraits<SrcPtr>::value_type src_type;
const int cn = VecTraits<src_type>::cn;
typedef typename MakeVec<ResType, cn>::type work_type;
glob_reduce<MinMaxReductor<maxop<work_type>, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
reduce<MinMaxReductor<maxop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
}
template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
__host__ void minMaxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
typedef typename PtrTraits<SrcPtr>::value_type src_type;
const int cn = VecTraits<src_type>::cn;
typedef typename MakeVec<ResType, cn>::type work_type;
glob_reduce<MinMaxReductor<both, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
reduce<MinMaxReductor<both, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
}
}

@ -54,12 +54,52 @@ namespace cv { namespace cudev {
namespace grid_reduce_to_vec_detail
{
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor, int cn> struct Reduce;
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 1>
{
__device__ __forceinline__ static void call(work_elem_type smem[1][BLOCK_SIZE], work_type& myVal)
{
typename Reductor::template rebind<work_elem_type>::other op;
blockReduce<BLOCK_SIZE>(smem[0], myVal, threadIdx.x, op);
}
};
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 2>
{
__device__ __forceinline__ static void call(work_elem_type smem[2][BLOCK_SIZE], work_type& myVal)
{
typename Reductor::template rebind<work_elem_type>::other op;
blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1]), tie(myVal.x, myVal.y), threadIdx.x, make_tuple(op, op));
}
};
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 3>
{
__device__ __forceinline__ static void call(work_elem_type smem[3][BLOCK_SIZE], work_type& myVal)
{
typename Reductor::template rebind<work_elem_type>::other op;
blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1], smem[2]), tie(myVal.x, myVal.y, myVal.z), threadIdx.x, make_tuple(op, op, op));
}
};
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 4>
{
__device__ __forceinline__ static void call(work_elem_type smem[4][BLOCK_SIZE], work_type& myVal)
{
typename Reductor::template rebind<work_elem_type>::other op;
blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1], smem[2], smem[3]), tie(myVal.x, myVal.y, myVal.z, myVal.w), threadIdx.x, make_tuple(op, op, op, op));
}
};
template <class Reductor, int BLOCK_SIZE, class SrcPtr, typename ResType, class MaskPtr>
__global__ void reduceToColumn(const SrcPtr src, ResType* dst, const MaskPtr mask, const int cols)
{
typedef typename Reductor::work_type work_type;
typedef typename VecTraits<work_type>::elem_type work_elem_type;
const int cn = VecTraits<work_type>::cn;
__shared__ work_type smem[BLOCK_SIZE];
__shared__ work_elem_type smem[cn][BLOCK_SIZE];
const int y = blockIdx.x;
@ -75,7 +115,7 @@ namespace grid_reduce_to_vec_detail
}
}
blockReduce<BLOCK_SIZE>(smem, myVal, threadIdx.x, op);
Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, cn>::call(smem, myVal);
if (threadIdx.x == 0)
dst[y] = saturate_cast<ResType>(Reductor::result(myVal, cols));

@ -217,7 +217,7 @@ namespace grid_transform_detail
}
template <int SHIFT, typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
__global__ void transformSmart(const GlobPtr<SrcType1> src1_, const GlobPtr<SrcType2> src2_, PtrStep<DstType> dst_, const BinOp op, const MaskPtr mask, const int rows, const int cols)
__global__ void transformSmart(const GlobPtr<SrcType1> src1_, const GlobPtr<SrcType2> src2_, GlobPtr<DstType> dst_, const BinOp op, const MaskPtr mask, const int rows, const int cols)
{
typedef typename MakeVec<SrcType1, SHIFT>::type read_type1;
typedef typename MakeVec<SrcType2, SHIFT>::type read_type2;
@ -345,25 +345,25 @@ namespace grid_transform_detail
};
template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
__host__ void transform(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
__host__ void transform_unary(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
TransformDispatcher<false, Policy>::call(src, dst, op, mask, rows, cols, stream);
}
template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
__host__ void transform(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
__host__ void transform_binary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
TransformDispatcher<false, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
}
template <class Policy, typename SrcType, typename DstType, class UnOp, class MaskPtr>
__host__ void transform(const GlobPtr<SrcType>& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
__host__ void transform_unary(const GlobPtr<SrcType>& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
TransformDispatcher<VecTraits<SrcType>::cn == 1 && VecTraits<DstType>::cn == 1 && Policy::shift != 1, Policy>::call(src, dst, op, mask, rows, cols, stream);
}
template <class Policy, typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
__host__ void transform(const GlobPtr<SrcType1>& src1, const GlobPtr<SrcType2>& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
__host__ void transform_binary(const GlobPtr<SrcType1>& src1, const GlobPtr<SrcType2>& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
TransformDispatcher<VecTraits<SrcType1>::cn == 1 && VecTraits<SrcType2>::cn == 1 && VecTraits<DstType>::cn == 1 && Policy::shift != 1, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
}

@ -55,15 +55,12 @@ namespace cv { namespace cudev {
namespace transpose_detail
{
const int TRANSPOSE_TILE_DIM = 16;
const int TRANSPOSE_BLOCK_ROWS = 16;
template <class SrcPtr, typename DstType>
template <int TILE_DIM, int BLOCK_DIM_Y, class SrcPtr, typename DstType>
__global__ void transpose(const SrcPtr src, GlobPtr<DstType> dst, const int rows, const int cols)
{
typedef typename PtrTraits<SrcPtr>::value_type src_type;
__shared__ src_type tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
__shared__ src_type tile[TILE_DIM][TILE_DIM + 1];
int blockIdx_x, blockIdx_y;
@ -80,12 +77,12 @@ namespace transpose_detail
blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
}
int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
if (xIndex < cols)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
{
if (yIndex + i < rows)
{
@ -96,12 +93,12 @@ namespace transpose_detail
__syncthreads();
xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
if (xIndex < rows)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
{
if (yIndex + i < cols)
{
@ -111,13 +108,13 @@ namespace transpose_detail
}
}
template <class SrcPtr, typename DstType>
template <class Policy, class SrcPtr, typename DstType>
__host__ void transpose(const SrcPtr& src, const GlobPtr<DstType>& dst, int rows, int cols, cudaStream_t stream)
{
const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
const dim3 block(Policy::tile_dim, Policy::block_dim_y);
const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
transpose<<<grid, block, 0, stream>>>(src, dst, rows, cols);
transpose<Policy::tile_dim, Policy::block_dim_y><<<grid, block, 0, stream>>>(src, dst, rows, cols);
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
if (stream == 0)

@ -43,8 +43,8 @@
#pragma once
#ifndef __OPENCV_CUDEV_GRID_GLOB_REDUCE_HPP__
#define __OPENCV_CUDEV_GRID_GLOB_REDUCE_HPP__
#ifndef __OPENCV_CUDEV_GRID_REDUCE_HPP__
#define __OPENCV_CUDEV_GRID_REDUCE_HPP__
#include <limits>
#include "../common.hpp"
@ -52,13 +52,18 @@
#include "../ptr2d/gpumat.hpp"
#include "../ptr2d/mask.hpp"
#include "../ptr2d/transform.hpp"
#include "detail/glob_reduce.hpp"
#include "detail/reduce.hpp"
#include "detail/minmaxloc.hpp"
namespace cv { namespace cudev {
template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
__host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
typedef typename PtrTraits<SrcPtr>::value_type src_type;
CV_StaticAssert( unsigned(VecTraits<src_type>::cn) == unsigned(VecTraits<ResType>::cn), "" );
dst.create(1, 1);
dst.setTo(0, stream);
@ -67,27 +72,31 @@ __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskP
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
grid_glob_reduce_detail::sum<Policy>(shrinkPtr(src),
dst[0],
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
grid_reduce_detail::sum<Policy>(shrinkPtr(src),
dst[0],
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename ResType>
__host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream& stream = Stream::Null())
{
typedef typename PtrTraits<SrcPtr>::value_type src_type;
CV_StaticAssert( unsigned(VecTraits<src_type>::cn) == unsigned(VecTraits<ResType>::cn), "" );
dst.create(1, 1);
dst.setTo(0, stream);
const int rows = getRows(src);
const int cols = getCols(src);
grid_glob_reduce_detail::sum<Policy>(shrinkPtr(src),
dst[0],
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
grid_reduce_detail::sum<Policy>(shrinkPtr(src),
dst[0],
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@ -101,11 +110,11 @@ __host__ void gridFindMinVal_(const SrcPtr& src, GpuMat_<ResType>& dst, const Ma
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
grid_glob_reduce_detail::minVal<Policy>(shrinkPtr(src),
dst[0],
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
grid_reduce_detail::minVal<Policy>(shrinkPtr(src),
dst[0],
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename ResType>
@ -117,11 +126,11 @@ __host__ void gridFindMinVal_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream&
const int rows = getRows(src);
const int cols = getCols(src);
grid_glob_reduce_detail::minVal<Policy>(shrinkPtr(src),
dst[0],
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
grid_reduce_detail::minVal<Policy>(shrinkPtr(src),
dst[0],
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@ -135,11 +144,11 @@ __host__ void gridFindMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, const Ma
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
grid_glob_reduce_detail::maxVal<Policy>(shrinkPtr(src),
dst[0],
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
grid_reduce_detail::maxVal<Policy>(shrinkPtr(src),
dst[0],
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename ResType>
@ -151,11 +160,11 @@ __host__ void gridFindMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream&
const int rows = getRows(src);
const int cols = getCols(src);
grid_glob_reduce_detail::maxVal<Policy>(shrinkPtr(src),
dst[0],
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
grid_reduce_detail::maxVal<Policy>(shrinkPtr(src),
dst[0],
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@ -170,11 +179,11 @@ __host__ void gridFindMinMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, const
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
grid_glob_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
dst[0],
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
grid_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
dst[0],
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename ResType>
@ -187,11 +196,51 @@ __host__ void gridFindMinMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, Strea
const int rows = getRows(src);
const int cols = getCols(src);
grid_glob_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
dst[0],
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
grid_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
dst[0],
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
__host__ void gridMinMaxLoc_(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, const MaskPtr& mask, Stream& stream = Stream::Null())
{
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
dim3 grid, block;
grid_minmaxloc_detail::getLaunchCfg<Policy>(rows, cols, block, grid);
valBuf.create(2, grid.x * grid.y);
locBuf.create(2, grid.x * grid.y);
grid_minmaxloc_detail::minMaxLoc<Policy>(shrinkPtr(src),
valBuf[0], valBuf[1], locBuf[0], locBuf[1],
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename ResType>
__host__ void gridMinMaxLoc_(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, Stream& stream = Stream::Null())
{
const int rows = getRows(src);
const int cols = getCols(src);
dim3 grid, block;
grid_minmaxloc_detail::getLaunchCfg<Policy>(rows, cols, block, grid);
valBuf.create(2, grid.x * grid.y);
locBuf.create(2, grid.x * grid.y);
grid_minmaxloc_detail::minMaxLoc<Policy>(shrinkPtr(src),
valBuf[0], valBuf[1], locBuf[0], locBuf[1],
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@ -209,11 +258,11 @@ __host__ void gridCountNonZero_(const SrcPtr& src, GpuMat_<ResType>& dst, const
not_equal_to<src_type> ne_op;
const src_type zero = VecTraits<src_type>::all(0);
grid_glob_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
dst[0],
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
grid_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
dst[0],
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename ResType>
@ -229,11 +278,11 @@ __host__ void gridCountNonZero_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream
not_equal_to<src_type> ne_op;
const src_type zero = VecTraits<src_type>::all(0);
grid_glob_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
dst[0],
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
grid_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
dst[0],
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
}
// default policy
@ -297,6 +346,18 @@ __host__ void gridFindMinMaxVal(const SrcPtr& src, GpuMat_<ResType>& dst, Stream
gridFindMinMaxVal_<DefaultGlobReducePolicy>(src, dst, stream);
}
template <class SrcPtr, typename ResType, class MaskPtr>
__host__ void gridMinMaxLoc(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridMinMaxLoc_<DefaultGlobReducePolicy>(src, valBuf, locBuf, mask, stream);
}
template <class SrcPtr, typename ResType>
__host__ void gridMinMaxLoc(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, Stream& stream = Stream::Null())
{
gridMinMaxLoc_<DefaultGlobReducePolicy>(src, valBuf, locBuf, stream);
}
template <class SrcPtr, typename ResType, class MaskPtr>
__host__ void gridCountNonZero(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{

@ -49,6 +49,7 @@
#include "../common.hpp"
#include "../util/vec_traits.hpp"
#include "../util/limits.hpp"
#include "../util/saturate_cast.hpp"
#include "../ptr2d/traits.hpp"
#include "../ptr2d/gpumat.hpp"
#include "../ptr2d/mask.hpp"
@ -62,6 +63,11 @@ template <typename T> struct Sum : plus<T>
{
typedef T work_type;
template <typename U> struct rebind
{
typedef Sum<U> other;
};
__device__ __forceinline__ static T initialValue()
{
return VecTraits<T>::all(0);
@ -77,14 +83,19 @@ template <typename T> struct Avg : plus<T>
{
typedef T work_type;
template <typename U> struct rebind
{
typedef Avg<U> other;
};
__device__ __forceinline__ static T initialValue()
{
return VecTraits<T>::all(0);
}
__device__ __forceinline__ static T result(T r, int sz)
__device__ __forceinline__ static T result(T r, float sz)
{
return r / sz;
return saturate_cast<T>(r / sz);
}
};
@ -92,6 +103,11 @@ template <typename T> struct Min : minimum<T>
{
typedef T work_type;
template <typename U> struct rebind
{
typedef Min<U> other;
};
__device__ __forceinline__ static T initialValue()
{
return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
@ -107,6 +123,11 @@ template <typename T> struct Max : maximum<T>
{
typedef T work_type;
template <typename U> struct rebind
{
typedef Max<U> other;
};
__device__ __forceinline__ static T initialValue()
{
return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
@ -158,7 +179,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_<ResType>& dst, cons
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
createContinuous(rows, 1, DataType<ResType>::type, dst);
dst.create(1, rows);
grid_reduce_to_vec_detail::reduceToColumn<Reductor, Policy>(shrinkPtr(src),
dst[0],
@ -173,7 +194,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_<ResType>& dst, Stre
const int rows = getRows(src);
const int cols = getCols(src);
createContinuous(rows, 1, DataType<ResType>::type, dst);
dst.create(1, rows);
grid_reduce_to_vec_detail::reduceToColumn<Reductor, Policy>(shrinkPtr(src),
dst[0],

@ -51,6 +51,7 @@
#include "../util/vec_traits.hpp"
#include "../ptr2d/traits.hpp"
#include "../ptr2d/gpumat.hpp"
#include "../ptr2d/glob.hpp"
#include "../ptr2d/mask.hpp"
#include "detail/split_merge.hpp"
@ -75,6 +76,24 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_<DstType>& dst, const Ma
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtrTuple, typename DstType, class MaskPtr>
__host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
CV_StaticAssert( VecTraits<DstType>::cn == tuple_size<SrcPtrTuple>::value, "" );
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(dst) == rows && getCols(dst) == cols );
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
grid_split_merge_detail::MergeImpl<VecTraits<DstType>::cn, Policy>::merge(shrinkPtr(src),
shrinkPtr(dst),
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtrTuple, typename DstType>
__host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
@ -92,6 +111,23 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_<DstType>& dst, Stream&
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtrTuple, typename DstType>
__host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
{
CV_StaticAssert( VecTraits<DstType>::cn == tuple_size<SrcPtrTuple>::value, "" );
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(dst) == rows && getCols(dst) == cols );
grid_split_merge_detail::MergeImpl<VecTraits<DstType>::cn, Policy>::merge(shrinkPtr(src),
shrinkPtr(dst),
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -132,6 +168,25 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[2], const Ma
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[2], const MaskPtr& mask, Stream& stream = Stream::Null())
{
CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 2, "" );
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
grid_split_merge_detail::split<Policy>(shrinkPtr(src),
shrinkPtr(dst[0]), shrinkPtr(dst[1]),
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType>
__host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, Stream& stream = Stream::Null())
{
@ -168,6 +223,24 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[2], Stream&
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[2], Stream& stream = Stream::Null())
{
CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 2, "" );
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
grid_split_merge_detail::split<Policy>(shrinkPtr(src),
shrinkPtr(dst[0]), shrinkPtr(dst[1]),
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -210,6 +283,26 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[3], const Ma
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[3], const MaskPtr& mask, Stream& stream = Stream::Null())
{
CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 3, "" );
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
grid_split_merge_detail::split<Policy>(shrinkPtr(src),
shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]),
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType>
__host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, Stream& stream = Stream::Null())
{
@ -248,6 +341,25 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[3], Stream&
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[3], Stream& stream = Stream::Null())
{
CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 3, "" );
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );
grid_split_merge_detail::split<Policy>(shrinkPtr(src),
shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]),
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -283,10 +395,31 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[4], const Ma
dst[0].create(rows, cols);
dst[1].create(rows, cols);
dst[2].create(rows, cols);
dst[4].create(rows, cols);
dst[3].create(rows, cols);
grid_split_merge_detail::split<Policy>(shrinkPtr(src),
shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[4]),
shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[4], const MaskPtr& mask, Stream& stream = Stream::Null())
{
CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 4, "" );
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );
CV_Assert( getRows(dst[3]) == rows && getCols(dst[3]) == cols );
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
grid_split_merge_detail::split<Policy>(shrinkPtr(src),
shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
shrinkPtr(mask),
rows, cols,
StreamAccessor::getStream(stream));
@ -323,10 +456,30 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[4], Stream&
dst[0].create(rows, cols);
dst[1].create(rows, cols);
dst[2].create(rows, cols);
dst[4].create(rows, cols);
dst[3].create(rows, cols);
grid_split_merge_detail::split<Policy>(shrinkPtr(src),
shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType>
__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[4], Stream& stream = Stream::Null())
{
CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 4, "" );
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );
CV_Assert( getRows(dst[3]) == rows && getCols(dst[3]) == cols );
grid_split_merge_detail::split<Policy>(shrinkPtr(src),
shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[4]),
shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
WithOutMask(),
rows, cols,
StreamAccessor::getStream(stream));
@ -348,12 +501,24 @@ __host__ void gridMerge(const SrcPtrTuple& src, GpuMat_<DstType>& dst, const Mas
gridMerge_<DefaultSplitMergePolicy>(src, dst, mask, stream);
}
template <class SrcPtrTuple, typename DstType, class MaskPtr>
__host__ void gridMerge(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridMerge_<DefaultSplitMergePolicy>(src, dst, mask, stream);
}
template <class SrcPtrTuple, typename DstType>
__host__ void gridMerge(const SrcPtrTuple& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
gridMerge_<DefaultSplitMergePolicy>(src, dst, stream);
}
template <class SrcPtrTuple, typename DstType>
__host__ void gridMerge(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
{
gridMerge_<DefaultSplitMergePolicy>(src, dst, stream);
}
template <class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridSplit(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -396,12 +561,24 @@ __host__ void gridSplit(const SrcPtr& src, GpuMat_<DstType> (&dst)[COUNT], const
gridSplit_<DefaultSplitMergePolicy>(src, dst, mask, stream);
}
template <class SrcPtr, typename DstType, int COUNT, class MaskPtr>
__host__ void gridSplit(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[COUNT], const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridSplit_<DefaultSplitMergePolicy>(src, dst, mask, stream);
}
template <class SrcPtr, typename DstType, int COUNT>
__host__ void gridSplit(const SrcPtr& src, GpuMat_<DstType> (&dst)[COUNT], Stream& stream = Stream::Null())
{
gridSplit_<DefaultSplitMergePolicy>(src, dst, stream);
}
template <class SrcPtr, typename DstType, int COUNT>
__host__ void gridSplit(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[COUNT], Stream& stream = Stream::Null())
{
gridSplit_<DefaultSplitMergePolicy>(src, dst, stream);
}
}}
#endif

@ -58,7 +58,7 @@
namespace cv { namespace cudev {
template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformUnary_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
const int rows = getRows(src);
const int cols = getCols(src);
@ -67,11 +67,11 @@ __host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnO
dst.create(rows, cols);
grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformUnary_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
const int rows = getRows(src);
const int cols = getCols(src);
@ -79,33 +79,33 @@ __host__ void gridTransform_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, c
CV_Assert( getRows(dst) == rows && getCols(dst) == cols );
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType, class UnOp>
__host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
__host__ void gridTransformUnary_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
{
const int rows = getRows(src);
const int cols = getCols(src);
dst.create(rows, cols);
grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType, class UnOp>
__host__ void gridTransform_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
__host__ void gridTransformUnary_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
{
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(dst) == rows && getCols(dst) == cols );
grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
const int rows = getRows(src1);
const int cols = getCols(src1);
@ -115,11 +115,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<D
dst.create(rows, cols);
grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
const int rows = getRows(src1);
const int cols = getCols(src1);
@ -128,11 +128,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, const Glo
CV_Assert( getRows(src2) == rows && getCols(src2) == cols );
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp>
__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
{
const int rows = getRows(src1);
const int cols = getCols(src1);
@ -141,11 +141,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<D
dst.create(rows, cols);
grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp>
__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GlobPtrSz<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
{
const int rows = getRows(src1);
const int cols = getCols(src1);
@ -153,11 +153,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GlobPtrSz
CV_Assert( getRows(dst) == rows && getCols(dst) == cols );
CV_Assert( getRows(src2) == rows && getCols(src2) == cols );
grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );
@ -178,7 +178,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}
template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );
@ -198,7 +198,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
}
template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );
@ -217,7 +217,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}
template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );
@ -236,7 +236,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
}
template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );
@ -258,7 +258,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}
template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );
@ -279,7 +279,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
}
template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );
@ -299,7 +299,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}
template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );
@ -319,7 +319,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
}
template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );
@ -342,7 +342,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}
template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );
@ -364,7 +364,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
}
template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );
@ -385,7 +385,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
}
template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );
@ -417,123 +417,123 @@ struct DefaultTransformPolicy
};
template <class SrcPtr, typename DstType, class Op, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformUnary(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}
template <class SrcPtr, typename DstType, class Op, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformUnary(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}
template <class SrcPtr, typename DstType, class Op>
__host__ void gridTransform(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
__host__ void gridTransformUnary(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, stream);
}
template <class SrcPtr, typename DstType, class Op>
__host__ void gridTransform(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
__host__ void gridTransformUnary(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, stream);
}
template <class SrcPtr1, class SrcPtr2, typename DstType, class Op, class MaskPtr>
__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream);
gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream);
}
template <class SrcPtr1, class SrcPtr2, typename DstType, class Op, class MaskPtr>
__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream);
gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream);
}
template <class SrcPtr1, class SrcPtr2, typename DstType, class Op>
__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, stream);
gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, stream);
}
template <class SrcPtr1, class SrcPtr2, typename DstType, class Op>
__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, stream);
gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, stream);
}
template <class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}
template <class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}
template <class SrcPtr, typename D0, typename D1, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}
template <class SrcPtr, typename D0, typename D1, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}
template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}
template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}
template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}
template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}
template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}
template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream);
}
template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}
template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
{
gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}
}}

@ -49,19 +49,53 @@
#include "../common.hpp"
#include "../ptr2d/traits.hpp"
#include "../ptr2d/gpumat.hpp"
#include "../ptr2d/glob.hpp"
#include "detail/transpose.hpp"
namespace cv { namespace cudev {
template <class SrcPtr, typename DstType>
__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
template <class Policy, class SrcPtr, typename DstType>
__host__ void gridTranspose_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
const int rows = getRows(src);
const int cols = getCols(src);
dst.create(cols, rows);
transpose_detail::transpose(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType>
__host__ void gridTranspose_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
{
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(dst) == cols && getCols(dst) == rows );
transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
}
// Default Policy
struct DefaultTransposePolicy
{
enum {
tile_dim = 16,
block_dim_y = 16
};
};
template <class SrcPtr, typename DstType>
__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
}
template <class SrcPtr, typename DstType>
__host__ void gridTranspose(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
{
gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
}
}}

@ -47,6 +47,7 @@
#define __OPENCV_CUDEV_PTR2D_LUT_HPP__
#include "../common.hpp"
#include "../util/vec_traits.hpp"
#include "../grid/copy.hpp"
#include "traits.hpp"
#include "gpumat.hpp"
@ -63,7 +64,8 @@ template <class SrcPtr, class TablePtr> struct LutPtr
__device__ __forceinline__ typename PtrTraits<TablePtr>::value_type operator ()(typename PtrTraits<SrcPtr>::index_type y, typename PtrTraits<SrcPtr>::index_type x) const
{
return tbl(0, src(y, x));
typedef typename PtrTraits<TablePtr>::index_type tbl_index_type;
return tbl(VecTraits<tbl_index_type>::all(0), src(y, x));
}
};
@ -81,8 +83,6 @@ template <class SrcPtr, class TablePtr> struct LutPtrSz : LutPtr<SrcPtr, TablePt
template <class SrcPtr, class TablePtr>
__host__ LutPtrSz<typename PtrTraits<SrcPtr>::ptr_type, typename PtrTraits<TablePtr>::ptr_type> lutPtr(const SrcPtr& src, const TablePtr& tbl)
{
CV_Assert( getRows(tbl) == 1 );
LutPtrSz<typename PtrTraits<SrcPtr>::ptr_type, typename PtrTraits<TablePtr>::ptr_type> ptr;
ptr.src = shrinkPtr(src);
ptr.tbl = shrinkPtr(tbl);

@ -62,6 +62,42 @@ struct WithOutMask
}
};
template <class MaskPtr> struct SingleMaskChannels
{
typedef typename PtrTraits<MaskPtr>::value_type value_type;
typedef typename PtrTraits<MaskPtr>::index_type index_type;
MaskPtr mask;
int channels;
__device__ __forceinline__ value_type operator()(index_type y, index_type x) const
{
return mask(y, x / channels);
}
};
template <class MaskPtr> struct SingleMaskChannelsSz : SingleMaskChannels<MaskPtr>
{
int rows, cols;
};
template <class MaskPtr>
__host__ SingleMaskChannelsSz<typename PtrTraits<MaskPtr>::ptr_type>
singleMaskChannels(const MaskPtr& mask, int channels)
{
SingleMaskChannelsSz<typename PtrTraits<MaskPtr>::ptr_type> ptr;
ptr.mask = shrinkPtr(mask);
ptr.channels = channels;
ptr.rows = getRows(mask);
ptr.cols = getCols(mask) * channels;
return ptr;
}
template <class MaskPtr> struct PtrTraits< SingleMaskChannelsSz<MaskPtr> > : PtrTraitsBase<SingleMaskChannelsSz<MaskPtr>, SingleMaskChannels<MaskPtr> >
{
};
}}
#endif

@ -194,10 +194,23 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
return VecTraits<output_type ## 4>::make(func (a.x), func (a.y), func (a.z), func (a.w)); \
}
namespace vec_math_detail
{
__device__ __forceinline__ schar abs_(schar val)
{
return (schar) ::abs((int) val);
}
__device__ __forceinline__ short abs_(short val)
{
return (short) ::abs((int) val);
}
}
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, char, char)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, char, char)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, ushort, ushort)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, short, short)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, short, short)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, int, int)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uint, uint)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabsf, float, float)

@ -70,7 +70,7 @@ CV_CUDEV_MAKE_VEC_INST(double)
#undef CV_CUDEV_MAKE_VEC_INST
template<> struct MakeVec<schar, 1> { typedef char type; };
template<> struct MakeVec<schar, 1> { typedef schar type; };
template<> struct MakeVec<schar, 2> { typedef char2 type; };
template<> struct MakeVec<schar, 3> { typedef char3 type; };
template<> struct MakeVec<schar, 4> { typedef char4 type; };

@ -228,6 +228,9 @@ TEST(ReduceToColumn, Sum)
Mat dst_gold;
cv::reduce(src, dst_gold, 1, REDUCE_SUM, CV_32S);
dst_gold.cols = dst_gold.rows;
dst_gold.rows = 1;
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
@ -244,6 +247,9 @@ TEST(ReduceToColumn, Avg)
Mat dst_gold;
cv::reduce(src, dst_gold, 1, REDUCE_AVG, CV_32F);
dst_gold.cols = dst_gold.rows;
dst_gold.rows = 1;
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
EXPECT_MAT_NEAR(dst_gold, dst, 1e-4);
}
@ -260,6 +266,9 @@ TEST(ReduceToColumn, Min)
Mat dst_gold;
cv::reduce(src, dst_gold, 1, REDUCE_MIN);
dst_gold.cols = dst_gold.rows;
dst_gold.rows = 1;
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
@ -276,6 +285,9 @@ TEST(ReduceToColumn, Max)
Mat dst_gold;
cv::reduce(src, dst_gold, 1, REDUCE_MAX);
dst_gold.cols = dst_gold.rows;
dst_gold.rows = 1;
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}

Loading…
Cancel
Save