From 9c5da2ea22ebcece572c700d738e8934a86ebbd9 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Fri, 23 Aug 2013 18:28:13 +0400
Subject: [PATCH] used new device layer for cv::gpu::add

---
 modules/cudaarithm/CMakeLists.txt             |   2 +-
 modules/cudaarithm/src/cuda/add_mat.cu        | 254 ++++++++++--------
 modules/cudaarithm/src/cuda/add_scalar.cu     | 200 ++++++++------
 modules/cudaarithm/src/element_operations.cpp | 243 +----------------
 .../opencv2/cudev/grid/detail/integral.hpp    |   4 +-
 5 files changed, 268 insertions(+), 435 deletions(-)

diff --git a/modules/cudaarithm/CMakeLists.txt b/modules/cudaarithm/CMakeLists.txt
index 67a7ff922f..e676fa6735 100644
--- a/modules/cudaarithm/CMakeLists.txt
+++ b/modules/cudaarithm/CMakeLists.txt
@@ -6,7 +6,7 @@ set(the_description "CUDA-accelerated Operations on Matrices")
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
 
-ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudalegacy)
+ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudev opencv_cudalegacy)
 
 ocv_module_include_directories()
 
 ocv_glob_module_sources()
diff --git a/modules/cudaarithm/src/cuda/add_mat.cu b/modules/cudaarithm/src/cuda/add_mat.cu
index 1270438208..6e7a7925fd 100644
--- a/modules/cudaarithm/src/cuda/add_mat.cu
+++ b/modules/cudaarithm/src/cuda/add_mat.cu
@@ -40,146 +40,186 @@
 //
 //M*/
 
-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
 
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
+#ifndef HAVE_OPENCV_CUDEV
 
-#include "arithm_func_traits.hpp"
+#error "opencv_cudev is required"
 
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else
 
-namespace arithm
+#include "opencv2/cudev.hpp"
+
+using namespace cv::cudev;
+
+void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int);
+
+namespace
 {
-    struct VAdd4 : binary_function<uint, uint, uint>
+    template <typename T, typename D> struct AddOp1 : binary_function<T, T, D>
     {
-        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        __device__ __forceinline__ D operator ()(T a, T b) const
         {
-            return vadd4(a, b);
+            return saturate_cast<D>(a + b);
         }
-
-        __host__ __device__ __forceinline__ VAdd4() {}
-        __host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
     };
 
-    struct VAdd2 : binary_function<uint, uint, uint>
+    template <typename T, typename D>
+    void addMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
+    {
+        if (mask.data)
+            gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), AddOp1<T, D>(), globPtr<uchar>(mask), stream);
+        else
+            gridTransformBinary(globPtr<T>(src1), globPtr<T>(src2), globPtr<D>(dst), AddOp1<T, D>(), stream);
+    }
+
+    struct AddOp2 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
             return vadd2(a, b);
         }
-
-        __host__ __device__ __forceinline__ VAdd2() {}
-        __host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
     };
 
-    template <typename T, typename D> struct AddMat : binary_function<T, T, D>
+    void addMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
     {
-        __device__ __forceinline__ D operator ()(T a, T b) const
+        const int vcols = src1.cols >> 1;
+
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
+
+        gridTransformBinary(src1_, src2_, dst_, AddOp2(), stream);
+    }
+
+    struct AddOp4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            return saturate_cast<D>(a + b);
+            return vadd4(a, b);
         }
-
-        __host__ __device__ __forceinline__ AddMat() {}
-        __host__ __device__ __forceinline__ AddMat(const AddMat&) {}
     };
+
+    void addMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+    {
+        const int vcols = src1.cols >> 2;
+
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
+
+        gridTransformBinary(src1_, src2_, dst_, AddOp4(), stream);
+    }
 }
 
-namespace cv { namespace cuda { namespace device
+void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int)
 {
-    template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream);
+    static const func_t funcs[7][7] =
     {
+        {
+            addMat_v1<uchar, uchar>,
+            addMat_v1<uchar, schar>,
+            addMat_v1<uchar, ushort>,
+            addMat_v1<uchar, short>,
+            addMat_v1<uchar, int>,
+            addMat_v1<uchar, float>,
+            addMat_v1<uchar, double>
+        },
+        {
+            addMat_v1<schar, uchar>,
+            addMat_v1<schar, schar>,
+            addMat_v1<schar, ushort>,
+            addMat_v1<schar, short>,
+            addMat_v1<schar, int>,
+            addMat_v1<schar, float>,
+            addMat_v1<schar, double>
+        },
+        {
+            0 /*addMat_v1<ushort, uchar>*/,
+            0 /*addMat_v1<ushort, schar>*/,
+            addMat_v1<ushort, ushort>,
+            addMat_v1<ushort, short>,
+            addMat_v1<ushort, int>,
+            addMat_v1<ushort, float>,
+            addMat_v1<ushort, double>
+        },
+        {
+            0 /*addMat_v1<short, uchar>*/,
+            0 /*addMat_v1<short, schar>*/,
+            addMat_v1<short, ushort>,
+            addMat_v1<short, short>,
+            addMat_v1<short, int>,
+            addMat_v1<short, float>,
+            addMat_v1<short, double>
+        },
+        {
+            0 /*addMat_v1<int, uchar>*/,
+            0 /*addMat_v1<int, schar>*/,
+            0 /*addMat_v1<int, ushort>*/,
+            0 /*addMat_v1<int, short>*/,
+            addMat_v1<int, int>,
+            addMat_v1<int, float>,
+            addMat_v1<int, double>
+        },
+        {
+            0 /*addMat_v1<float, uchar>*/,
+            0 /*addMat_v1<float, schar>*/,
+            0 /*addMat_v1<float, ushort>*/,
+            0 /*addMat_v1<float, short>*/,
+            0 /*addMat_v1<float, int>*/,
+            addMat_v1<float, float>,
+            addMat_v1<float, double>
+        },
+        {
+            0 /*addMat_v1<double, uchar>*/,
+            0 /*addMat_v1<double, schar>*/,
+            0 /*addMat_v1<double, ushort>*/,
+            0 /*addMat_v1<double, short>*/,
+            0 /*addMat_v1<double, int>*/,
+            0 /*addMat_v1<double, float>*/,
+            addMat_v1<double, double>
+        }
     };
 
-    template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
-    {
-    };
+    const int sdepth = src1.depth();
+    const int ddepth = dst.depth();
 
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::AddMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
-    {
-    };
-}}}
+    CV_DbgAssert( sdepth < 7 && ddepth < 7 );
 
-namespace arithm
-{
-    void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
-    {
-        device::transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
-    }
+    GpuMat src1_ = src1.reshape(1);
+    GpuMat src2_ = src2.reshape(1);
+    GpuMat dst_ = dst.reshape(1);
 
-    void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
     {
-        device::transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
-    }
+        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
 
-    template <typename T, typename D>
-    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
-    {
-        if (mask.data)
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream);
-        else
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream);
+        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
+
+        if (isAllAligned)
+        {
+            if (sdepth == CV_8U && (src1_.cols & 3) == 0)
+            {
+                addMat_v4(src1_, src2_, dst_, stream);
+                return;
+            }
+            else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
+            {
+                addMat_v2(src1_, src2_, dst_, stream);
+                return;
+            }
+        }
     }
 
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    const func_t func = funcs[sdepth][ddepth];
+
+    if (!func)
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+    func(src1_, src2_, dst_, mask, stream);
 }
 
-#endif // CUDA_DISABLER
+#endif
diff --git a/modules/cudaarithm/src/cuda/add_scalar.cu b/modules/cudaarithm/src/cuda/add_scalar.cu
index 680061be11..e0788e9bdd 100644
--- a/modules/cudaarithm/src/cuda/add_scalar.cu
+++ b/modules/cudaarithm/src/cuda/add_scalar.cu
@@ -40,109 +40,141 @@
 //
 //M*/
 
-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
 
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
+#ifndef HAVE_OPENCV_CUDEV
 
-#include "arithm_func_traits.hpp"
+#error "opencv_cudev is required"
 
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else
 
-namespace arithm
+#include "opencv2/cudev.hpp"
+
+using namespace cv::cudev;
+
+void addScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int);
+
+namespace
 {
-    template <typename T, typename S, typename D> struct AddScalar : unary_function<T, D>
+    template <typename SrcType, typename ScalarType, typename DstType> struct AddScalarOp : unary_function<SrcType, DstType>
     {
-        S val;
-
-        __host__ explicit AddScalar(S val_) : val(val_) {}
+        ScalarType val;
 
-        __device__ __forceinline__ D operator ()(T a) const
+        __device__ __forceinline__ DstType operator ()(SrcType a) const
         {
-            return saturate_cast<D>(a + val);
+            return saturate_cast<DstType>(saturate_cast<ScalarType>(a) + val);
         }
     };
-}
 
-namespace cv { namespace cuda { namespace device
-{
-    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::AddScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
+    {
+    };
+    template <> struct TransformPolicy<double> : DefaultTransformPolicy
     {
+        enum {
+            shift = 1
+        };
     };
-}}}
 
-namespace arithm
-{
-    template <typename T, typename S, typename D>
-    void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    template <typename SrcType, typename ScalarDepth, typename DstType>
+    void addScalarImpl(const GpuMat& src, cv::Scalar value, GpuMat& dst, const GpuMat& mask, Stream& stream)
     {
-        AddScalar<T, S, D> op(static_cast<S>(val));
+        typedef typename MakeVec<ScalarDepth, VecTraits<SrcType>::cn>::type ScalarType;
+
+        cv::Scalar_<ScalarDepth> value_ = value;
+
+        AddScalarOp<SrcType, ScalarType, DstType> op;
+        op.val = VecTraits<ScalarType>::make(value_.val);
 
         if (mask.data)
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
+            gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, globPtr<uchar>(mask), stream);
         else
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+            gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<DstType>(dst), op, stream);
     }
+}
+
+void addScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int)
+{
+    typedef void (*func_t)(const GpuMat& src, cv::Scalar val, GpuMat& dst, const GpuMat& mask, Stream& stream);
+    static const func_t funcs[7][7][4] =
+    {
+        {
+            {addScalarImpl<uchar, float, uchar>, addScalarImpl<uchar2, float, uchar2>, addScalarImpl<uchar3, float, uchar3>, addScalarImpl<uchar4, float, uchar4>},
+            {addScalarImpl<uchar, float, schar>, addScalarImpl<uchar2, float, char2>, addScalarImpl<uchar3, float, char3>, addScalarImpl<uchar4, float, char4>},
+            {addScalarImpl<uchar, float, ushort>, addScalarImpl<uchar2, float, ushort2>, addScalarImpl<uchar3, float, ushort3>, addScalarImpl<uchar4, float, ushort4>},
+            {addScalarImpl<uchar, float, short>, addScalarImpl<uchar2, float, short2>, addScalarImpl<uchar3, float, short3>, addScalarImpl<uchar4, float, short4>},
+            {addScalarImpl<uchar, float, int>, addScalarImpl<uchar2, float, int2>, addScalarImpl<uchar3, float, int3>, addScalarImpl<uchar4, float, int4>},
+            {addScalarImpl<uchar, float, float>, addScalarImpl<uchar2, float, float2>, addScalarImpl<uchar3, float, float3>, addScalarImpl<uchar4, float, float4>},
+            {addScalarImpl<uchar, double, double>, addScalarImpl<uchar2, double, double2>, addScalarImpl<uchar3, double, double3>, addScalarImpl<uchar4, double, double4>}
+        },
+        {
+            {addScalarImpl<schar, float, uchar>, addScalarImpl<char2, float, uchar2>, addScalarImpl<char3, float, uchar3>, addScalarImpl<char4, float, uchar4>},
+            {addScalarImpl<schar, float, schar>, addScalarImpl<char2, float, char2>, addScalarImpl<char3, float, char3>, addScalarImpl<char4, float, char4>},
+            {addScalarImpl<schar, float, ushort>, addScalarImpl<char2, float, ushort2>, addScalarImpl<char3, float, ushort3>, addScalarImpl<char4, float, ushort4>},
+            {addScalarImpl<schar, float, short>, addScalarImpl<char2, float, short2>, addScalarImpl<char3, float, short3>, addScalarImpl<char4, float, short4>},
+            {addScalarImpl<schar, float, int>, addScalarImpl<char2, float, int2>, addScalarImpl<char3, float, int3>, addScalarImpl<char4, float, int4>},
+            {addScalarImpl<schar, float, float>, addScalarImpl<char2, float, float2>, addScalarImpl<char3, float, float3>, addScalarImpl<char4, float, float4>},
+            {addScalarImpl<schar, double, double>, addScalarImpl<char2, double, double2>, addScalarImpl<char3, double, double3>, addScalarImpl<char4, double, double4>}
+        },
+        {
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {addScalarImpl<ushort, float, ushort>, addScalarImpl<ushort2, float, ushort2>, addScalarImpl<ushort3, float, ushort3>, addScalarImpl<ushort4, float, ushort4>},
+            {addScalarImpl<ushort, float, short>, addScalarImpl<ushort2, float, short2>, addScalarImpl<ushort3, float, short3>, addScalarImpl<ushort4, float, short4>},
+            {addScalarImpl<ushort, float, int>, addScalarImpl<ushort2, float, int2>, addScalarImpl<ushort3, float, int3>, addScalarImpl<ushort4, float, int4>},
+            {addScalarImpl<ushort, float, float>, addScalarImpl<ushort2, float, float2>, addScalarImpl<ushort3, float, float3>, addScalarImpl<ushort4, float, float4>},
+            {addScalarImpl<ushort, double, double>, addScalarImpl<ushort2, double, double2>, addScalarImpl<ushort3, double, double3>, addScalarImpl<ushort4, double, double4>}
+        },
+        {
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {addScalarImpl<short, float, ushort>, addScalarImpl<short2, float, ushort2>, addScalarImpl<short3, float, ushort3>, addScalarImpl<short4, float, ushort4>},
+            {addScalarImpl<short, float, short>, addScalarImpl<short2, float, short2>, addScalarImpl<short3, float, short3>, addScalarImpl<short4, float, short4>},
+            {addScalarImpl<short, float, int>, addScalarImpl<short2, float, int2>, addScalarImpl<short3, float, int3>, addScalarImpl<short4, float, int4>},
+            {addScalarImpl<short, float, float>, addScalarImpl<short2, float, float2>, addScalarImpl<short3, float, float3>, addScalarImpl<short4, float, float4>},
+            {addScalarImpl<short, double, double>, addScalarImpl<short2, double, double2>, addScalarImpl<short3, double, double3>, addScalarImpl<short4, double, double4>}
+        },
+        {
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {addScalarImpl<int, float, int>, addScalarImpl<int2, float, int2>, addScalarImpl<int3, float, int3>, addScalarImpl<int4, float, int4>},
+            {addScalarImpl<int, float, float>, addScalarImpl<int2, float, float2>, addScalarImpl<int3, float, float3>, addScalarImpl<int4, float, float4>},
+            {addScalarImpl<int, double, double>, addScalarImpl<int2, double, double2>, addScalarImpl<int3, double, double3>, addScalarImpl<int4, double, double4>}
+        },
+        {
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {addScalarImpl<float, float, float>, addScalarImpl<float2, float, float2>, addScalarImpl<float3, float, float3>, addScalarImpl<float4, float, float4>},
+            {addScalarImpl<float, double, double>, addScalarImpl<float2, double, double2>, addScalarImpl<float3, double, double3>, addScalarImpl<float4, double, double4>}
+        },
+        {
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/},
+            {addScalarImpl<double, double, double>, addScalarImpl<double2, double, double2>, addScalarImpl<double3, double, double3>, addScalarImpl<double4, double, double4>}
+        }
+    };
+
+    const int sdepth = src.depth();
+    const int ddepth = dst.depth();
+    const int cn = src.channels();
+
+    CV_DbgAssert( sdepth < 7 && ddepth < 7 && cn <= 4 );
+
+    const func_t func = funcs[sdepth][ddepth][cn - 1];
+
+    if (!func)
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    func(src, val, dst, mask, stream);
 }
 
-#endif // CUDA_DISABLER
+#endif
diff --git a/modules/cudaarithm/src/element_operations.cpp b/modules/cudaarithm/src/element_operations.cpp
index 715771b671..b3711dcc1d 100644
--- a/modules/cudaarithm/src/element_operations.cpp
+++ b/modules/cudaarithm/src/element_operations.cpp
@@ -336,248 +336,9 @@ namespace
 ////////////////////////////////////////////////////////////////////////
 // add
 
-namespace arithm
-{
-    void addMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
-    void addMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-}
-
-static void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int)
-{
-    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    static const func_t funcs[7][7] =
-    {
-        {
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat
-        },
-        {
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat
-        },
-        {
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat
-        },
-        {
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat
-        },
-        {
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            arithm::addMat,
-            arithm::addMat,
-            arithm::addMat
-        },
-        {
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            arithm::addMat,
-            arithm::addMat
-        },
-        {
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            0 /*arithm::addMat*/,
-            arithm::addMat
-        }
-    };
-
-    const int sdepth = src1.depth();
-    const int ddepth = dst.depth();
-    const int cn = src1.channels();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
-    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
-    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
-
-    if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
-    {
-        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
-        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
-        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
-
-        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
-
-        if (isAllAligned)
-        {
-            if (sdepth == CV_8U && (src1_.cols & 3) == 0)
-            {
-                const int vcols = src1_.cols >> 2;
-
-                arithm::addMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                                  stream);
-
-                return;
-            }
-            else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
-            {
-                const int vcols = src1_.cols >> 1;
-
-                arithm::addMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                                  stream);
-
-                return;
-            }
-        }
-    }
-
-    const func_t func = funcs[sdepth][ddepth];
-
-    if (!func)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src1_, src2_, dst_, mask, stream);
-}
-
-namespace arithm
-{
-    template <typename T, typename S, typename D>
-    void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-}
-
-static void addScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int)
-{
-    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    static const func_t funcs[7][7] =
-    {
-        {
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar
-        },
-        {
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar
-        },
-        {
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar
-        },
-        {
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar
-        },
-        {
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            arithm::addScalar,
-            arithm::addScalar,
-            arithm::addScalar
-        },
-        {
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            arithm::addScalar,
-            arithm::addScalar
-        },
-        {
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            0 /*arithm::addScalar*/,
-            arithm::addScalar
-        }
-    };
-
-    typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
-    static const npp_func_t npp_funcs[7][4] =
-    {
-        {NppArithmScalar::call, 0                     , NppArithmScalar::call, NppArithmScalar::call},
-        {0                     , 0                     , 0                     , 0                     },
-        {NppArithmScalar::call, 0                     , NppArithmScalar::call, NppArithmScalar::call},
-        {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call},
-        {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, 0                     },
-        {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call},
-        {0                     , 0                     , 0                     , 0                     }
-    };
-
-    const int sdepth = src.depth();
-    const int ddepth = dst.depth();
-    const int cn = src.channels();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
-    if (ddepth == sdepth && cn > 1 && npp_func != 0)
-    {
-        npp_func(src, val, dst, stream);
-        return;
-    }
+void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int);
 
-    CV_Assert( cn == 1 );
-
-    const func_t func = funcs[sdepth][ddepth];
-
-    if (!func)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src, val[0], dst, mask, stream);
-}
+void addScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int);
 
 void cv::cuda::add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype, Stream& stream)
 {
diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
index b06d7ddbb6..5c90e99893 100644
--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@@ -594,7 +594,7 @@ namespace integral_detail
         CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
     }
 
-    __host__ static void integral(const GlobPtr<uchar> src, GlobPtr<uint> dst, int rows, int cols, cudaStream_t stream)
+    __host__ static void integral(const GlobPtr<uchar>& src, const GlobPtr<uint>& dst, int rows, int cols, cudaStream_t stream)
     {
         if (deviceSupports(FEATURE_SET_COMPUTE_30) && (cols % 16 == 0)
@@ -614,7 +614,7 @@ namespace integral_detail
         CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
     }
 
-    __host__ static void integral(const GlobPtr<uchar> src, GlobPtr<int> dst, int rows, int cols, cudaStream_t stream)
+    __host__ __forceinline__ void integral(const GlobPtr<uchar>& src, const GlobPtr<int>& dst, int rows, int cols, cudaStream_t stream)
     {
         GlobPtr<uint> dstui = globPtr((uint*) dst.data, dst.step);
         integral(src, dstui, rows, cols, stream);
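
Note on the pattern the patch adopts: the old cv::cuda::device::transform calls are replaced by the opencv_cudev "grid" primitives, where an element-wise operation is written as a device functor and handed to gridTransformBinary together with globPtr views of the inputs, output, and optional mask. The sketch below illustrates that pattern in isolation; it is not part of the patch, and the names SquaredDiffOp and squaredDiffExample are made up for illustration. It assumes a build with the cudev module available, exactly as required by the files above.

    // Minimal sketch of the cudev grid-transform pattern used in this patch (hypothetical example).
    #include "opencv2/cudev.hpp"

    using namespace cv::cudev;

    namespace
    {
        // Hypothetical functor: per-element squared difference, converted to the destination type.
        template <typename T, typename D> struct SquaredDiffOp : binary_function<T, T, D>
        {
            __device__ __forceinline__ D operator ()(T a, T b) const
            {
                const D d = saturate_cast<D>(a) - saturate_cast<D>(b);
                return d * d;
            }
        };
    }

    // Hypothetical host wrapper: CV_8UC1 inputs, CV_32FC1 output, executed on the given stream.
    void squaredDiffExample(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
    {
        dst.create(src1.size(), CV_32FC1);
        gridTransformBinary(globPtr<uchar>(src1), globPtr<uchar>(src2), globPtr<float>(dst), SquaredDiffOp<uchar, float>(), stream);
    }

The dispatch tables in add_mat.cu and add_scalar.cu exist only to pick the right instantiation of such a wrapper for each source/destination depth (and, for the scalar case, channel count) at run time.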