From d00fa6b8176c12a211c842a737039734735d9277 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Wed, 29 Feb 2012 13:02:25 +0000
Subject: [PATCH] improved type dispatching in gpu arithm functions

---
 modules/gpu/src/element_operations.cpp | 225 ++++++++++++-------------
 1 file changed, 107 insertions(+), 118 deletions(-)

diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index a7e5201555..35926f1b9a 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -89,92 +89,74 @@ void cv::gpu::addWeighted(const GpuMat&, double, const GpuMat&, double, double,
 
 namespace
 {
-    typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
-    typedef NppStatus (*npp_arithm_16u_t)(const Npp16u* pSrc1, int nSrc1Step, const Npp16u* pSrc2, int nSrc2Step, Npp16u* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
-    typedef NppStatus (*npp_arithm_16s_t)(const Npp16s* pSrc1, int nSrc1Step, const Npp16s* pSrc2, int nSrc2Step, Npp16s* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
-    typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
-    typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
+    template <int DEPTH> struct NppTypeTraits;
+    template<> struct NppTypeTraits<CV_8U>  { typedef Npp8u npp_t; };
+    template<> struct NppTypeTraits<CV_8S>  { typedef Npp8s npp_t; };
+    template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
+    template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; };
+    template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; };
+    template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };
+    template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };
 
-    bool nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst,
-                         npp_arithm_8u_t npp_func_8uc1, npp_arithm_8u_t npp_func_8uc4,
-                         npp_arithm_16u_t npp_func_16uc1, npp_arithm_16u_t npp_func_16uc4,
-                         npp_arithm_16s_t npp_func_16sc1, npp_arithm_16s_t npp_func_16sc4,
-                         npp_arithm_32s_t npp_func_32sc1,
-                         npp_arithm_32f_t npp_func_32fc1, npp_arithm_32f_t npp_func_32fc4,
-                         cudaStream_t stream)
+    template <int DEPTH> struct NppArithmFunc
     {
-        bool useNpp = (src1.depth() == CV_8U || src1.depth() == CV_16U || src1.depth() == CV_16S || src1.depth() == CV_32S || src1.depth() == CV_32F);
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+
+        typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
+    };
+    template <> struct NppArithmFunc<CV_32F>
+    {
+        typedef NppTypeTraits<CV_32F>::npp_t npp_t;
 
-        if (!useNpp)
-            return false;
+        typedef NppStatus (*func_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
+    };
 
-        bool aligned = isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16);
+    template <int DEPTH, typename NppArithmFunc<DEPTH>::func_t func> struct NppArithm
+    {
+        typedef typename NppArithmFunc<DEPTH>::npp_t npp_t;
 
-        NppiSize sz;
-        sz.width = src1.cols * src1.channels();
-        sz.height = src1.rows;
+        static void call(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)
+        {
+            NppStreamHandler h(stream);
 
-        NppStreamHandler h(stream);
+            NppiSize sz;
+            sz.width = src1.cols;
+            sz.height = src1.rows;
 
-        if (aligned && src1.depth() == CV_8U && (sz.width % 4) == 0)
-        {
-            sz.width /= 4;
+            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step),
+                (npp_t*)dst.data, static_cast<int>(dst.step), sz, 0) );
 
-            nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step),
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, 0) );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
-        else if (src1.depth() == CV_8U)
+        static void call(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream)
         {
-            nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step),
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, 0) );
+            call(src1, src2, dst, PtrStepb(), stream);
         }
-        else if (aligned && src1.depth() == CV_16U && (sz.width % 4) == 0)
-        {
-            sz.width /= 4;
+    };
+    template <typename NppArithmFunc<CV_32F>::func_t func> struct NppArithm<CV_32F, func>
+    {
+        typedef typename NppArithmFunc<CV_32F>::npp_t npp_t;
 
-            nppSafeCall( npp_func_16uc4(src1.ptr<Npp16u>(), static_cast<int>(src1.step), src2.ptr<Npp16u>(), static_cast<int>(src2.step),
-                dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz, 0) );
-        }
-        else if (src1.depth() == CV_16U)
+        static void call(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)
         {
-            nppSafeCall( npp_func_16uc1(src1.ptr<Npp16u>(), static_cast<int>(src1.step), src2.ptr<Npp16u>(), static_cast<int>(src2.step),
-                dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz, 0) );
-        }
-        else if (aligned && src1.depth() == CV_16S && (sz.width % 4) == 0)
-        {
-            sz.width /= 4;
+            NppStreamHandler h(stream);
 
-            nppSafeCall( npp_func_16sc4(src1.ptr<Npp16s>(), static_cast<int>(src1.step), src2.ptr<Npp16s>(), static_cast<int>(src2.step),
-                dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz, 0) );
-        }
-        else if (src1.depth() == CV_16S)
-        {
-            nppSafeCall( npp_func_16sc1(src1.ptr<Npp16s>(), static_cast<int>(src1.step), src2.ptr<Npp16s>(), static_cast<int>(src2.step),
-                dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz, 0) );
-        }
-        else if (src1.depth() == CV_32S)
-        {
-            nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), static_cast<int>(src1.step), src2.ptr<Npp32s>(), static_cast<int>(src2.step),
-                dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz, 0) );
-        }
-        else if (aligned && src1.depth() == CV_32F && (sz.width % 4) == 0)
-        {
-            sz.width /= 4;
+            NppiSize sz;
+            sz.width = src1.cols;
+            sz.height = src1.rows;
+
+            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step),
+                (npp_t*)dst.data, static_cast<int>(dst.step), sz) );
 
-            nppSafeCall( npp_func_32fc4(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step),
-                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
-        else // if (src1.depth() == CV_32F)
+        static void call(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream)
         {
-            nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step),
-                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
+            call(src1, src2, dst, PtrStepb(), stream);
         }
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-        return true;
-    }
+    };
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -206,6 +188,18 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
         {0/*add_gpu<double, unsigned char>*/, 0/*add_gpu<double, signed char>*/, 0/*add_gpu<double, unsigned short>*/, 0/*add_gpu<double, short>*/, 0/*add_gpu<double, int>*/, 0/*add_gpu<double, float>*/, add_gpu<double, double>}
     };
 
+    static const func_t npp_funcs[7] =
+    {
+        NppArithm<CV_8U, nppiAdd_8u_C1RSfs>::call,
+        0,
+        NppArithm<CV_16U, nppiAdd_16u_C1RSfs>::call,
+        NppArithm<CV_16S, nppiAdd_16s_C1RSfs>::call,
+        NppArithm<CV_32S, nppiAdd_32s_C1RSfs>::call,
+        NppArithm<CV_32F, nppiAdd_32f_C1R>::call,
+        add_gpu<double, double>
+    };
+
+    CV_Assert(src1.type() != CV_8S);
     CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
     CV_Assert(mask.empty() || (src1.channels() == 1 && mask.size() == src1.size() && mask.type() == CV_8U));
 
@@ -218,16 +212,8 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
 
     if (mask.empty() && dst.type() == src1.type())
     {
-        if (nppArithmCaller(src1, src2, dst,
-                            nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs,
-                            nppiAdd_16u_C1RSfs, nppiAdd_16u_C4RSfs,
-                            nppiAdd_16s_C1RSfs, nppiAdd_16s_C4RSfs,
-                            nppiAdd_32s_C1RSfs,
-                            nppiAdd_32f_C1R, nppiAdd_32f_C4R,
-                            stream))
-        {
-            return;
-        }
+        npp_funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), PtrStepb(), stream);
+        return;
     }
 
     const func_t func = funcs[src1.depth()][dst.depth()];
@@ -238,15 +224,6 @@
 
 namespace
 {
-    template <int DEPTH> struct NppTypeTraits;
-    template<> struct NppTypeTraits<CV_8U>  { typedef Npp8u npp_t; };
-    template<> struct NppTypeTraits<CV_8S>  { typedef Npp8s npp_t; };
-    template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
-    template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; };
-    template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; };
-    template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };
-    template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };
-
     template <int DEPTH, int cn> struct NppArithmScalarFunc
     {
         typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
@@ -485,6 +462,18 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
         {0/*subtract_gpu<double, unsigned char>*/, 0/*subtract_gpu<double, signed char>*/, 0/*subtract_gpu<double, unsigned short>*/, 0/*subtract_gpu<double, short>*/, 0/*subtract_gpu<double, int>*/, 0/*subtract_gpu<double, float>*/, subtract_gpu<double, double>}
     };
 
+    static const func_t npp_funcs[7] =
+    {
+        NppArithm<CV_8U, nppiSub_8u_C1RSfs>::call,
+        0,
+        NppArithm<CV_16U, nppiSub_16u_C1RSfs>::call,
+        NppArithm<CV_16S, nppiSub_16s_C1RSfs>::call,
+        NppArithm<CV_32S, nppiSub_32s_C1RSfs>::call,
+        NppArithm<CV_32F, nppiSub_32f_C1R>::call,
+        subtract_gpu<double, double>
+    };
+
+    CV_Assert(src1.type() != CV_8S);
     CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
     CV_Assert(mask.empty() || (src1.channels() == 1 && mask.size() == src1.size() && mask.type() == CV_8U));
 
@@ -497,16 +486,8 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
 
     if (mask.empty() && dst.type() == src1.type())
    {
-        if (nppArithmCaller(src2, src1, dst,
-                            nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs,
-                            nppiSub_16u_C1RSfs, nppiSub_16u_C4RSfs,
-                            nppiSub_16s_C1RSfs, nppiSub_16s_C4RSfs,
-                            nppiSub_32s_C1RSfs,
-                            nppiSub_32f_C1R, nppiSub_32f_C4R,
-                            stream))
-        {
-            return;
-        }
+        npp_funcs[src1.depth()](src2.reshape(1), src1.reshape(1), dst.reshape(1), PtrStepb(), stream);
+        return;
    }
 
     const func_t func = funcs[src1.depth()][dst.depth()];
@@ -604,6 +585,17 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
         {0/*multiply_gpu<double, unsigned char>*/, 0/*multiply_gpu<double, signed char>*/, 0/*multiply_gpu<double, unsigned short>*/, 0/*multiply_gpu<double, short>*/, 0/*multiply_gpu<double, int>*/, 0/*multiply_gpu<double, float>*/, multiply_gpu<double, double>}
     };
 
+    static const func_t npp_funcs[7] =
+    {
+        NppArithm<CV_8U, nppiMul_8u_C1RSfs>::call,
+        0,
+        NppArithm<CV_16U, nppiMul_16u_C1RSfs>::call,
+        NppArithm<CV_16S, nppiMul_16s_C1RSfs>::call,
+        NppArithm<CV_32S, nppiMul_32s_C1RSfs>::call,
+        NppArithm<CV_32F, nppiMul_32f_C1R>::call,
+        multiply_gpu<double, double>
+    };
+
     cudaStream_t stream = StreamAccessor::getStream(s);
 
     if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
@@ -624,6 +616,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
     }
     else
     {
+        CV_Assert(src1.type() != CV_8S);
         CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
 
         if (dtype < 0)
@@ -633,16 +626,8 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
 
         if (scale == 1 && dst.type() == src1.type())
         {
-            if (nppArithmCaller(src1, src2, dst,
-                                nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs,
-                                nppiMul_16u_C1RSfs, nppiMul_16u_C4RSfs,
-                                nppiMul_16s_C1RSfs, nppiMul_16s_C4RSfs,
-                                nppiMul_32s_C1RSfs,
-                                nppiMul_32f_C1R, nppiMul_32f_C4R,
-                                stream))
-            {
-                return;
-            }
+            npp_funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), 1, stream);
+            return;
         }
 
         const func_t func = funcs[src1.depth()][dst.depth()];
@@ -749,6 +734,17 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
         {0/*divide_gpu<double, unsigned char>*/, 0/*divide_gpu<double, signed char>*/, 0/*divide_gpu<double, unsigned short>*/, 0/*divide_gpu<double, short>*/, 0/*divide_gpu<double, int>*/, 0/*divide_gpu<double, float>*/, divide_gpu<double, double>}
     };
 
+    static const func_t npp_funcs[7] =
+    {
+        NppArithm<CV_8U, nppiDiv_8u_C1RSfs>::call,
+        0,
+        NppArithm<CV_16U, nppiDiv_16u_C1RSfs>::call,
+        NppArithm<CV_16S, nppiDiv_16s_C1RSfs>::call,
+        NppArithm<CV_32S, nppiDiv_32s_C1RSfs>::call,
+        NppArithm<CV_32F, nppiDiv_32f_C1R>::call,
+        divide_gpu<double, double>
+    };
+
     cudaStream_t stream = StreamAccessor::getStream(s);
 
     if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
@@ -768,7 +764,8 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
         multiply_gpu(static_cast<DevMem2D_<uchar4> >(src1), static_cast<DevMem2Df>(src2), static_cast<DevMem2D_<uchar4> >(dst), stream);
     }
     else
-    {
+    {
+        CV_Assert(src1.type() != CV_8S);
         CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
 
         if (dtype < 0)
@@ -778,16 +775,8 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
 
         if (scale == 1 && dst.type() == src1.type())
        {
-            if (nppArithmCaller(src2, src1, dst,
-                                nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs,
-                                nppiDiv_16u_C1RSfs, nppiDiv_16u_C4RSfs,
-                                nppiDiv_16s_C1RSfs, nppiDiv_16s_C4RSfs,
-                                nppiDiv_32s_C1RSfs,
-                                nppiDiv_32f_C1R, nppiDiv_32f_C4R,
-                                stream))
-            {
-                return;
-            }
+            npp_funcs[src1.depth()](src2.reshape(1), src1.reshape(1), dst.reshape(1), 1, stream);
+            return;
        }
 
        const func_t func = funcs[src1.depth()][dst.depth()];
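
For reference, the idea behind the patch in one self-contained sketch: a traits
class maps a runtime depth tag to a compile-time element type, a thin template
wrapper hides that type behind a uniform signature, and a table indexed by
depth replaces the old if/else chain. All names below (DepthTraits, addRow,
Dispatch, funcs) are hypothetical stand-ins for illustration, not code from
OpenCV or NPP.

    #include <cstdio>

    enum Depth { DEPTH_8U = 0, DEPTH_32F = 1, DEPTH_COUNT = 2 };

    // Map a runtime depth tag to a compile-time element type,
    // in the same spirit as NppTypeTraits in the patch.
    template <int DEPTH> struct DepthTraits;
    template <> struct DepthTraits<DEPTH_8U>  { typedef unsigned char elem_t; };
    template <> struct DepthTraits<DEPTH_32F> { typedef float elem_t; };

    // One typed kernel per depth; the nppiAdd_*/nppiSub_* functions play
    // this role in the patch.
    template <typename T> void addRow(const T* a, const T* b, T* c, int n)
    {
        for (int i = 0; i < n; ++i)
            c[i] = static_cast<T>(a[i] + b[i]);
    }

    // Uniform wrapper: erase the element type behind void*, analogous to
    // NppArithm<DEPTH, func>::call taking untyped DevMem2Db arguments.
    template <int DEPTH> struct Dispatch
    {
        typedef typename DepthTraits<DEPTH>::elem_t elem_t;

        static void call(const void* a, const void* b, void* c, int n)
        {
            addRow((const elem_t*)a, (const elem_t*)b, (elem_t*)c, n);
        }
    };

    typedef void (*func_t)(const void*, const void*, void*, int);

    int main()
    {
        // Depth-indexed table: one indexed call replaces the branching
        // that the patch removes from nppArithmCaller.
        static const func_t funcs[DEPTH_COUNT] =
        {
            Dispatch<DEPTH_8U>::call,
            Dispatch<DEPTH_32F>::call
        };

        unsigned char a[] = {1, 2, 3}, b[] = {4, 5, 6}, c[3];
        funcs[DEPTH_8U](a, b, c, 3);
        std::printf("%d %d %d\n", c[0], c[1], c[2]);
        return 0;
    }

This prints "5 7 9". The same one-line indexing is what add(), subtract(),
multiply(), and divide() now do through npp_funcs[src1.depth()].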