diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index 0cb8129b58..0f1843656c 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -602,4 +602,71 @@ namespace cv { namespace gpu { namespace device
     template void pow_caller<ushort>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
     template void pow_caller<int>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
     template void pow_caller<float>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // multiply
+
+    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
+    void __global__ multiplyKernel(const PtrStep src1, const PtrStep src2, int rows, int cols,
+                                   PtrStep dst)
+    {
+        int x = blockIdx.x * blockDim.x + threadIdx.x;
+        int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x < cols && y < rows)
+        {
+            ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc1*)src1.ptr(y))[x] * ((TSrc2*)src2.ptr(y))[x / cn]);
+        }
+    }
+
+
+    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
+    void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream)
+    {
+        dim3 threads(32, 8);
+        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+
+        multiplyKernel<TSrc1, TSrc2, TDst, cn><<<grid, threads, 0, stream>>>(src1, src2, rows, cols, dst);
+        cudaSafeCall(cudaGetLastError());
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
+    }
+
+
+    template void multiplyCaller<uchar, float, uchar, 4>(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // multiply (by scalar)
+
+    template <typename TSrc, typename TDst>
+    void __global__ multiplyScalarKernel(const PtrStep src1, float scale, int rows, int cols, PtrStep dst)
+    {
+        int x = blockIdx.x * blockDim.x + threadIdx.x;
+        int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x < cols && y < rows)
+        {
+            ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc*)src1.ptr(y))[x] * scale);
+        }
+    }
+
+
+    template <typename TSrc, typename TDst>
+    void multiplyScalarCaller(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream)
+    {
+        dim3 threads(32, 8);
+        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+
+        multiplyScalarKernel<TSrc, TDst><<<grid, threads, 0, stream>>>(src, scale, rows, cols, dst);
+        cudaSafeCall(cudaGetLastError());
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
+    }
+
+
+    template void multiplyScalarCaller<uchar, uchar>(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream);
 }}}
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index 62d0ea861d..57b8f2c80a 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -197,11 +197,59 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
     nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, StreamAccessor::getStream(stream));
 }
 
+namespace cv { namespace gpu { namespace device
+{
+    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
+    void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
+
+    template <typename TSrc, typename TDst>
+    void multiplyScalarCaller(const PtrStep src, float scalar, int rows, int cols, PtrStep dst, cudaStream_t stream);
+}}}
+
 void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
-    nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream));
+    if (src1.type() == CV_8UC4 && src2.type() == CV_32F)
+    {
+        CV_Assert(src1.size() == src2.size());
+        dst.create(src1.size(), src1.type());
+        device::multiplyCaller<uchar, float, uchar, 4>(static_cast<DevMem2D>(src1), static_cast<DevMem2D>(src2),
+                                                       src1.rows, src1.cols * 4, static_cast<DevMem2D>(dst),
+                                                       StreamAccessor::getStream(stream));
+    }
+    else
+        nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream));
 }
 
+void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
+{
+    if (src.depth() == CV_8U)
+    {
+        dst.create(src.size(), src.type());
+        device::multiplyScalarCaller<uchar, uchar>(static_cast<DevMem2D>(src), (float)(sc[0]), src.rows, src.cols * src.channels(),
+                                                   static_cast<DevMem2D>(dst), StreamAccessor::getStream(stream));
+    }
+    else
+    {
+        CV_Assert(src.type() == CV_32FC1);
+
+        dst.create(src.size(), src.type());
+
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        cudaStream_t cudaStream = StreamAccessor::getStream(stream);
+
+        NppStreamHandler h(cudaStream);
+
+        nppSafeCall( nppiMulC_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), (Npp32f)sc[0], dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
+
+        if (cudaStream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+
 void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
     nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R, StreamAccessor::getStream(stream));
@@ -227,26 +275,6 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream&
     callers[src.channels()](src, sc, dst, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
-{
-    CV_Assert(src.type() == CV_32FC1);
-
-    dst.create(src.size(), src.type());
-
-    NppiSize sz;
-    sz.width = src.cols;
-    sz.height = src.rows;
-
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-
-    NppStreamHandler h(cudaStream);
-
-    nppSafeCall( nppiMulC_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), (Npp32f)sc[0], dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
-
-    if (cudaStream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
 void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
     CV_Assert(src.type() == CV_32FC1);
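
Usage sketch (not part of the patch): after this change, cv::gpu::multiply takes the new CUDA path when src1 is CV_8UC4 and src2 is a CV_32FC1 matrix of the same size (per-pixel weighting of a 4-channel image), and the scalar overload takes it for any CV_8U-depth source instead of asserting CV_32FC1. The sizes and variable names below are illustrative only; this assumes the OpenCV 2.x gpu module headers.

    #include <opencv2/gpu/gpu.hpp>

    void multiply_example()
    {
        cv::gpu::GpuMat rgba(480, 640, CV_8UC4);     // 8-bit, 4-channel source
        cv::gpu::GpuMat weights(480, 640, CV_32FC1); // one float weight per pixel
        cv::gpu::GpuMat dst;

        // New fast path: hits device::multiplyCaller<uchar, float, uchar, 4>,
        // scaling all four channels of each pixel by that pixel's weight,
        // with saturate_cast clamping the result back to 8 bits.
        cv::gpu::multiply(rgba, weights, dst, cv::gpu::Stream::Null());

        // New scalar path: hits device::multiplyScalarCaller<uchar, uchar>
        // for any CV_8U-depth matrix (previously CV_32FC1 only via NPP).
        cv::gpu::multiply(rgba, cv::Scalar(0.5), dst, cv::gpu::Stream::Null());
    }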