From 11c6eb63051d9160865436db3cdf69987bc5fe06 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 21 Nov 2012 12:46:11 +0400 Subject: [PATCH] element operations --- .../include/opencv2/gpu/device/functional.hpp | 34 + modules/gpu/src/cuda/element_operations.cu | 4174 +++++++++++------ modules/gpu/src/element_operations.cpp | 3356 ++++++++----- modules/gpu/test/test_core.cpp | 142 +- 4 files changed, 4900 insertions(+), 2806 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/device/functional.hpp b/modules/gpu/include/opencv2/gpu/device/functional.hpp index 6e0471e9ac..cd63c3ac91 100644 --- a/modules/gpu/include/opencv2/gpu/device/functional.hpp +++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp @@ -357,6 +357,9 @@ namespace cv { namespace gpu { namespace device { return abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -364,6 +367,9 @@ namespace cv { namespace gpu { namespace device { return x; } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -371,6 +377,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -378,6 +387,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -385,6 +397,9 @@ namespace cv { namespace gpu { namespace device { return x; } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -392,6 +407,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -399,6 +417,9 @@ namespace cv { namespace gpu { namespace device { return x; } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -406,6 +427,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -413,6 +437,9 @@ namespace cv { namespace gpu { namespace device { return ::fabsf(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -420,6 +447,9 @@ namespace cv { namespace gpu { namespace device { return ::fabs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \ @@ -429,6 +459,8 @@ namespace cv { namespace gpu { namespace device { \ return func ## f(v); \ } \ + __device__ __forceinline__ name ## _func() {} \ + __device__ __forceinline__ name ## _func(const name ## _func&) {} \ }; \ template <> struct name ## _func : unary_function \ { \ @@ -436,6 +468,8 @@ namespace cv { namespace gpu { namespace device { \ return func(v); \ } \ + __device__ __forceinline__ name ## 
_func() {} \ + __device__ __forceinline__ name ## _func(const name ## _func&) {} \ }; #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \ diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index c61601d4f7..eaf577bacf 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -42,1140 +42,2063 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" -namespace cv { namespace gpu { namespace device +using namespace cv::gpu; +using namespace cv::gpu::device; + +////////////////////////////////////////////////////////////////////////// +// addMat + +namespace { - ////////////////////////////////////////////////////////////////////////// - // add + template struct VAdd4; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int 
res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + + //////////////////////////////////// + + template struct VAdd2; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; - template struct Add : binary_function + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + + //////////////////////////////////// + + template struct AddMat : binary_function { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(a + b); } + + __device__ __forceinline__ AddMat() {} + __device__ __forceinline__ AddMat(const AddMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct 
TransformFunctorTraits< VAdd4 > : DefaultTransformFunctorTraits< VAdd4 > + { + enum { smart_shift = 2 }; + }; + + //////////////////////////////////// + + template struct TransformFunctorTraits< VAdd2 > : DefaultTransformFunctorTraits< VAdd4 > + { + enum { smart_shift = 2 }; }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + //////////////////////////////////// + + template <> struct TransformFunctorTraits< AddMat > : DefaultTransformFunctorTraits< AddMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + template <> struct TransformFunctorTraits< AddMat > : DefaultTransformFunctorTraits< AddMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + template <> struct TransformFunctorTraits< AddMat > : DefaultTransformFunctorTraits< AddMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + template <> struct TransformFunctorTraits< AddMat > : DefaultTransformFunctorTraits< AddMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} + +namespace arithm +{ + template + void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd4(), WithOutMask(), stream); + } + + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + template + void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd2(), WithOutMask(), stream); + } + + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Add(), SingleMask(mask), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), mask, stream); else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Add(), WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), WithOutMask(), stream); } - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const 
PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, 
cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct AddScalar : unary_function - { - AddScalar(double val_) : val(val_) {} + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, 
PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, 
PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// addScalar + +namespace +{ + template struct AddScalar : unary_function + { + S val; + + explicit AddScalar(S val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { return saturate_cast(a + val); } - const double val; }; +} - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) +namespace arithm +{ + template + void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AddScalar op(val); + AddScalar op(static_cast(val)); + if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, 
WithOutMask(), stream); } - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const 
PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // subtract - - template struct Subtract : binary_function + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + 
template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb 
dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// subMat + +namespace +{ + template struct VSub4; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + 
__device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + + //////////////////////////////////// + + template struct VSub2; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + + //////////////////////////////////// + + template struct SubMat : binary_function { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(a - b); } 
+ + __device__ __forceinline__ SubMat() {} + __device__ __forceinline__ SubMat(const SubMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< VSub4 > : DefaultTransformFunctorTraits< VSub4 > + { + enum { smart_shift = 2 }; + }; + + //////////////////////////////////// + + template struct TransformFunctorTraits< VSub2 > : DefaultTransformFunctorTraits< VSub2 > + { + enum { smart_shift = 2 }; }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + //////////////////////////////////// + + template <> struct TransformFunctorTraits< SubMat > : DefaultTransformFunctorTraits< SubMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + template <> struct TransformFunctorTraits< SubMat > : DefaultTransformFunctorTraits< SubMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + template <> struct TransformFunctorTraits< SubMat > : DefaultTransformFunctorTraits< SubMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + template <> struct TransformFunctorTraits< SubMat > : DefaultTransformFunctorTraits< SubMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} + +namespace arithm +{ + template + void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub4(), WithOutMask(), stream); + } + + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + template + void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub2(), WithOutMask(), stream); + } + + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), SingleMask(mask), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), mask, stream); else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), WithOutMask(), stream); } - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, 
cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const 
PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct SubtractScalar : unary_function - { - SubtractScalar(double val_) : val(val_) 
{} - __device__ __forceinline__ D operator ()(T a) const + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, 
PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// subScalar + +namespace arithm +{ + template + void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + AddScalar op(-static_cast(val)); + + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb 
mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void 
subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// mulMat + +namespace +{ + struct Mul_8uc4_32f : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, float b) const { - return saturate_cast(a - val); + uint res = 0; + + res |= (saturate_cast((0xffu & (a )) * b) ); + res |= (saturate_cast((0xffu & (a >> 8)) * b) << 8); + res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16); + res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24); + + return res; } - const double val; + + __device__ __forceinline__ Mul_8uc4_32f() {} + __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {} }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + struct Mul_16sc4_32f : binary_function + { + __device__ __forceinline__ short4 operator ()(short4 a, float b) const + { + return make_short4(saturate_cast(a.x * b), saturate_cast(a.y * b), + saturate_cast(a.z * b), saturate_cast(a.w * b)); + } + + __device__ __forceinline__ Mul_16sc4_32f() {} + __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {} + }; + + template struct Mul : binary_function + { + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(a * b); + } + + __device__ __forceinline__ Mul() {} + __device__ __forceinline__ Mul(const Mul& other) {} + }; + + template struct MulScale : binary_function + { + S scale; + + explicit MulScale(S scale_) : scale(scale_) {} + + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(scale * a * b); + } + }; +} + +namespace cv { namespace gpu { namespace device +{ + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(Mul_8uc4_32f) + { + enum { smart_block_dim_x = 8 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 8 }; + }; + + template <> struct TransformFunctorTraits< Mul > : DefaultTransformFunctorTraits< Mul > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + template <> struct TransformFunctorTraits< Mul > : DefaultTransformFunctorTraits< Mul > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + template <> struct TransformFunctorTraits< Mul > : DefaultTransformFunctorTraits< Mul > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + template <> struct TransformFunctorTraits< Mul > : DefaultTransformFunctorTraits< Mul > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + template 
<> struct TransformFunctorTraits< MulScale > : DefaultTransformFunctorTraits< MulScale > { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - SubtractScalar op(val); - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScale > : DefaultTransformFunctorTraits< MulScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScale > : DefaultTransformFunctorTraits< MulScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScale > : DefaultTransformFunctorTraits< MulScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + void mulMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) + { + transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream); + } + + void mulMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) + { + transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream); + } + + template + void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + { + if (scale == 1) + { + Mul op; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + { + MulScale op(static_cast(scale)); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + } + + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, 
cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} 
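// ---------------------------------------------------------------------------
// Editor's note, an illustrative sketch rather than part of the patch: the
// arithm::mulMat entry point above dispatches to the plain Mul functor when
// scale == 1 and to MulScale otherwise, so the common unscaled path avoids a
// per-element multiply by the scale factor. The self-contained CUDA sketch
// below mirrors that dispatch with a trivial kernel; MulSketch, MulScaleSketch,
// transformSketch and mulMatSketch are hypothetical names, and the real code
// uses cv::gpu::device::transform together with saturate_cast<D> rather than
// static_cast<D>.
// ---------------------------------------------------------------------------
// #include <cuda_runtime.h>
//
// template <typename T, typename D> struct MulSketch
// {
//     __device__ __forceinline__ D operator ()(T a, T b) const { return static_cast<D>(a * b); }
// };
//
// template <typename T, typename S, typename D> struct MulScaleSketch
// {
//     S scale;
//
//     explicit MulScaleSketch(S scale_) : scale(scale_) {}
//
//     __device__ __forceinline__ D operator ()(T a, T b) const { return static_cast<D>(scale * a * b); }
// };
//
// // Minimal element-wise kernel standing in for device::transform().
// template <typename T, typename D, class Op>
// __global__ void transformSketch(const T* a, const T* b, D* dst, int n, Op op)
// {
//     const int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n)
//         dst[i] = op(a[i], b[i]);
// }
//
// template <typename T, typename S, typename D>
// void mulMatSketch(const T* a, const T* b, D* dst, int n, double scale, cudaStream_t stream)
// {
//     if (n <= 0)
//         return;
//
//     const int block = 256;
//     const int grid  = (n + block - 1) / block;
//
//     if (scale == 1)
//         transformSketch<<<grid, block, 0, stream>>>(a, b, dst, n, MulSketch<T, D>());
//     else
//         transformSketch<<<grid, block, 0, stream>>>(a, b, dst, n, MulScaleSketch<T, S, D>(static_cast<S>(scale)));
// }
//
// Splitting the scale == 1 case onto its own functor also keeps integer-to-integer
// products in integer arithmetic instead of promoting every element through the
// floating-point scale type.
// ---------------------------------------------------------------------------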
+ +////////////////////////////////////////////////////////////////////////// +// mulScalar + +namespace +{ + template struct MulScalar : unary_function + { + S val; + + explicit MulScalar(S val_) : val(val_) {} + + __device__ __forceinline__ D operator ()(T a) const + { + return saturate_cast(a * val); + } + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< MulScalar > : DefaultTransformFunctorTraits< MulScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScalar > : DefaultTransformFunctorTraits< MulScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScalar > : DefaultTransformFunctorTraits< MulScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScalar > : DefaultTransformFunctorTraits< MulScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template + void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) + { + MulScalar op(static_cast(val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, 
cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double 
val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // multiply - - struct multiply_8uc4_32f : binary_function + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar(PtrStepSzb src1, double 
val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// divMat + +namespace +{ + struct Div_8uc4_32f : binary_function { __device__ __forceinline__ uint operator ()(uint a, float b) const { uint res = 0; - res |= (saturate_cast((0xffu & (a )) * b) ); - res |= (saturate_cast((0xffu & (a >> 8)) * b) << 8); - res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16); - res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24); + if (b != 0) + { + b = 1.0f / b; + res |= (saturate_cast((0xffu & (a )) * b) ); + res |= (saturate_cast((0xffu & (a >> 8)) * b) << 8); + res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16); + res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24); + } return res; } }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f) + struct Div_16sc4_32f : binary_function { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; + __device__ __forceinline__ short4 operator ()(short4 a, float b) const + { + return b != 0 ? make_short4(saturate_cast(a.x / b), saturate_cast(a.y / b), + saturate_cast(a.z / b), saturate_cast(a.w / b)) + : make_short4(0,0,0,0); + } }; - void multiply_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) + template struct Div : binary_function { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), multiply_8uc4_32f(), WithOutMask(), stream); - } + __device__ __forceinline__ D operator ()(T a, T b) const + { + return b != 0 ? 
saturate_cast(a / b) : 0; + } - struct multiply_16sc4_32f : binary_function + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + template struct Div : binary_function { - __device__ __forceinline__ short4 operator ()(short4 a, float b) const + __device__ __forceinline__ float operator ()(T a, T b) const { - return make_short4(saturate_cast(a.x * b), saturate_cast(a.y * b), - saturate_cast(a.z * b), saturate_cast(a.w * b)); + return b != 0 ? static_cast(a) / b : 0; } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + template struct Div : binary_function + { + __device__ __forceinline__ double operator ()(T a, T b) const + { + return b != 0 ? static_cast(a) / b : 0; + } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + + template struct DivScale : binary_function + { + S scale; + + explicit DivScale(S scale_) : scale(scale_) {} + + __device__ __forceinline__ D operator ()(T a, T b) const + { + return b != 0 ? saturate_cast(scale * a / b) : 0; + } + }; +} + +namespace cv { namespace gpu { namespace device +{ + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(Div_8uc4_32f) + { + enum { smart_block_dim_x = 8 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 8 }; + }; + + template <> struct TransformFunctorTraits< Div > : DefaultTransformFunctorTraits< Div > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Div > : DefaultTransformFunctorTraits< Div > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Div > : DefaultTransformFunctorTraits< Div > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Div > : DefaultTransformFunctorTraits< Div > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f) + template <> struct TransformFunctorTraits< DivScale > : DefaultTransformFunctorTraits< DivScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivScale > : DefaultTransformFunctorTraits< DivScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivScale > : DefaultTransformFunctorTraits< DivScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivScale > : DefaultTransformFunctorTraits< DivScale > { - enum { smart_block_dim_x = 8 }; enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; + enum { smart_shift = 4 }; }; +}}} + +namespace arithm +{ + void divMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) + { + transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream); + } - void multiply_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) + void divMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), multiply_16sc4_32f(), WithOutMask(), stream); + transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream); } - template struct Multiply : binary_function + template + void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) { - 
Multiply(float scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a, T b) const + if (scale == 1) { - return saturate_cast(scale * a * b); + Div op; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); } - const float scale; - }; - template struct Multiply : binary_function - { - Multiply(double scale_) : scale(scale_) {} - __device__ __forceinline__ double operator ()(T a, T b) const + else { - return scale * a * b; + DivScale op(static_cast(scale)); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); } - const double scale; - }; - template <> struct Multiply : binary_function + } + + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template 
void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// divScalar + +namespace arithm +{ + template + void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - Multiply(double scale_) : scale(scale_) {} - __device__ __forceinline__ int operator ()(int a, int b) const + MulScalar op(static_cast(1.0 / val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void 
divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t 
stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// divInv + +namespace +{ + template struct DivInv : unary_function + { + S val; + + explicit DivInv(double val_) : val(val_) {} + + __device__ __forceinline__ D operator ()(T a) const { - return saturate_cast(scale * a * b); + return a != 0 ? saturate_cast(val / a) : 0; } - const double scale; }; +} - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< DivInv > : DefaultTransformFunctorTraits< DivInv > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > + template <> struct TransformFunctorTraits< DivInv > : DefaultTransformFunctorTraits< DivInv > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > + template <> struct TransformFunctorTraits< DivInv > : DefaultTransformFunctorTraits< DivInv > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > + template <> struct TransformFunctorTraits< DivInv > : DefaultTransformFunctorTraits< DivInv > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} + +namespace arithm +{ + template + void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) + { + DivInv op(static_cast(val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + 
//template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// absDiffMat + +namespace +{ + template struct VAbsDiff4; + template <> struct VAbsDiff4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + 
asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } - template struct MultiplyCaller + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {} + }; + template <> struct VAbsDiff4 : binary_function { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + __device__ __forceinline__ int operator ()(int a, int b) const { - Multiply op(static_cast(scale)); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } + + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {} }; - template struct MultiplyCaller + + //////////////////////////////////// + + template struct VAbsDiff2; + template <> struct VAbsDiff2 : binary_function { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } + + __device__ __forceinline__ VAbsDiff2() {} + __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {} }; - template <> struct MultiplyCaller + template <> struct VAbsDiff2 : binary_function { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + __device__ __forceinline__ int operator ()(int a, int b) const { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } + + __device__ 
__forceinline__ VAbsDiff2() {} + __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {} }; - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + //////////////////////////////////// + + __device__ __forceinline__ int _abs(int a) + { + return ::abs(a); + } + __device__ __forceinline__ float _abs(float a) + { + return ::fabsf(a); + } + __device__ __forceinline__ double _abs(double a) { - MultiplyCaller::call(src1, src2, dst, scale, stream); + return ::fabs(a); } - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double 
scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const 
PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct MultiplyScalar : unary_function - { - MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const + template struct AbsDiffMat : binary_function + { + __device__ __forceinline__ T operator ()(T a, T b) const { - return saturate_cast(scale * a * val); + return saturate_cast(_abs(a - b)); } - const double val; - const double scale; + + __device__ __forceinline__ AbsDiffMat() {} + __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< VAbsDiff4 > : DefaultTransformFunctorTraits< VAbsDiff4 > + { + enum { smart_shift = 2 }; }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > + //////////////////////////////////// + + template struct TransformFunctorTraits< VAbsDiff2 > : DefaultTransformFunctorTraits< VAbsDiff4 > + { + enum { smart_shift = 2 }; + }; + + //////////////////////////////////// + + template <> struct TransformFunctorTraits< AbsDiffMat > : DefaultTransformFunctorTraits< AbsDiffMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > + template <> struct TransformFunctorTraits< AbsDiffMat > : DefaultTransformFunctorTraits< AbsDiffMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > + template <> struct TransformFunctorTraits< AbsDiffMat > : DefaultTransformFunctorTraits< AbsDiffMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > + template <> struct TransformFunctorTraits< AbsDiffMat > : DefaultTransformFunctorTraits< AbsDiffMat > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + template + void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - MultiplyScalar op(val, scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAbsDiff4(), WithOutMask(), stream); } - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, 
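
The new arithm::vabsDiff4 / absDiffMat wrappers all funnel into cv::gpu::device::transform, with the TransformFunctorTraits specializations steering the block shape and, as far as I can tell, how many elements each thread handles (the smart_shift value). A rough standalone sketch of that pattern, with illustrative names only and none of the real implementation's vectorized loads, is:

    #include <cuda_runtime.h>

    // Simplified stand-in for the binary transform: each thread applies the
    // functor to ELEMS consecutive element pairs, which is roughly the role
    // the smart_shift trait plays in the real implementation.
    template <typename T> struct AbsDiffRef
    {
        __device__ T operator ()(T a, T b) const { return a > b ? T(a - b) : T(b - a); }
    };

    template <int ELEMS, typename T, typename Op>
    __global__ void transform_binary(const T* src1, const T* src2, T* dst, int n, Op op)
    {
        int base = (blockIdx.x * blockDim.x + threadIdx.x) * ELEMS;
        #pragma unroll
        for (int k = 0; k < ELEMS; ++k)
            if (base + k < n)
                dst[base + k] = op(src1[base + k], src2[base + k]);
    }

    // Assumed usage, with d_a/d_b/d_c being device buffers of n uchar values:
    //   const int threads = 256, elems = 4;
    //   const int blocks = (n + threads * elems - 1) / (threads * elems);
    //   transform_binary<elems><<<blocks, threads>>>(d_a, d_b, d_c, n, AbsDiffRef<unsigned char>());

The real transform also picks block dimensions from the traits and vectorizes its memory accesses, which this sketch does not attempt.
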
cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void 
multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // divide - - struct divide_8uc4_32f : binary_function - { - __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const - { - return b != 0 ? 
make_uchar4(saturate_cast(a.x / b), saturate_cast(a.y / b), - saturate_cast(a.z / b), saturate_cast(a.w / b)) - : make_uchar4(0,0,0,0); - } - }; + template void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f) + template + void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; - }; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAbsDiff2(), WithOutMask(), stream); + } + + template void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - void divide_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) + template + void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), divide_8uc4_32f(), WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AbsDiffMat(), WithOutMask(), stream); } + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// absDiffScalar - struct divide_16sc4_32f : binary_function +namespace +{ + template struct AbsDiffScalar : unary_function { - __device__ __forceinline__ short4 operator ()(short4 a, float b) const + S val; + + explicit AbsDiffScalar(S val_) : val(val_) {} + + __device__ __forceinline__ T operator ()(T a) const { - return b != 0 ? 
make_short4(saturate_cast(a.x / b), saturate_cast(a.y / b), - saturate_cast(a.z / b), saturate_cast(a.w / b)) - : make_short4(0,0,0,0); + abs_func f; + return saturate_cast(f(a - val)); } }; +} - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f) +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< AbsDiffScalar > : DefaultTransformFunctorTraits< AbsDiffScalar > { - enum { smart_block_dim_x = 8 }; enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AbsDiffScalar > : DefaultTransformFunctorTraits< AbsDiffScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AbsDiffScalar > : DefaultTransformFunctorTraits< AbsDiffScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AbsDiffScalar > : DefaultTransformFunctorTraits< AbsDiffScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; }; +}}} - void divide_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) +namespace arithm +{ + template + void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), divide_16sc4_32f(), WithOutMask(), stream); + AbsDiffScalar op(static_cast(val)); + + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - template struct Divide : binary_function - { - Divide(double scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a, T b) const - { - return b != 0 ? saturate_cast(a * scale / b) : 0; - } - const double scale; - }; + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); +} - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > +////////////////////////////////////////////////////////////////////////// +// absMat + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< abs_func > : DefaultTransformFunctorTraits< abs_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + template <> struct TransformFunctorTraits< abs_func > : DefaultTransformFunctorTraits< abs_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + template <> struct TransformFunctorTraits< abs_func > : DefaultTransformFunctorTraits< abs_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + template <> struct 
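
AbsDiffScalar keeps the scalar in the wider work type S, takes the difference there, and only then saturates back to the element type T. A minimal standalone illustration for uchar data and a double scalar (saturate_u8 is a simplified stand-in for saturate_cast, and the kernel name is made up):

    #include <math.h>
    #include <cuda_runtime.h>

    __device__ unsigned char saturate_u8(double v)
    {
        v = rint(v);                      // round to nearest, as saturate_cast does
        if (v < 0.0)   return 0;
        if (v > 255.0) return 255;
        return static_cast<unsigned char>(v);
    }

    __global__ void absdiff_scalar_u8(const unsigned char* src, unsigned char* dst,
                                      int n, double val)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = saturate_u8(fabs(src[i] - val));
        // e.g. src[i] = 10, val = 300.7  ->  |10 - 300.7| = 290.7  ->  255
    }

Doing the subtraction in the wider type before saturating is what keeps an absdiff against an out-of-range scalar from wrapping for 8-bit images.
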
TransformFunctorTraits< abs_func > : DefaultTransformFunctorTraits< abs_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + template + void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Divide op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src, (PtrStepSz) dst, abs_func(), WithOutMask(), stream); } - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& 
src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, 
double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct DivideScalar : unary_function - { - DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// sqrMat + +namespace +{ + template struct Sqr : unary_function + { + __device__ __forceinline__ T operator ()(T x) const { - return saturate_cast(scale * a / val); + return saturate_cast(x * x); } - const double val; - const double scale; + + __device__ __forceinline__ Sqr() {} + __device__ __forceinline__ Sqr(const Sqr& other) {} }; +} - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< Sqr > : DefaultTransformFunctorTraits< Sqr > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + template <> struct TransformFunctorTraits< Sqr > : DefaultTransformFunctorTraits< Sqr > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + template <> struct TransformFunctorTraits< Sqr > : DefaultTransformFunctorTraits< Sqr > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + template <> struct TransformFunctorTraits< Sqr > : DefaultTransformFunctorTraits< Sqr > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + template + void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - DivideScalar op(val, scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src, (PtrStepSz) dst, Sqr(), WithOutMask(), stream); } - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - 
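
Sqr routes the squared value through saturate_cast, so overflow clips to the type range instead of wrapping. A small host-side check of that behaviour (saturate_i16 is a stand-in, not OpenCV's saturate_cast):

    #include <cstdio>

    static short saturate_i16(int v)
    {
        if (v < -32768) return -32768;
        if (v >  32767) return  32767;
        return static_cast<short>(v);
    }

    int main()
    {
        const short x = 200;
        std::printf("%d\n", saturate_i16(x * x));   // 200 * 200 = 40000 -> clipped to 32767
        const short y = 100;
        std::printf("%d\n", saturate_i16(y * y));   // 100 * 100 = 10000 -> fits, stays 10000
        return 0;
    }
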
template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double 
scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct Reciprocal : unary_function - { - Reciprocal(double scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const - { - return a != 0 ? 
saturate_cast(scale / a) : 0; - } - const double scale; - }; + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// sqrtMat - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< sqrt_func > : DefaultTransformFunctorTraits< sqrt_func > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< sqrt_func > : DefaultTransformFunctorTraits< sqrt_func > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< sqrt_func > : DefaultTransformFunctorTraits< sqrt_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + template <> struct TransformFunctorTraits< sqrt_func > : DefaultTransformFunctorTraits< sqrt_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + template <> struct TransformFunctorTraits< sqrt_func > : DefaultTransformFunctorTraits< sqrt_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + template <> struct TransformFunctorTraits< sqrt_func > : DefaultTransformFunctorTraits< sqrt_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream) +namespace arithm +{ + template + void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&scalar) ); - Reciprocal op(scalar); - cv::gpu::device::transform((PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src, (PtrStepSz) dst, sqrt_func(), WithOutMask(), stream); } - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, 
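
sqrtMat (and logMat below) are the unary counterparts of the same wrapper pattern: a one-line call into transform with an existing math functor. A float-only standalone equivalent, with an illustrative kernel name, would be:

    #include <math.h>
    #include <cuda_runtime.h>

    __global__ void sqrt_f32(const float* src, float* dst, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = sqrtf(src[i]);
    }

    // Assumed launch over n floats:
    //   sqrt_f32<<<(n + 255) / 256, 256>>>(d_src, d_dst, n);
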
cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, 
const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // absdiff - - template struct Absdiff : binary_function - { - static __device__ __forceinline__ int abs(int a) - { - return ::abs(a); - } - static __device__ __forceinline__ float abs(float a) - { - return ::fabsf(a); - } - static __device__ __forceinline__ double abs(double a) - { - return ::fabs(a); - } + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} - __device__ __forceinline__ T operator ()(T a, T b) const - { - return saturate_cast(::abs(a - b)); - } - }; +////////////////////////////////////////////////////////////////////////// +// logMat - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< log_func > : DefaultTransformFunctorTraits< log_func > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< log_func > : DefaultTransformFunctorTraits< log_func > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< log_func > : DefaultTransformFunctorTraits< log_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > + template <> struct TransformFunctorTraits< log_func > : DefaultTransformFunctorTraits< log_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > + template <> struct TransformFunctorTraits< log_func > : DefaultTransformFunctorTraits< log_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > + template <> struct TransformFunctorTraits< log_func > : DefaultTransformFunctorTraits< log_func > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, 
PtrStepSzb dst, cudaStream_t stream) +namespace arithm +{ + template + void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) { - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Absdiff(), WithOutMask(), stream); + transform((PtrStepSz) src, (PtrStepSz) dst, log_func(), WithOutMask(), stream); } - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// expMat - template struct AbsdiffScalar : unary_function +namespace +{ + template struct Exp : unary_function { - AbsdiffScalar(double val_) : val(val_) {} - __device__ __forceinline__ T operator ()(T a) const + __device__ __forceinline__ T operator ()(T x) const { - return saturate_cast(::fabs(a - val)); + exp_func f; + return saturate_cast(f(x)); } - double val; + + __device__ __forceinline__ Exp() {} + __device__ __forceinline__ Exp(const Exp& other) {} }; +} - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< Exp > : DefaultTransformFunctorTraits< Exp > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > + template <> struct TransformFunctorTraits< Exp > : DefaultTransformFunctorTraits< Exp > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > + template <> struct TransformFunctorTraits< Exp > : DefaultTransformFunctorTraits< Exp > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > + template <> struct TransformFunctorTraits< Exp > : DefaultTransformFunctorTraits< Exp > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) +namespace arithm +{ + template + void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( 
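
Exp differs from the plain exp_func only in that the result passes through saturate_cast, so exponentials of integer images clamp to the element range rather than overflowing. A host-side illustration (saturate_u8 is again a stand-in for saturate_cast):

    #include <cmath>
    #include <cstdio>

    static unsigned char saturate_u8(float v)
    {
        const long r = std::lround(v);          // round to nearest, as saturate_cast does
        if (r < 0)   return 0;
        if (r > 255) return 255;
        return static_cast<unsigned char>(r);
    }

    int main()
    {
        std::printf("%d\n", (int)saturate_u8(std::exp(5.0f)));   // exp(5) ~ 148.4 -> 148
        std::printf("%d\n", (int)saturate_u8(std::exp(6.0f)));   // exp(6) ~ 403.4 -> 255 (clamped)
        return 0;
    }
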
cudaSetDoubleForDevice(&val) ); - AbsdiffScalar op(val); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src, (PtrStepSz) dst, Exp(), WithOutMask(), stream); } - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////////////////// - // Compare +////////////////////////////////////////////////////////////////////////////////////// +// cmpMat +namespace +{ template