diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp
index 73da7e7a8a..6bf4e5d21b 100644
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -79,6 +79,8 @@ namespace cv { namespace gpu
         WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
     };
 
+    CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
+
     // Gives information about what GPU archs this OpenCV GPU module was
     // compiled for
     class CV_EXPORTS TargetArchs
diff --git a/modules/core/src/cuda/matrix_operations.cu b/modules/core/src/cuda/matrix_operations.cu
index 9e830e563b..60aa073406 100644
--- a/modules/core/src/cuda/matrix_operations.cu
+++ b/modules/core/src/cuda/matrix_operations.cu
@@ -44,6 +44,7 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/type_traits.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
     void writeScalar(const int*);
     void writeScalar(const float*);
     void writeScalar(const double*);
+    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
     void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
 }}}
@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
     //////////////////////////////// ConvertTo ////////////////////////////////
     ///////////////////////////////////////////////////////////////////////////
 
-    template <typename T, typename D> struct Convertor : unary_function<T, D>
+    template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
     {
-        Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
+        Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}
 
-        __device__ __forceinline__ D operator()(const T& src) const
+        __device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
         {
             return saturate_cast<D>(alpha * src + beta);
         }
 
-        double alpha, beta;
+        S alpha, beta;
     };
 
     namespace detail
@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
         };
     }
 
-    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
+    template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
     {
     };
 
-    template <typename T, typename D>
+    template <typename T, typename D, typename S>
     void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
     {
         cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
         cudaSafeCall( cudaSetDoubleForDevice(&beta) );
-        Convertor<T, D> op(alpha, beta);
+        Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
         cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
     }
@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
     {
         typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
 
-        static const caller_t tab[8][8] =
+        static const caller_t tab[7][7] =
         {
-            {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
-            cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
-
-            {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
-            cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
-
-            {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
-            cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
-
-            {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
-            cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
-
-            {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
-            cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
-
-            {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
-            cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
-
-            {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
-            cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
-
-            {0,0,0,0,0,0,0,0}
+            {
+                cvt_<uchar, uchar, float>,
+                cvt_<uchar, schar, float>,
+                cvt_<uchar, ushort, float>,
+                cvt_<uchar, short, float>,
+                cvt_<uchar, int, float>,
+                cvt_<uchar, float, float>,
+                cvt_<uchar, double, double>
+            },
+            {
+                cvt_<schar, uchar, float>,
+                cvt_<schar, schar, float>,
+                cvt_<schar, ushort, float>,
+                cvt_<schar, short, float>,
+                cvt_<schar, int, float>,
+                cvt_<schar, float, float>,
+                cvt_<schar, double, double>
+            },
+            {
+                cvt_<ushort, uchar, float>,
+                cvt_<ushort, schar, float>,
+                cvt_<ushort, ushort, float>,
+                cvt_<ushort, short, float>,
+                cvt_<ushort, int, float>,
+                cvt_<ushort, float, float>,
+                cvt_<ushort, double, double>
+            },
+            {
+                cvt_<short, uchar, float>,
+                cvt_<short, schar, float>,
+                cvt_<short, ushort, float>,
+                cvt_<short, short, float>,
+                cvt_<short, int, float>,
+                cvt_<short, float, float>,
+                cvt_<short, double, double>
+            },
+            {
+                cvt_<int, uchar, float>,
+                cvt_<int, schar, float>,
+                cvt_<int, ushort, float>,
+                cvt_<int, short, float>,
+                cvt_<int, int, float>,
+                cvt_<int, float, float>,
+                cvt_<int, double, double>
+            },
+            {
+                cvt_<float, uchar, float>,
+                cvt_<float, schar, float>,
+                cvt_<float, ushort, float>,
+                cvt_<float, short, float>,
+                cvt_<float, int, float>,
+                cvt_<float, float, float>,
+                cvt_<float, double, double>
+            },
+            {
+                cvt_<double, uchar, double>,
+                cvt_<double, schar, double>,
+                cvt_<double, ushort, double>,
+                cvt_<double, short, double>,
+                cvt_<double, int, double>,
+                cvt_<double, float, double>,
+                cvt_<double, double, double>
+            }
         };
 
         caller_t func = tab[sdepth][ddepth];
-        if (!func)
-            cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");
-
         func(src, dst, alpha, beta, stream);
     }
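Taken together, the `Convertor` changes move the scale/shift arithmetic from unconditional `double` into a third template parameter `S` (`float` unless source or destination is `double`), and pass the source through `TypeTraits<T>::ParameterType` so small types go by value. A hypothetical stand-alone kernel showing what `cvt_<uchar, float, float>` boils down to (simplified; the real path goes through `device::transform`):

    #include <cuda_runtime.h>

    // What Convertor<uchar, float, float> computes per element: the
    // alpha/beta scaling now runs in float, not double, on this path.
    __global__ void convert8Uto32F(const unsigned char* src, float* dst,
                                   int n, float alpha, float beta)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = alpha * src[i] + beta;
    }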
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index afd7115b00..6b5b076cd3 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -69,33 +69,89 @@ using namespace cv::gpu;
 
 namespace
 {
-    // Compares value to set using the given comparator. Returns true if
-    // there is at least one element x in the set satisfying to: x cmp value
-    // predicate.
-    template <typename Comparer>
-    bool compareToSet(const std::string& set_as_str, int value, Comparer cmp)
+    class CudaArch
+    {
+    public:
+        CudaArch();
+
+        bool builtWith(FeatureSet feature_set) const;
+        bool hasPtx(int major, int minor) const;
+        bool hasBin(int major, int minor) const;
+        bool hasEqualOrLessPtx(int major, int minor) const;
+        bool hasEqualOrGreaterPtx(int major, int minor) const;
+        bool hasEqualOrGreaterBin(int major, int minor) const;
+
+    private:
+        static void fromStr(const string& set_as_str, vector<int>& arr);
+
+        vector<int> bin;
+        vector<int> ptx;
+        vector<int> features;
+    };
+
+    const CudaArch cudaArch;
+
+    CudaArch::CudaArch()
+    {
+    #ifdef HAVE_CUDA
+        fromStr(CUDA_ARCH_BIN, bin);
+        fromStr(CUDA_ARCH_PTX, ptx);
+        fromStr(CUDA_ARCH_FEATURES, features);
+    #endif
+    }
+
+    bool CudaArch::builtWith(FeatureSet feature_set) const
+    {
+        return !features.empty() && (features.back() >= feature_set);
+    }
+
+    bool CudaArch::hasPtx(int major, int minor) const
+    {
+        return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
+    }
+
+    bool CudaArch::hasBin(int major, int minor) const
+    {
+        return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
+    }
+
+    bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.front() <= major * 10 + minor);
+    }
+
+    bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.back() >= major * 10 + minor);
+    }
+
+    bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
+    {
+        return !bin.empty() && (bin.back() >= major * 10 + minor);
+    }
+
+    void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
     {
         if (set_as_str.find_first_not_of(" ") == string::npos)
-            return false;
+            return;
 
-        std::stringstream stream(set_as_str);
+        istringstream stream(set_as_str);
         int cur_value;
         while (!stream.eof())
         {
             stream >> cur_value;
-            if (cmp(cur_value, value))
-                return true;
+            arr.push_back(cur_value);
         }
 
-        return false;
+        sort(arr.begin(), arr.end());
     }
 }
 
 bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal<int>());
+    return cudaArch.builtWith(feature_set);
 #else
     (void)feature_set;
     return false;
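`CudaArch` parses the `CUDA_ARCH_BIN` / `CUDA_ARCH_PTX` / `CUDA_ARCH_FEATURES` strings once, at static-initialization time, into sorted vectors, so every `TargetArchs` query becomes a lookup instead of a string re-parse. Combined with `deviceSupports()` (implemented further down in this file), callers can gate a kernel on both the build flags and the running device in one call; a usage sketch, assuming a CUDA-enabled build:

    #include "opencv2/core/gpumat.hpp"

    void launchBestReduction()
    {
        // true only if the module was compiled for CC >= 3.0 *and*
        // the current device actually is CC >= 3.0
        if (cv::gpu::deviceSupports(cv::gpu::WARP_SHUFFLE_FUNCTIONS))
        {
            // take the __shfl_down based path
        }
        else
        {
            // take the shared-memory fallback path
        }
    }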
@@ -110,7 +166,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor)
 bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to<int>());
+    return cudaArch.hasPtx(major, minor);
 #else
     (void)major;
     (void)minor;
@@ -121,7 +177,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
 bool cv::gpu::TargetArchs::hasBin(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to<int>());
+    return cudaArch.hasBin(major, minor);
 #else
     (void)major;
     (void)minor;
@@ -132,8 +188,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor)
 bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor,
-                          std::less_equal<int>());
+    return cudaArch.hasEqualOrLessPtx(major, minor);
 #else
     (void)major;
     (void)minor;
@@ -143,14 +198,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
 
 bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
 {
-    return hasEqualOrGreaterPtx(major, minor) ||
-           hasEqualOrGreaterBin(major, minor);
+    return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
 }
 
 bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal<int>());
+    return cudaArch.hasEqualOrGreaterPtx(major, minor);
 #else
     (void)major;
     (void)minor;
@@ -161,8 +215,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
 bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor,
-                          std::greater_equal<int>());
+    return cudaArch.hasEqualOrGreaterBin(major, minor);
 #else
     (void)major;
     (void)minor;
@@ -170,6 +223,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
 #endif
 }
 
+bool cv::gpu::deviceSupports(FeatureSet feature_set)
+{
+    static int versions[] =
+    {
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+    };
+    static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));
+
+    const int devId = getDevice();
+
+    int version;
+
+    if (devId < cache_size && versions[devId] >= 0)
+        version = versions[devId];
+    else
+    {
+        DeviceInfo dev(devId);
+        version = dev.majorVersion() * 10 + dev.minorVersion();
+        if (devId < cache_size)
+            versions[devId] = version;
+    }
+
+    return TargetArchs::builtWith(feature_set) && (version >= feature_set);
+}
+
 #if !defined (HAVE_CUDA)
 
 #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
new file mode 100644
index 0000000000..091a160e31
--- /dev/null
+++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
@@ -0,0 +1,361 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__ +#define __OPENCV_GPU_REDUCE_DETAIL_HPP__ + +#include +#include "../warp.hpp" +#include "../warp_shuffle.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace reduce_detail + { + template struct GetType; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) + { + thrust::get(smem)[tid] = thrust::get(val); + + For::loadToSmem(smem, val, tid); + } + template + static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) + { + thrust::get(val) = thrust::get(smem)[tid]; + + For::loadFromSmem(smem, val, tid); + } + + template + static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op) + { + typename GetType::type>::type reg = thrust::get(smem)[tid + delta]; + thrust::get(smem)[tid] = thrust::get(val) = thrust::get(op)(thrust::get(val), reg); + + For::merge(smem, val, tid, delta, op); + } + template + static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op) + { + typename GetType::type>::type reg = shfl_down(thrust::get(val), delta, width); + thrust::get(val) = thrust::get(op)(thrust::get(val), reg); + + For::mergeShfl(val, delta, width, op); + } + }; + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int) + { + } + template + static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int) + { + } + + template + static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&) + { + } + template + static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&) + { + } + }; + + template + __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid) + { + smem[tid] = val; + } + template + __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid) + { + val = smem[tid]; + } + template + __device__ __forceinline__ void 
loadToSmem(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, val, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op) + { + T reg = smem[tid + delta]; + smem[tid] = val = op(val, reg); + } + template + __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op) + { + T reg = shfl_down(val, delta, width); + val = op(val, reg); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid, + unsigned int delta, + const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::merge(smem, val, tid, delta, op); + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& val, + unsigned int delta, + unsigned int width, + const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::mergeShfl(val, delta, width, op); + } + + template struct Generic + { + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + loadToSmem(smem, val, tid); + if (N >= 32) + __syncthreads(); + + if (N >= 2048) + { + if (tid < 1024) + merge(smem, val, tid, 1024, op); + + __syncthreads(); + } + if (N >= 1024) + { + if (tid < 512) + merge(smem, val, tid, 512, op); + + __syncthreads(); + } + if (N >= 512) + { + if (tid < 256) + merge(smem, val, tid, 256, op); + + __syncthreads(); + } + if (N >= 256) + { + if (tid < 128) + merge(smem, val, tid, 128, op); + + __syncthreads(); + } + if (N >= 128) + { + if (tid < 64) + merge(smem, val, tid, 64, op); + + __syncthreads(); + } + if (N >= 64) + { + if (tid < 32) + merge(smem, val, tid, 32, op); + } + + if (tid < 16) + { + merge(smem, val, tid, 16, op); + merge(smem, val, tid, 8, op); + merge(smem, val, tid, 4, op); + merge(smem, val, tid, 2, op); + merge(smem, val, tid, 1, op); + } + } + }; + + template + struct Unroll + { + static __device__ void loopShfl(Reference val, Op op, unsigned int N) + { + mergeShfl(val, I, N, op); + Unroll::loopShfl(val, op, N); + } + static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op) + { + merge(smem, val, tid, I, op); + Unroll::loop(smem, val, tid, op); + } + }; + template + struct Unroll<0, Pointer, Reference, Op> + { + static __device__ void loopShfl(Reference, Op, unsigned int) + { + } + static __device__ void loop(Pointer, Reference, unsigned int, Op) + { + } + }; + + template struct WarpOptimized + { + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + #if __CUDA_ARCH__ >= 300 + (void) smem; + (void) tid; + + Unroll::loopShfl(val, op, N); + #else + loadToSmem(smem, val, tid); + + if (tid < N / 2) + Unroll::loop(smem, val, tid, op); + #endif + } + }; + + template struct GenericOptimized32 + { + enum { M = N / 32 }; + + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + const unsigned int laneId = Warp::laneId(); + + #if __CUDA_ARCH__ >= 300 + Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize); + + if (laneId == 0) + loadToSmem(smem, val, tid / 32); + #else + loadToSmem(smem, val, tid); + + if (laneId < 16) + Unroll<16, Pointer, Reference, 
Op>::loop(smem, val, tid, op); + + __syncthreads(); + + if (laneId == 0) + loadToSmem(smem, val, tid / 32); + #endif + + __syncthreads(); + + loadFromSmem(smem, val, tid); + + if (tid < 32) + { + #if __CUDA_ARCH__ >= 300 + Unroll::loopShfl(val, op, M); + #else + Unroll::loop(smem, val, tid, op); + #endif + } + } + }; + + template struct StaticIf; + template struct StaticIf + { + typedef T1 type; + }; + template struct StaticIf + { + typedef T2 type; + }; + + template struct IsPowerOf2 + { + enum { value = ((N != 0) && !(N & (N - 1))) }; + }; + + template struct Dispatcher + { + typedef typename StaticIf< + (N <= 32) && IsPowerOf2::value, + WarpOptimized, + typename StaticIf< + (N <= 1024) && IsPowerOf2::value, + GenericOptimized32, + Generic + >::type + >::type reductor; + }; + } +}}} + +#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp new file mode 100644 index 0000000000..ca2c431273 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp @@ -0,0 +1,498 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ +#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ + +#include +#include "../warp.hpp" +#include "../warp_shuffle.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace reduce_key_val_detail + { + template struct GetType; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) + { + thrust::get(smem)[tid] = thrust::get(data); + + For::loadToSmem(smem, data, tid); + } + template + static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) + { + thrust::get(data) = thrust::get(smem)[tid]; + + For::loadFromSmem(smem, data, tid); + } + + template + static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width) + { + thrust::get(val) = shfl_down(thrust::get(val), delta, width); + + For::copyShfl(val, delta, width); + } + template + static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta) + { + thrust::get(svals)[tid] = thrust::get(val) = thrust::get(svals)[tid + delta]; + + For::copy(svals, val, tid, delta); + } + + template + static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width) + { + typename GetType::type>::type reg = shfl_down(thrust::get(key), delta, width); + + if (thrust::get(cmp)(reg, thrust::get(key))) + { + thrust::get(key) = reg; + thrust::get(val) = shfl_down(thrust::get(val), delta, width); + } + + For::mergeShfl(key, val, cmp, delta, width); + } + template + static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key, + const ValPointerTuple& svals, const ValReferenceTuple& val, + const CmpTuple& cmp, + unsigned int tid, unsigned int delta) + { + typename GetType::type>::type reg = thrust::get(skeys)[tid + delta]; + + if (thrust::get(cmp)(reg, thrust::get(key))) + { + thrust::get(skeys)[tid] = thrust::get(key) = reg; + thrust::get(svals)[tid] = thrust::get(val) = thrust::get(svals)[tid + delta]; + } + + For::merge(skeys, key, svals, val, cmp, tid, delta); + } + }; + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) + { + } + template + static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) + { + } + + template + static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int) + { + } + template + static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int) + { + } + + template + static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int) + { + } + template + static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&, + const ValPointerTuple&, const ValReferenceTuple&, + const CmpTuple&, + unsigned int, unsigned int) + { + } + }; + + ////////////////////////////////////////////////////// + // loadToSmem + + template + __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid) + { + smem[tid] = data; + } + template + __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid) + { + data = smem[tid]; + } 
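// Note on the For<I, N> helpers above: they unroll tuple traversal at compile
// time, one specialization per element, with For<N, N> as the empty terminator.
// For a two-element tuple, For<0, 2>::loadToSmem(smem, data, tid) effectively
// performs (a sketch for the reader, not code from this patch):
//
//     thrust::get<0>(smem)[tid] = thrust::get<0>(data);   // For<0, 2>
//     thrust::get<1>(smem)[tid] = thrust::get<1>(data);   // For<1, 2>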
+ template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, + const thrust::tuple& data, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, data, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, + const thrust::tuple& data, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, data, tid); + } + + ////////////////////////////////////////////////////// + // copyVals + + template + __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width) + { + val = shfl_down(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta) + { + svals[tid] = val = svals[tid + delta]; + } + template + __device__ __forceinline__ void copyValsShfl(const thrust::tuple& val, + unsigned int delta, + int width) + { + For<0, thrust::tuple_size >::value>::copyShfl(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::copy(svals, val, tid, delta); + } + + ////////////////////////////////////////////////////// + // merge + + template + __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(K& key, + const thrust::tuple& val, + const Cmp& cmp, + unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& key, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int delta, int width) + { + For<0, thrust::tuple_size >::value>::mergeShfl(key, val, cmp, delta, width); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); + } + + ////////////////////////////////////////////////////// + // Generic + + template struct Generic + { + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + loadToSmem(skeys, key, tid); + loadValsToSmem(svals, val, tid); + if (N >= 32) + __syncthreads(); + + if (N >= 2048) + { + if (tid < 1024) + merge(skeys, key, svals, val, cmp, tid, 1024); + + __syncthreads(); + } + if (N >= 1024) + { + if (tid < 512) + merge(skeys, key, svals, val, cmp, tid, 512); + + __syncthreads(); + } + 
if (N >= 512) + { + if (tid < 256) + merge(skeys, key, svals, val, cmp, tid, 256); + + __syncthreads(); + } + if (N >= 256) + { + if (tid < 128) + merge(skeys, key, svals, val, cmp, tid, 128); + + __syncthreads(); + } + if (N >= 128) + { + if (tid < 64) + merge(skeys, key, svals, val, cmp, tid, 64); + + __syncthreads(); + } + if (N >= 64) + { + if (tid < 32) + merge(skeys, key, svals, val, cmp, tid, 32); + } + + if (tid < 16) + { + merge(skeys, key, svals, val, cmp, tid, 16); + merge(skeys, key, svals, val, cmp, tid, 8); + merge(skeys, key, svals, val, cmp, tid, 4); + merge(skeys, key, svals, val, cmp, tid, 2); + merge(skeys, key, svals, val, cmp, tid, 1); + } + } + }; + + template + struct Unroll + { + static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N) + { + mergeShfl(key, val, cmp, I, N); + Unroll::loopShfl(key, val, cmp, N); + } + static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + merge(skeys, key, svals, val, cmp, tid, I); + Unroll::loop(skeys, key, svals, val, tid, cmp); + } + }; + template + struct Unroll<0, KP, KR, VP, VR, Cmp> + { + static __device__ void loopShfl(KR, VR, Cmp, unsigned int) + { + } + static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp) + { + } + }; + + template struct WarpOptimized + { + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + #if __CUDA_ARCH__ >= 300 + (void) skeys; + (void) svals; + (void) tid; + + Unroll::loopShfl(key, val, cmp, N); + #else + loadToSmem(skeys, key, tid); + loadToSmem(svals, val, tid); + + if (tid < N / 2) + Unroll::loop(skeys, key, svals, val, tid, cmp); + #endif + } + }; + + template struct GenericOptimized32 + { + enum { M = N / 32 }; + + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + const unsigned int laneId = Warp::laneId(); + + #if __CUDA_ARCH__ >= 300 + Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize); + + if (laneId == 0) + { + loadToSmem(skeys, key, tid / 32); + loadToSmem(svals, val, tid / 32); + } + #else + loadToSmem(skeys, key, tid); + loadToSmem(svals, val, tid); + + if (laneId < 16) + Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp); + + __syncthreads(); + + if (laneId == 0) + { + loadToSmem(skeys, key, tid / 32); + loadToSmem(svals, val, tid / 32); + } + #endif + + __syncthreads(); + + loadFromSmem(skeys, key, tid); + + if (tid < 32) + { + #if __CUDA_ARCH__ >= 300 + loadFromSmem(svals, val, tid); + + Unroll::loopShfl(key, val, cmp, M); + #else + Unroll::loop(skeys, key, svals, val, tid, cmp); + #endif + } + } + }; + + template struct StaticIf; + template struct StaticIf + { + typedef T1 type; + }; + template struct StaticIf + { + typedef T2 type; + }; + + template struct IsPowerOf2 + { + enum { value = ((N != 0) && !(N & (N - 1))) }; + }; + + template struct Dispatcher + { + typedef typename StaticIf< + (N <= 32) && IsPowerOf2::value, + WarpOptimized, + typename StaticIf< + (N <= 1024) && IsPowerOf2::value, + GenericOptimized32, + Generic + >::type + >::type reductor; + }; + } +}}} + +#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp deleted file mode 100644 index 0274f204a2..0000000000 --- a/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp +++ /dev/null @@ -1,841 +0,0 @@ 
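The entire 841-line `reduction_detail.hpp` deleted below is superseded by the two new headers: the `WarpReductor`/`ReductionDispatcher` and `PredVal*` machinery collapses into the single `reduce` / `reduceKeyVal` entry points of `opencv2/gpu/device/reduce.hpp`, which pick warp shuffles or shared memory per architecture. At a typical call site the migration is roughly (assumed shape, not taken from this patch):

    // before (deleted below): key/value pairs via PredValReductionDispatcher
    //   utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myMin, myIdx, sdata, sidx, tid, less<float>());
    // after: one generic entry point
    //   reduceKeyVal<n>(sdata, myMin, sidx, myIdx, tid, less<float>());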
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-// -//M*/ - -#ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__ -#define __OPENCV_GPU_REDUCTION_DETAIL_HPP__ - -namespace cv { namespace gpu { namespace device -{ - namespace utility_detail - { - /////////////////////////////////////////////////////////////////////////////// - // Reductor - - template struct WarpReductor - { - template static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - if (tid < n) - data[tid] = partial_reduction; - if (n > 32) __syncthreads(); - - if (n > 32) - { - if (tid < n - 32) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - if (tid < 16) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 16) - { - if (tid < n - 16) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - if (tid < 8) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 8) - { - if (tid < n - 8) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - if (tid < 4) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 4) - { - if (tid < n - 4) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - if (tid < 2) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 2) - { - if (tid < n - 2) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - if (tid < 2) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - } - }; - template <> struct WarpReductor<64> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - __syncthreads(); - - if (tid < 32) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - template <> struct WarpReductor<32> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 16) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid 
+ 1 ]); - } - } - }; - template <> struct WarpReductor<16> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 8) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - template <> struct WarpReductor<8> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 4) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - - template struct ReductionDispatcher; - template <> struct ReductionDispatcher - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - WarpReductor::reduce(data, partial_reduction, tid, op); - } - }; - template <> struct ReductionDispatcher - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - if (tid < n) - data[tid] = partial_reduction; - __syncthreads(); - - - if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); } - if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); } - if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); } - - if (tid < 32) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - }; - - /////////////////////////////////////////////////////////////////////////////// - // PredValWarpReductor - - template struct PredValWarpReductor; - template <> struct PredValWarpReductor<64> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 32) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = sdata[tid + 32]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 32]; - } - - reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - template <> struct PredValWarpReductor<32> - { - template - static __device__ void reduce(T& 
myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 16) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - - template <> struct PredValWarpReductor<16> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 8) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - template <> struct PredValWarpReductor<8> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 4) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - - template struct PredValReductionDispatcher; - template <> struct PredValReductionDispatcher - { - template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - PredValWarpReductor::reduce(myData, myVal, sdata, sval, tid, pred); - } - }; - template <> struct PredValReductionDispatcher - { - template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - myData = sdata[tid]; - myVal = sval[tid]; - - if (n >= 512 && tid < 256) - { - T reg = sdata[tid + 256]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 256]; - } - __syncthreads(); - } - if (n >= 256 && tid < 128) - { - T reg = sdata[tid + 128]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 128]; - } - __syncthreads(); - } - if (n >= 128 && tid < 64) - { - T reg = sdata[tid + 64]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 64]; - } - __syncthreads(); - } - - if (tid < 32) - { - if (n >= 64) - { - T reg = sdata[tid + 32]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 32]; - } - } - if (n >= 32) - { - T reg = sdata[tid + 16]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = 
sval[tid + 16]; - } - } - if (n >= 16) - { - T reg = sdata[tid + 8]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - } - if (n >= 8) - { - T reg = sdata[tid + 4]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - } - if (n >= 4) - { - T reg = sdata[tid + 2]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - } - if (n >= 2) - { - T reg = sdata[tid + 1]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - } - }; - - /////////////////////////////////////////////////////////////////////////////// - // PredVal2WarpReductor - - template struct PredVal2WarpReductor; - template <> struct PredVal2WarpReductor<64> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 32) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = sdata[tid + 32]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 32]; - sval2[tid] = myVal2 = sval2[tid + 32]; - } - - reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - template <> struct PredVal2WarpReductor<32> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 16) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - - template <> struct PredVal2WarpReductor<16> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 8) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = 
reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - template <> struct PredVal2WarpReductor<8> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 4) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - - template struct PredVal2ReductionDispatcher; - template <> struct PredVal2ReductionDispatcher - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - PredVal2WarpReductor::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); - } - }; - template <> struct PredVal2ReductionDispatcher - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - if (n >= 512 && tid < 256) - { - T reg = sdata[tid + 256]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 256]; - sval2[tid] = myVal2 = sval2[tid + 256]; - } - __syncthreads(); - } - if (n >= 256 && tid < 128) - { - T reg = sdata[tid + 128]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 128]; - sval2[tid] = myVal2 = sval2[tid + 128]; - } - __syncthreads(); - } - if (n >= 128 && tid < 64) - { - T reg = sdata[tid + 64]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 64]; - sval2[tid] = myVal2 = sval2[tid + 64]; - } - __syncthreads(); - } - - if (tid < 32) - { - if (n >= 64) - { - T reg = sdata[tid + 32]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 32]; - sval2[tid] = myVal2 = sval2[tid + 32]; - } - } - if (n >= 32) - { - T reg = sdata[tid + 16]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - } - if (n >= 16) - { - T reg = sdata[tid + 8]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - } - if (n >= 8) - { - T reg = sdata[tid + 4]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 
4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - } - if (n >= 4) - { - T reg = sdata[tid + 2]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - } - if (n >= 2) - { - T reg = sdata[tid + 1]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - } - }; - } // namespace utility_detail -}}} // namespace cv { namespace gpu { namespace device - -#endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/functional.hpp b/modules/gpu/include/opencv2/gpu/device/functional.hpp index c601cf5273..cd63c3ac91 100644 --- a/modules/gpu/include/opencv2/gpu/device/functional.hpp +++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp @@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device template <> struct name : binary_function \ { \ __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \ - __device__ __forceinline__ name(const name& other):binary_function(){}\ - __device__ __forceinline__ name():binary_function(){}\ + __device__ __forceinline__ name() {}\ + __device__ __forceinline__ name(const name&) {}\ }; template struct maximum : binary_function { __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { - return lhs < rhs ? rhs : lhs; + return max(lhs, rhs); } - __device__ __forceinline__ maximum(const maximum& other):binary_function(){} - __device__ __forceinline__ maximum():binary_function(){} + __device__ __forceinline__ maximum() {} + __device__ __forceinline__ maximum(const maximum&) {} }; OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max) @@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device { __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { - return lhs < rhs ? 
lhs : rhs;
+            return min(lhs, rhs);
         }
-        __device__ __forceinline__ minimum(const minimum& other):binary_function<T, T, T>(){}
-        __device__ __forceinline__ minimum():binary_function<T, T, T>(){}
+        __device__ __forceinline__ minimum() {}
+        __device__ __forceinline__ minimum(const minimum&) {}
     };
 
     OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
@@ -350,6 +350,108 @@ namespace cv { namespace gpu { namespace device
     // Math functions
 ///bound=========================================
 
+    template <typename T> struct abs_func : unary_function<T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType x) const
+        {
+            return abs(x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
+    {
+        __device__ __forceinline__ unsigned char operator ()(unsigned char x) const
+        {
+            return x;
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<signed char> : unary_function<signed char, signed char>
+    {
+        __device__ __forceinline__ signed char operator ()(signed char x) const
+        {
+            return ::abs(x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<char> : unary_function<char, char>
+    {
+        __device__ __forceinline__ char operator ()(char x) const
+        {
+            return ::abs(x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
+    {
+        __device__ __forceinline__ unsigned short operator ()(unsigned short x) const
+        {
+            return x;
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<short> : unary_function<short, short>
+    {
+        __device__ __forceinline__ short operator ()(short x) const
+        {
+            return ::abs(x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
+    {
+        __device__ __forceinline__ unsigned int operator ()(unsigned int x) const
+        {
+            return x;
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<int> : unary_function<int, int>
+    {
+        __device__ __forceinline__ int operator ()(int x) const
+        {
+            return ::abs(x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<float> : unary_function<float, float>
+    {
+        __device__ __forceinline__ float operator ()(float x) const
+        {
+            return ::fabsf(x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<double> : unary_function<double, double>
+    {
+        __device__ __forceinline__ double operator ()(double x) const
+        {
+            return ::fabs(x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+
     #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
     template <typename T> struct name ## _func : unary_function<T, float> \
     { \
@@ -357,6 +459,8 @@ namespace cv { namespace gpu { namespace device
         { \
             return func ## f(v); \
         } \
+        __device__ __forceinline__ name ## _func() {} \
+        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
     }; \
     template <> struct name ## _func<double> : unary_function<double, double> \
     { \
@@ -364,6 +468,8 @@ namespace cv { namespace gpu { namespace device
         { \
             return func(v); \
         } \
+        __device__ __forceinline__ name ## _func() {} \
+        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
     };
 
 #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
@@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device
         } \
     };
 
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)
     OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
     OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
     OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
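`abs_func` gives device code one functor that resolves to the right intrinsic per type (`::abs`, `::fabsf`, `::fabs`, or a no-op for unsigned types), and the explicit default/copy constructors added throughout keep these functors usable where nvcc requires default construction. A hypothetical kernel using it directly:

    #include "opencv2/gpu/device/functional.hpp"

    using namespace cv::gpu::device;

    __global__ void absDiff32S(const int* a, const int* b, int* out, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
        {
            abs_func<int> f;          // specialization calling ::abs
            out[i] = f(a[i] - b[i]);
        }
    }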
__forceinline__ name ## _func(const name ## _func&) {} \ }; #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \ @@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device } \ }; - OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2) diff --git a/modules/gpu/include/opencv2/gpu/device/reduce.hpp b/modules/gpu/include/opencv2/gpu/device/reduce.hpp new file mode 100644 index 0000000000..2161b06495 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/reduce.hpp @@ -0,0 +1,197 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_REDUCE_HPP__ +#define __OPENCV_GPU_REDUCE_HPP__ + +#include +#include "detail/reduce.hpp" +#include "detail/reduce_key_val.hpp" + +namespace cv { namespace gpu { namespace device +{ + template + __device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op) + { + reduce_detail::Dispatcher::reductor::template reduce(smem, val, tid, op); + } + template + __device__ __forceinline__ void reduce(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid, + const thrust::tuple& op) + { + reduce_detail::Dispatcher::reductor::template reduce< + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&>(smem, val, tid, op); + } + + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); + } + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, + const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce&, + const thrust::tuple&, + const Cmp&>(skeys, key, svals, val, tid, cmp); + } + template + __device__ __forceinline__ void reduceKeyVal(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, + const thrust::tuple& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce< + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple& + >(skeys, key, svals, val, tid, cmp); + } + + // smem_tuple + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0) + { + return thrust::make_tuple((volatile T0*) t0); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile 
T5*) t5, (volatile T6*) t6, (volatile T7*) t7);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
+    }
+}}}
+
+#endif // __OPENCV_GPU_REDUCE_HPP__
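A minimal usage sketch for the new reduction header (illustrative, not part of the patch; the kernel and buffer names are invented), assuming a 256-thread block:

    #include <thrust/tuple.h>
    #include "opencv2/gpu/device/reduce.hpp"
    #include "opencv2/gpu/device/functional.hpp"

    using namespace cv::gpu::device;

    __global__ void reduceDemo(const float* vals, float* sum, float* minVal, int* minIdx)
    {
        __shared__ float ssum[256];
        __shared__ float skeys[256];
        __shared__ int sidx[256];

        float v = vals[threadIdx.x];

        // block-wide sum; the dispatcher picks warp-shuffle or shared-memory code
        float s = v;
        reduce<256>(ssum, s, threadIdx.x, plus<float>());

        // keyed reduction: carry the owning thread index along with the minimum
        float key = v;
        int idx = threadIdx.x;
        reduceKeyVal<256>(skeys, key, sidx, idx, threadIdx.x, less<float>());

        if (threadIdx.x == 0)
        {
            *sum = s;
            *minVal = key;
            *minIdx = idx;
        }
    }

The smem_tuple overloads exist so that several values can ride one reduction: smem_tuple(s1, s2) paired with thrust::tie(a, b) fuses two payloads into a single pass, which is how bf_match.cu below folds its train-index and image-index payloads together.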
diff --git a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp
index 7bb1da751f..7a2799fa37 100644
--- a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp
@@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device
     template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
     {
-        return (uchar) ::max((int)v, 0);
+        uint res = 0;
+        int vi = v;
+        asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
     }
     template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
     {
-        return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
     }
     template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
     {
-        return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
+        uint res = 0;
+        asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
     }
     template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
     {
-        return (uchar) ::min(v, (uint)UCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
     }
-    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
-    {
-        return saturate_cast<uchar>((uint)v);
-    }
-
     template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
     {
-        int iv = __float2int_rn(v);
-        return saturate_cast<uchar>(iv);
+        uint res = 0;
+        asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
+        return res;
     }
     template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
     {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
-        return saturate_cast<uchar>(iv);
+    #if __CUDA_ARCH__ >= 130
+        uint res = 0;
+        asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
+        return res;
     #else
         return saturate_cast<uchar>((float)v);
     #endif
@@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device
     template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
     {
-        return (schar) ::min((int)v, SCHAR_MAX);
+        uint res = 0;
+        uint vi = v;
+        asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
     }
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
     {
-        return (schar) ::min((uint)v, (uint)SCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
     }
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
     {
-        return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
+        uint res = 0;
+        asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
     }
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
     {
-        return saturate_cast<schar>((int)v);
+        uint res = 0;
+        asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
     }
     template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
     {
-        return (schar) ::min(v, (uint)SCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
     }
-
     template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
     {
-        int iv = __float2int_rn(v);
-        return saturate_cast<schar>(iv);
+        uint res = 0;
+        asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
+        return res;
     }
     template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
     {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
-        return saturate_cast<schar>(iv);
+    #if __CUDA_ARCH__ >= 130
+        uint res = 0;
+        asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
+        return res;
     #else
         return saturate_cast<schar>((float)v);
     #endif
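From the caller's side the PTX rewrite is transparent: the specializations keep their old signatures and simply compile down to single cvt instructions. An illustrative sketch (the kernel is invented, not part of the patch): rounding and clamping float data to 8 bits now costs one cvt.rni.sat.u8.f32 per element:

    #include "opencv2/gpu/device/saturate_cast.hpp"

    using namespace cv::gpu::device;

    __global__ void quantizeToU8(const float* src, uchar* dst, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = saturate_cast<uchar>(src[i]); // round-to-nearest + clamp to [0, 255]
    }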
@@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device
     template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
     {
-        return (ushort) ::max((int)v, 0);
+        ushort res = 0;
+        int vi = v;
+        asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
+        return res;
     }
     template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
     {
-        return (ushort) ::max((int)v, 0);
+        ushort res = 0;
+        asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
+        return res;
     }
     template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
     {
-        return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
+        ushort res = 0;
+        asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
     }
     template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
     {
-        return (ushort) ::min(v, (uint)USHRT_MAX);
+        ushort res = 0;
+        asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
     }
     template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
     {
-        int iv = __float2int_rn(v);
-        return saturate_cast<ushort>(iv);
+        ushort res = 0;
+        asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
+        return res;
     }
     template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
     {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
-        return saturate_cast<ushort>(iv);
+    #if __CUDA_ARCH__ >= 130
+        ushort res = 0;
+        asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
+        return res;
     #else
         return saturate_cast<ushort>((float)v);
     #endif
@@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device
     template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
     {
-        return (short) ::min((int)v, SHRT_MAX);
+        short res = 0;
+        asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
+        return res;
     }
     template<> __device__ __forceinline__ short saturate_cast<short>(int v)
     {
-        return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
+        short res = 0;
+        asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
     }
     template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
     {
-        return (short) ::min(v, (uint)SHRT_MAX);
+        short res = 0;
+        asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
     }
     template<> __device__ __forceinline__ short saturate_cast<short>(float v)
     {
-        int iv = __float2int_rn(v);
-        return saturate_cast<short>(iv);
+        short res = 0;
+        asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
+        return res;
     }
     template<> __device__ __forceinline__ short saturate_cast<short>(double v)
     {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
-        return saturate_cast<short>(iv);
+    #if __CUDA_ARCH__ >= 130
+        short res = 0;
+        asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
+        return res;
     #else
         return saturate_cast<short>((float)v);
     #endif
     }

+    template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
+    {
+        int res = 0;
+        asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
     template<> __device__ __forceinline__ int saturate_cast<int>(float v)
     {
         return __float2int_rn(v);
@@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device
     #endif
     }

+    template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
+    {
+        uint res = 0;
+        int vi = v;
+        asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
     template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
     {
         return __float2uint_rn(v);
diff --git a/modules/gpu/include/opencv2/gpu/device/utility.hpp b/modules/gpu/include/opencv2/gpu/device/utility.hpp
index 4489a20b15..dd2dd8af01 100644
--- a/modules/gpu/include/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp
@@ -45,7 +45,6 @@
 #include "saturate_cast.hpp"
 #include "datamov_utils.hpp"
-#include "detail/reduction_detail.hpp"

 namespace cv { namespace gpu { namespace device
 {
@@ -156,29 +155,6 @@ namespace cv { namespace
gpu { namespace device } }; - /////////////////////////////////////////////////////////////////////////////// - // Reduction - - template __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::ReductionDispatcher::reduce(data, partial_reduction, tid, op); - } - - template - __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::PredValReductionDispatcher::reduce(myData, myVal, sdata, sval, tid, pred); - } - - template - __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::PredVal2ReductionDispatcher::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); - } - /////////////////////////////////////////////////////////////////////////////// // Solve linear system diff --git a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp index b7861bca75..d5b4bb202c 100644 --- a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp +++ b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp @@ -43,7 +43,7 @@ #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__ #define __OPENCV_GPU_VEC_DISTANCE_HPP__ -#include "utility.hpp" +#include "reduce.hpp" #include "functional.hpp" #include "detail/vec_distance_detail.hpp" @@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const @@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const diff --git a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp index 0ec790c0b7..1c46dc0c33 100644 --- a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp +++ b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp @@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! 
, logical_not) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \ - OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \ + OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \ @@ -327,4 +327,4 @@ namespace cv { namespace gpu { namespace device #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP }}} // namespace cv { namespace gpu { namespace device -#endif // __OPENCV_GPU_VECMATH_HPP__ \ No newline at end of file +#endif // __OPENCV_GPU_VECMATH_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp new file mode 100644 index 0000000000..8b4479a79b --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp @@ -0,0 +1,145 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
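The new warp_shuffle.hpp header introduced here wraps __shfl, __shfl_up and __shfl_down so that unsigned int and double work through one overload set and pre-Kepler builds still compile (the fallbacks return zero). A sketch of the intended use (illustrative, not part of the patch; assumes the kernel only runs on compute capability 3.0 or newer):

    #include "opencv2/gpu/device/warp_shuffle.hpp"

    using namespace cv::gpu::device;

    // illustrative: warp-wide sum without shared memory or __syncthreads();
    // the double overload splits the value into two 32-bit shuffles internally
    __device__ double warpSum(double val)
    {
        for (int delta = 16; delta >= 1; delta /= 2)
            val += shfl_down(val, delta);
        return val;
    }

This is the same butterfly pattern the bf_knnmatch.cu kernels further down use to fold their best-distance candidates across a warp.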
+// +//M*/ + +#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__ +#define __OPENCV_GPU_WARP_SHUFFLE_HPP__ + +namespace cv { namespace gpu { namespace device +{ + template + __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl(val, srcLane, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl((int) val, srcLane, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl(lo, srcLane, width); + hi = __shfl(hi, srcLane, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } + + template + __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl_down(val, delta, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl_down((int) val, delta, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl_down(lo, delta, width); + hi = __shfl_down(hi, delta, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } + + template + __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl_up(val, delta, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl_up((int) val, delta, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl_up(lo, delta, width); + hi = __shfl_up(hi, delta, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } +}}} + +#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index d83b23ee27..7f7da9e9fb 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -792,31 +792,23 @@ private: GpuMat lab, l, ab; }; - -struct CV_EXPORTS CannyBuf; - -CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); -CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); -CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); -CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); - struct CV_EXPORTS CannyBuf { - CannyBuf() {} - explicit CannyBuf(const Size& image_size, int apperture_size = 3) 
{create(image_size, apperture_size);} - CannyBuf(const GpuMat& dx_, const GpuMat& dy_); - void create(const Size& image_size, int apperture_size = 3); - void release(); GpuMat dx, dy; - GpuMat dx_buf, dy_buf; - GpuMat edgeBuf; - GpuMat trackBuf1, trackBuf2; + GpuMat mag; + GpuMat map; + GpuMat st1, st2; Ptr filterDX, filterDY; }; +CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); +CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); +CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); +CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); + class CV_EXPORTS ImagePyramid { public: @@ -1036,11 +1028,9 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels //! Calculates histogram for 8u one channel image //! Output hist will have one row, 256 cols and CV32SC1 type. CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null()); -CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //! normalizes the grayscale image brightness and contrast by normalizing its histogram CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); -CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null()); CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //////////////////////////////// StereoBM_GPU //////////////////////////////// diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 2ecfd7dde8..a9c8b5984d 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -581,13 +581,12 @@ PERF_TEST_P(Sz, ImgProc_CalcHist, GPU_TYPICAL_MAT_SIZES) { cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_hist; - cv::gpu::GpuMat d_buf; - cv::gpu::calcHist(d_src, d_hist, d_buf); + cv::gpu::calcHist(d_src, d_hist); TEST_CYCLE() { - cv::gpu::calcHist(d_src, d_hist, d_buf); + cv::gpu::calcHist(d_src, d_hist); } GPU_SANITY_CHECK(d_hist); diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu index b31f25ca8b..491ac31112 100644 --- a/modules/gpu/src/cuda/bf_knnmatch.cu +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -42,10 +42,13 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" +#include "opencv2/gpu/device/warp_shuffle.hpp" namespace cv { namespace gpu { namespace device { @@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device int& bestTrainIdx1, int& bestTrainIdx2, float* s_distance, int* s_trainIdx) { + #if __CUDA_ARCH__ >= 300 + (void) s_distance; + (void) s_trainIdx; + + float d1, d2; + int i1, i2; + + #pragma unroll + for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2) + { + d1 = shfl_down(bestDistance1, i, BLOCK_SIZE); + d2 = shfl_down(bestDistance2, i, BLOCK_SIZE); + i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE); + 
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE); + + if (bestDistance1 < d1) + { + if (d1 < bestDistance2) + { + bestDistance2 = d1; + bestTrainIdx2 = i1; + } + } + else + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestDistance1 = d1; + bestTrainIdx1 = i1; + + if (d2 < bestDistance2) + { + bestDistance2 = d2; + bestTrainIdx2 = i2; + } + } + } + #else float myBestDistance1 = numeric_limits::max(); float myBestDistance2 = numeric_limits::max(); int myBestTrainIdx1 = -1; @@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device bestTrainIdx1 = myBestTrainIdx1; bestTrainIdx2 = myBestTrainIdx2; + #endif } template @@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device int& bestImgIdx1, int& bestImgIdx2, float* s_distance, int* s_trainIdx, int* s_imgIdx) { + #if __CUDA_ARCH__ >= 300 + (void) s_distance; + (void) s_trainIdx; + (void) s_imgIdx; + + float d1, d2; + int i1, i2; + int j1, j2; + + #pragma unroll + for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2) + { + d1 = shfl_down(bestDistance1, i, BLOCK_SIZE); + d2 = shfl_down(bestDistance2, i, BLOCK_SIZE); + i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE); + i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE); + j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE); + j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE); + + if (bestDistance1 < d1) + { + if (d1 < bestDistance2) + { + bestDistance2 = d1; + bestTrainIdx2 = i1; + bestImgIdx2 = j1; + } + } + else + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + bestImgIdx2 = bestImgIdx1; + + bestDistance1 = d1; + bestTrainIdx1 = i1; + bestImgIdx1 = j1; + + if (d2 < bestDistance2) + { + bestDistance2 = d2; + bestTrainIdx2 = i2; + bestImgIdx2 = j2; + } + } + } + #else float myBestDistance1 = numeric_limits::max(); float myBestDistance2 = numeric_limits::max(); int myBestTrainIdx1 = -1; @@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device bestImgIdx1 = myBestImgIdx1; bestImgIdx2 = myBestImgIdx2; + #endif } /////////////////////////////////////////////////////////////////////////////// @@ -1005,7 +1096,7 @@ namespace cv { namespace gpu { namespace device s_trainIdx[threadIdx.x] = bestIdx; __syncthreads(); - reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); + reduceKeyVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); if (threadIdx.x == 0) { @@ -1164,4 +1255,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device { -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index f50089ed94..9745dee82f 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -42,7 +42,9 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" @@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device s_distance += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE; - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - - __syncthreads(); - - reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); + reduceKeyVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); } template @@ -75,13 +72,7 
@@ namespace cv { namespace gpu { namespace device s_trainIdx += threadIdx.y * BLOCK_SIZE; s_imgIdx += threadIdx.y * BLOCK_SIZE; - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - s_imgIdx [threadIdx.x] = bestImgIdx; - - __syncthreads(); - - reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); + reduceKeyVal(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less()); } /////////////////////////////////////////////////////////////////////////////// @@ -782,4 +773,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device { -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index 934b8fe84c..bb828829f0 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -42,7 +42,8 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" @@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device __global__ void matchUnrolled(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - extern __shared__ int smem[]; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; @@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device bestDistance.ptr(queryIdx)[ind] = distVal; } } - - #endif } template @@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device __global__ void match(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - extern __shared__ int smem[]; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; @@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device bestDistance.ptr(queryIdx)[ind] = distVal; } } - - #endif } template @@ -469,4 +462,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index 40c847547e..0fd482c41a 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -42,9 +42,10 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/reduce.hpp" namespace cv { namespace gpu { namespace device { @@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); } + __device__ __forceinline__ TransformOp() {} + __device__ __forceinline__ TransformOp(const TransformOp&) {} }; void call(const PtrStepSz src, const float* rot, @@ -103,6 
+106,8 @@ namespace cv { namespace gpu { namespace device (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); } + __device__ __forceinline__ ProjectOp() {} + __device__ __forceinline__ ProjectOp(const ProjectOp&) {} }; void call(const PtrStepSz src, const float* rot, @@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device return x * x; } + template __global__ void computeHypothesisScoresKernel( const int num_points, const float3* object, const float2* image, const float dist_threshold, int* g_num_inliers) @@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device ++num_inliers; } - extern __shared__ float s_num_inliers[]; - s_num_inliers[threadIdx.x] = num_inliers; - __syncthreads(); - - for (int step = blockDim.x / 2; step > 0; step >>= 1) - { - if (threadIdx.x < step) - s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step]; - __syncthreads(); - } + __shared__ int s_num_inliers[BLOCK_SIZE]; + reduce(s_num_inliers, num_inliers, threadIdx.x, plus()); if (threadIdx.x == 0) - g_num_inliers[blockIdx.x] = s_num_inliers[0]; + g_num_inliers[blockIdx.x] = num_inliers; } void computeHypothesisScores( @@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device dim3 threads(256); dim3 grid(num_hypotheses); - int smem_size = threads.x * sizeof(float); - computeHypothesisScoresKernel<<>>( + computeHypothesisScoresKernel<256><<>>( num_points, object, image, dist_threshold, hypothesis_scores); cudaSafeCall( cudaGetLastError() ); @@ -193,4 +190,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu index 3dc0486783..b08a61c83e 100644 --- a/modules/gpu/src/cuda/canny.cu +++ b/modules/gpu/src/cuda/canny.cu @@ -43,459 +43,463 @@ #if !defined CUDA_DISABLER #include -#include -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/emulation.hpp" +#include "opencv2/gpu/device/transform.hpp" +#include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/utility.hpp" -namespace cv { namespace gpu { namespace device +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace { - namespace canny + struct L1 : binary_function { - __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) + __device__ __forceinline__ float operator ()(int x, int y) const { - __shared__ int smem[16][18]; - - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; - - if (i < rows) - { - smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j]; - if (threadIdx.x == 0) - { - smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)]; - smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)]; - } - __syncthreads(); - - if (j < cols) - { - dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2]; - dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2]; - } - } + return ::abs(x) + ::abs(y); } - void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) + __device__ __forceinline__ L1() {} + __device__ __forceinline__ L1(const L1&) {} + }; + struct L2 : binary_function + { + __device__ __forceinline__ float operator ()(int x, int y) const { - 
dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + return ::sqrtf(x * x + y * y); + } - calcSobelRowPass<<>>(src, dx_buf, dy_buf, rows, cols); - cudaSafeCall( cudaGetLastError() ); + __device__ __forceinline__ L2() {} + __device__ __forceinline__ L2(const L2&) {} + }; +} - cudaSafeCall( cudaDeviceSynchronize() ); - } +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_shift = 4 }; + }; +}}} - struct L1 - { - static __device__ __forceinline__ float calc(int x, int y) - { - return ::abs(x) + ::abs(y); - } - }; - struct L2 - { - static __device__ __forceinline__ float calc(int x, int y) - { - return ::sqrtf(x * x + y * y); - } - }; +namespace +{ + texture tex_src(false, cudaFilterModePoint, cudaAddressModeClamp); + struct SrcTex + { + const int xoff; + const int yoff; + __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {} - template __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, - PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) + __device__ __forceinline__ int operator ()(int y, int x) const { - __shared__ int sdx[18][16]; - __shared__ int sdy[18][16]; + return tex2D(tex_src, x + xoff, y + yoff); + } + }; - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; + template __global__ + void calcMagnitude(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; - if (j < cols) - { - sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j]; - sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j]; - if (threadIdx.y == 0) - { - sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j]; - sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j]; + if (y >= mag.rows || x >= mag.cols) + return; - sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j]; - sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j]; - } - __syncthreads(); + int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1)); + int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1)); - if (i < rows) - { - int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x]; - int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x]; + dx(y, x) = dxVal; + dy(y, x) = dyVal; - dx.ptr(i)[j] = x; - dy.ptr(i)[j] = y; + mag(y, x) = norm(dxVal, dyVal); + } +} - mag.ptr(i + 1)[j + 1] = Norm::calc(x, y); - } - } - } +namespace canny +{ + void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + { + const dim3 block(16, 16); + const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y)); - void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + bindTexture(&tex_src, srcWhole); + SrcTex src(xoff, yoff); - if (L2Grad) - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); - else - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, 
mag, rows, cols); + if (L2Grad) + { + L2 norm; + ::calcMagnitude<<>>(src, dx, dy, mag, norm); + } + else + { + L1 norm; + ::calcMagnitude<<>>(src, dx, dy, mag, norm); + } - cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); - } + cudaSafeCall(cudaThreadSynchronize()); + } - template __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + { + if (L2Grad) { - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; - - if (i < rows && j < cols) - mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]); + L2 norm; + transform(dx, dy, mag, norm, WithOutMask(), 0); } - - void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) + else { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + L1 norm; + transform(dx, dy, mag, norm, WithOutMask(), 0); + } + } +} - if (L2Grad) - calcMagnitude<<>>(dx, dy, mag, rows, cols); - else - calcMagnitude<<>>(dx, dy, mag, rows, cols); +////////////////////////////////////////////////////////////////////////////////////////// - cudaSafeCall( cudaGetLastError() ); +namespace +{ + texture tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp); - cudaSafeCall( cudaDeviceSynchronize() ); - } + __global__ void calcMap(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh) + { + const int CANNY_SHIFT = 15; + const int TG22 = (int)(0.4142135623730950488016887242097*(1<= dx.cols || y >= dx.rows) + return; - __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) - { - __shared__ float smem[18][18]; + int dxVal = dx(y, x); + int dyVal = dy(y, x); - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + const int s = (dxVal ^ dyVal) < 0 ? -1 : 1; + const float m = tex2D(tex_mag, x, y); - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; + dxVal = ::abs(dxVal); + dyVal = ::abs(dyVal); - if (ly < 14) - smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; + // 0 - the pixel can not belong to an edge + // 1 - the pixel might belong to an edge + // 2 - the pixel does belong to an edge + int edge_type = 0; - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; + if (m > low_thresh) + { + const int tg22x = dxVal * TG22; + const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT); - __syncthreads(); + dyVal <<= CANNY_SHIFT; - if (i < rows && j < cols) + if (dyVal < tg22x) + { + if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y)) + edge_type = 1 + (int)(m > high_thresh); + } + else if(dyVal > tg67x) { - int x = dx.ptr(i)[j]; - int y = dy.ptr(i)[j]; - const int s = (x ^ y) < 0 ? 
-1 : 1; - const float m = smem[threadIdx.y + 1][threadIdx.x + 1]; + if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } + else + { + if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } + } - x = ::abs(x); - y = ::abs(y); + map(y, x) = edge_type; + } +} - // 0 - the pixel can not belong to an edge - // 1 - the pixel might belong to an edge - // 2 - the pixel does belong to an edge - int edge_type = 0; +namespace canny +{ + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh) + { + const dim3 block(16, 16); + const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y)); - if (m > low_thresh) - { - const int tg22x = x * TG22; - const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); - - y <<= CANNY_SHIFT; - - if (y < tg22x) - { - if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) - edge_type = 1 + (int)(m > high_thresh); - } - else if( y > tg67x ) - { - if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) - edge_type = 1 + (int)(m > high_thresh); - } - else - { - if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) - edge_type = 1 + (int)(m > high_thresh); - } - } + bindTexture(&tex_mag, mag); - map.ptr(i + 1)[j + 1] = edge_type; - } - } + ::calcMap<<>>(dx, dy, map, low_thresh, high_thresh); + cudaSafeCall( cudaGetLastError() ); - #undef CANNY_SHIFT - #undef TG22 + cudaSafeCall( cudaDeviceSynchronize() ); + } +} - void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) +////////////////////////////////////////////////////////////////////////////////////////// + +namespace +{ + __device__ int counter = 0; + + __global__ void edgesHysteresisLocal(PtrStepSzi map, ushort2* st) + { + __shared__ volatile int smem[18][18]; + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0; + if (threadIdx.y == 0) + smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0; + if (threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0; + if (threadIdx.x == 0) + smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1) + smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == 0) + smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0) + smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? 
map(y + 1, x + 1) : 0; + + __syncthreads(); + + if (x >= map.cols || y >= map.rows) + return; + + int n; + + #pragma unroll + for (int k = 0; k < 16; ++k) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + n = 0; - calcMap<<>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); - cudaSafeCall( cudaGetLastError() ); + if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) + { + n += smem[threadIdx.y ][threadIdx.x ] == 2; + n += smem[threadIdx.y ][threadIdx.x + 1] == 2; + n += smem[threadIdx.y ][threadIdx.x + 2] == 2; - cudaSafeCall( cudaDeviceSynchronize() ); - } + n += smem[threadIdx.y + 1][threadIdx.x ] == 2; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; - ////////////////////////////////////////////////////////////////////////////////////////// + n += smem[threadIdx.y + 2][threadIdx.x ] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; + } - __device__ unsigned int counter = 0; + if (n > 0) + smem[threadIdx.y + 1][threadIdx.x + 1] = 2; + } - __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) - { - #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120) + const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; - __shared__ int smem[18][18]; + map(y, x) = e; - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + n = 0; - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; + if (e == 2) + { + n += smem[threadIdx.y ][threadIdx.x ] == 1; + n += smem[threadIdx.y ][threadIdx.x + 1] == 1; + n += smem[threadIdx.y ][threadIdx.x + 2] == 1; - if (ly < 14) - smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; + n += smem[threadIdx.y + 1][threadIdx.x ] == 1; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; + n += smem[threadIdx.y + 2][threadIdx.x ] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; + } - __syncthreads(); + if (n > 0) + { + const int ind = ::atomicAdd(&counter, 1); + st[ind] = make_ushort2(x, y); + } + } +} - if (i < rows && j < cols) - { - int n; +namespace canny +{ + void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1) + { + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - #pragma unroll - for (int k = 0; k < 16; ++k) - { - n = 0; + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); - if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) - { - n += smem[threadIdx.y ][threadIdx.x ] == 2; - n += smem[threadIdx.y ][threadIdx.x + 1] == 2; - n += smem[threadIdx.y ][threadIdx.x + 2] == 2; + const dim3 block(16, 16); + const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y)); - n += smem[threadIdx.y + 1][threadIdx.x ] == 2; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; + ::edgesHysteresisLocal<<>>(map, st1); + cudaSafeCall( cudaGetLastError() ); - n += smem[threadIdx.y + 2][threadIdx.x ] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; - } + cudaSafeCall( cudaDeviceSynchronize() ); + } +} - if (n > 0) - smem[threadIdx.y + 1][threadIdx.x + 1] = 2; - } +////////////////////////////////////////////////////////////////////////////////////////// - const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; +namespace +{ + 
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; + __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - map.ptr(i + 1)[j + 1] = e; + __global__ void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count) + { + const int stack_size = 512; - n = 0; + __shared__ int s_counter; + __shared__ int s_ind; + __shared__ ushort2 s_st[stack_size]; - if (e == 2) - { - n += smem[threadIdx.y ][threadIdx.x ] == 1; - n += smem[threadIdx.y ][threadIdx.x + 1] == 1; - n += smem[threadIdx.y ][threadIdx.x + 2] == 1; + if (threadIdx.x == 0) + s_counter = 0; - n += smem[threadIdx.y + 1][threadIdx.x ] == 1; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; + __syncthreads(); - n += smem[threadIdx.y + 2][threadIdx.x ] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; - } + int ind = blockIdx.y * gridDim.x + blockIdx.x; - if (n > 0) - { - const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); - st[ind] = make_ushort2(j + 1, i + 1); - } - } + if (ind >= count) + return; - #endif - } + ushort2 pos = st1[ind]; - void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) + if (threadIdx.x < 8) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + pos.x += c_dx[threadIdx.x]; + pos.y += c_dy[threadIdx.x]; - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 && pos.y <= map.rows && map(pos.y, pos.x) == 1) + { + map(pos.y, pos.x) = 2; - edgesHysteresisLocal<<>>(map, st1, rows, cols); - cudaSafeCall( cudaGetLastError() ); + ind = Emulation::smem::atomicAdd(&s_counter, 1); - cudaSafeCall( cudaDeviceSynchronize() ); + s_st[ind] = pos; + } } - __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; - __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; + __syncthreads(); - __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) + while (s_counter > 0 && s_counter <= stack_size - blockDim.x) { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120 + const int subTaskIdx = threadIdx.x >> 3; + const int portion = ::min(s_counter, blockDim.x >> 3); - const int stack_size = 512; + if (subTaskIdx < portion) + pos = s_st[s_counter - 1 - subTaskIdx]; - __shared__ unsigned int s_counter; - __shared__ unsigned int s_ind; - __shared__ ushort2 s_st[stack_size]; + __syncthreads(); if (threadIdx.x == 0) - s_counter = 0; - __syncthreads(); + s_counter -= portion; - int ind = blockIdx.y * gridDim.x + blockIdx.x; + __syncthreads(); - if (ind < count) + if (subTaskIdx < portion) { - ushort2 pos = st1[ind]; + pos.x += c_dx[threadIdx.x & 7]; + pos.y += c_dy[threadIdx.x & 7]; - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 && pos.y <= map.rows && map(pos.y, pos.x) == 1) { - if (threadIdx.x < 8) - { - pos.x += c_dx[threadIdx.x]; - pos.y += c_dy[threadIdx.x]; - - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; - - ind = atomicInc(&s_counter, (unsigned int)(-1)); - - s_st[ind] = pos; - } - } - __syncthreads(); - - while (s_counter > 0 && s_counter <= stack_size - blockDim.x) - { - const int subTaskIdx = threadIdx.x >> 3; - const int portion = ::min(s_counter, blockDim.x >> 3); - - pos.x = pos.y = 0; - - if (subTaskIdx < portion) - pos = s_st[s_counter - 1 - subTaskIdx]; - __syncthreads(); - - if 
(threadIdx.x == 0) - s_counter -= portion; - __syncthreads(); - - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) - { - pos.x += c_dx[threadIdx.x & 7]; - pos.y += c_dy[threadIdx.x & 7]; - - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; - - ind = atomicInc(&s_counter, (unsigned int)(-1)); - - s_st[ind] = pos; - } - } - __syncthreads(); - } - - if (s_counter > 0) - { - if (threadIdx.x == 0) - { - ind = atomicAdd(&counter, s_counter); - s_ind = ind - s_counter; - } - __syncthreads(); - - ind = s_ind; - - for (int i = threadIdx.x; i < s_counter; i += blockDim.x) - { - st2[ind + i] = s_st[i]; - } - } + map(pos.y, pos.x) = 2; + + ind = Emulation::smem::atomicAdd(&s_counter, 1); + + s_st[ind] = pos; } } - #endif + __syncthreads(); } - void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols) + if (s_counter > 0) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - unsigned int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - - while (count > 0) + if (threadIdx.x == 0) { - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); - - dim3 block(128, 1, 1); - dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1); - edgesHysteresisGlobal<<>>(map, st1, st2, rows, cols, count); - cudaSafeCall( cudaGetLastError() ); + ind = ::atomicAdd(&counter, s_counter); + s_ind = ind - s_counter; + } - cudaSafeCall( cudaDeviceSynchronize() ); + __syncthreads(); - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + ind = s_ind; - std::swap(st1, st2); - } + for (int i = threadIdx.x; i < s_counter; i += blockDim.x) + st2[ind + i] = s_st[i]; } + } +} - __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols) - { - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; +namespace canny +{ + void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2) + { + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, ::counter) ); - if (i < rows && j < cols) - dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); - } + int count; + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); - void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols) + while (count > 0) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); + + const dim3 block(128); + const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1); - getEdges<<>>(map, dst, rows, cols); + ::edgesHysteresisGlobal<<>>(map, st1, st2, count); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); + + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); + + std::swap(st1, st2); } - } // namespace canny -}}} // namespace cv { namespace gpu { namespace device + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +namespace +{ + struct GetEdges : unary_function + { + __device__ __forceinline__ uchar operator ()(int e) const + { + return (uchar)(-(e >> 1)); + } + + __device__ __forceinline__ GetEdges() {} + __device__ __forceinline__ GetEdges(const GetEdges&) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_shift = 4 }; + }; +}}} + +namespace 
+namespace canny
+{
+    void getEdges(PtrStepSzi map, PtrStepSzb dst)
+    {
+        transform(map, dst, GetEdges(), WithOutMask(), 0);
+    }
+}
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index c61601d4f7..071ad89ceb 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -42,405 +42,875 @@
 #if !defined CUDA_DISABLER
 
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/functional.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 
-namespace cv { namespace gpu { namespace device
-{
-    //////////////////////////////////////////////////////////////////////////
-    // add
-
-    template <typename T, typename D> struct Add : binary_function<T, T, D>
-    {
-        __device__ __forceinline__ D operator ()(T a, T b) const
-        {
-            return saturate_cast<D>(a + b);
-        }
-    };
-
-    template <> struct TransformFunctorTraits< Add<ushort, ushort> > : DefaultTransformFunctorTraits< Add<ushort, ushort> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Add<short, short> > : DefaultTransformFunctorTraits< Add<short, short> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Add<int, int> > : DefaultTransformFunctorTraits< Add<int, int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Add<float, float> > : DefaultTransformFunctorTraits< Add<float, float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    template <typename T, typename D> void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream)
-    {
-        if (mask.data)
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Add<T, D>(), SingleMask(mask), stream);
-        else
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Add<T, D>(), WithOutMask(), stream);
-    }
-
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace
+{
+    template <size_t src_size, size_t dst_size> struct ArithmFuncTraits
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 1 };
+    };
+
+    template <> struct ArithmFuncTraits<1, 1>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct ArithmFuncTraits<1, 2>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct ArithmFuncTraits<1, 4>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    template <> struct ArithmFuncTraits<2, 1>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct ArithmFuncTraits<2, 2>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
 
-    template void add_gpu<uchar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<uchar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, short>(const PtrStepSzb& src1, const
PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, 
const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct AddScalar : unary_function - { - AddScalar(double val_) : val(val_) {} - __device__ __forceinline__ D operator ()(T a) const - { - return saturate_cast(a + val); - } - const double val; + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; }; - - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct ArithmFuncTraits<2, 4> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : 
DefaultTransformFunctorTraits< AddScalar > + + template <> struct ArithmFuncTraits<4, 1> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct ArithmFuncTraits<4, 2> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct ArithmFuncTraits<4, 4> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +} + +////////////////////////////////////////////////////////////////////////// +// addMat - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) +namespace +{ + template struct VAdd4; + template <> struct VAdd4 : binary_function { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AddScalar op(val); - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); - } + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void 
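The ArithmFuncTraits table being introduced here keys the transform framework's launch configuration off the operand byte sizes instead of off each concrete functor. The sketch below illustrates the mechanism under simplified, assumed names (ArithmFuncTraitsSketch, reportLaunchConfig are made up for the example); the real dispatch is done by the transform() framework through the TransformFunctorTraits specializations in this file (see opencv2/gpu/device/transform.hpp, included above).

#include <cstdio>

// Sketch of traits-driven launch configuration (illustrative, not the patch):
// per-operand-size traits expose block dimensions and a "smart_shift",
// i.e. how many elements each thread processes.
template <size_t src_size, size_t dst_size> struct ArithmFuncTraitsSketch
{
    enum { block_dim_x = 32 };
    enum { block_dim_y = 8 };
    enum { shift = 1 };          // conservative default: 1 element per thread
};

template <> struct ArithmFuncTraitsSketch<1, 1>
{
    enum { block_dim_x = 32 };
    enum { block_dim_y = 8 };
    enum { shift = 4 };          // 1-byte operands: 4 packed elements per thread
};

template <typename T, typename D> void reportLaunchConfig(int cols, int rows)
{
    typedef ArithmFuncTraitsSketch<sizeof(T), sizeof(D)> Traits;
    const int x = (cols + Traits::block_dim_x * Traits::shift - 1) / (Traits::block_dim_x * Traits::shift);
    const int y = (rows + Traits::block_dim_y - 1) / Traits::block_dim_y;
    std::printf("grid %dx%d, block %dx%d, %d elem/thread\n",
                x, y, (int)Traits::block_dim_x, (int)Traits::block_dim_y, (int)Traits::shift);
}

int main()
{
    reportLaunchConfig<unsigned char, unsigned char>(1920, 1080); // hits the <1, 1> specialization
    reportLaunchConfig<float, float>(1920, 1080);                 // falls back to the generic template
    return 0;
}

A smart_shift of 4 for 1-byte operands lets each thread load and store four packed elements, which is what makes the vectorized byte paths below worthwhile.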
add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& 
mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // subtract - - template struct Subtract : binary_function + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function { - __device__ __forceinline__ D operator ()(T a, T b) const + __device__ __forceinline__ uint operator ()(int a, int b) const { - return saturate_cast(a - b); + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + 
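The VAdd4/VAdd2 functors wrap the PTX SIMD video instructions: on sm_30+ a single vadd4 performs four byte-wise saturated additions inside one 32-bit register, while the patch's sm_20 path obtains the same result by issuing the scalar vadd once per byte lane via the .b0-.b3 selectors. Below is a minimal self-contained demonstration of the idea; the kernel name vadd4_demo is made up for the example, and the #else branch is a plain C fallback rather than the patch's per-lane PTX.

#include <cstdio>
#include <cuda_runtime.h>

// Four unsigned bytes packed in one 32-bit word, added with per-byte
// saturation (sketch, assuming an sm_30+ device for the fast path).
__global__ void vadd4_demo(const unsigned int* a, const unsigned int* b, unsigned int* c)
{
    unsigned int res = 0;
#if __CUDA_ARCH__ >= 300
    asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a[0]), "r"(b[0]), "r"(res));
#else
    // Fallback: per-byte saturating add in plain C, equivalent in result.
    for (int i = 0; i < 4; ++i)
    {
        const unsigned int x = (a[0] >> (8 * i)) & 0xFF;
        const unsigned int y = (b[0] >> (8 * i)) & 0xFF;
        const unsigned int s = (x + y > 255u) ? 255u : (x + y);
        res |= s << (8 * i);
    }
#endif
    c[0] = res;
}

int main()
{
    unsigned int ha = 0xFF01FF01u, hb = 0x01010101u, hc = 0;
    unsigned int *da, *db, *dc;
    cudaMalloc(&da, sizeof(unsigned int));
    cudaMalloc(&db, sizeof(unsigned int));
    cudaMalloc(&dc, sizeof(unsigned int));
    cudaMemcpy(da, &ha, sizeof(unsigned int), cudaMemcpyHostToDevice);
    cudaMemcpy(db, &hb, sizeof(unsigned int), cudaMemcpyHostToDevice);
    vadd4_demo<<<1, 1>>>(da, db, dc);
    cudaMemcpy(&hc, dc, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    std::printf("0x%08X\n", hc);   // expected: 0xFF02FF02
    cudaFree(da); cudaFree(db); cudaFree(dc);
    return 0;
}

For inputs 0xFF01FF01 and 0x01010101 this prints 0xFF02FF02: lanes that would overflow clamp at 255 instead of wrapping, which is exactly the saturate_cast semantics of the scalar AddMat path.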
asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + + //////////////////////////////////// + + template struct VAdd2; + template <> struct VAdd2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + template <> struct VAdd2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + template <> struct VAdd2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ 
VAdd2(const VAdd2& other) {} + }; + + //////////////////////////////////// - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + template struct AddMat : binary_function + { + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(a + b); + } + + __device__ __forceinline__ AddMat() {} + __device__ __forceinline__ AddMat(const AddMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< VAdd4 > : ArithmFuncTraits + { + }; + + //////////////////////////////////// + + template struct TransformFunctorTraits< VAdd2 > : ArithmFuncTraits + { + }; + + //////////////////////////////////// + + template struct TransformFunctorTraits< AddMat > : ArithmFuncTraits + { + }; +}}} + +namespace arithm +{ + template + void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd4(), WithOutMask(), stream); + } + + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd2(), WithOutMask(), stream); + } + + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), SingleMask(mask), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), mask, stream); else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), WithOutMask(), stream); } - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const 
PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t 
stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct SubtractScalar : unary_function - { - SubtractScalar(double val_) : val(val_) {} + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void addMat(PtrStepSzb src1, PtrStepSzb 
src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb 
src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// addScalar + +namespace +{ + template struct AddScalar : unary_function + { + S val; + + explicit AddScalar(S val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { - return saturate_cast(a - val); + return saturate_cast(a + val); } - const double val; }; +} - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< AddScalar > : ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > +}}} + +namespace arithm +{ + template + void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + AddScalar op(static_cast(val)); + + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + 
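The new addScalar converts the incoming double to the compute type S exactly once on the host (static_cast<S>(val)), instead of storing a double in the functor and calling cudaSetDoubleForDevice as the old add_gpu/AddScalar did. A host-only sketch of the pattern, with assumed names (AddScalarSketch) and a hard-coded clamp standing in for saturate_cast:

#include <cstdio>

// Sketch of the scalar-functor pattern used by addScalar (illustrative, not
// the patch itself): the scalar is converted once to the compute type S, so
// uchar images can be processed with float arithmetic instead of double.
template <typename T, typename S, typename D> struct AddScalarSketch
{
    S val;
    explicit AddScalarSketch(S v) : val(v) {}

    D operator ()(T a) const
    {
        // Stand-in for saturate_cast<D>: clamp to the uchar range [0, 255].
        const S s = a + val;
        return (D)(s < 0 ? 0 : (s > 255 ? 255 : s));
    }
};

int main()
{
    // double -> S happens once, on the host, mirroring static_cast<S>(val).
    AddScalarSketch<unsigned char, float, unsigned char> op(static_cast<float>(200.0));

    const unsigned char src[3] = {10, 100, 250};
    for (int i = 0; i < 3; ++i)
        std::printf("%d ", op(src[i]));   // prints: 210 255 255 (saturated)
    std::printf("\n");
    return 0;
}

Choosing S = float for the small integer instantiations keeps devices where double arithmetic is slow or emulated on the fast path; only the genuinely double instantiations pay for double math.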
template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, 
PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// subMat + +namespace +{ + template struct VSub4; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + template <> struct VSub4 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif 
__CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + + //////////////////////////////////// + + template struct VSub2; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + + //////////////////////////////////// + + template struct SubMat : binary_function + { + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(a - b); + } + + __device__ __forceinline__ SubMat() {} + __device__ __forceinline__ SubMat(const SubMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< VSub4 > : ArithmFuncTraits + { + }; + + //////////////////////////////////// + + template struct TransformFunctorTraits< VSub2 > : 
ArithmFuncTraits + { + }; + + //////////////////////////////////// + + template struct TransformFunctorTraits< SubMat > : ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} + +namespace arithm +{ + template + void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub4(), WithOutMask(), stream); + } + + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + template + void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub2(), WithOutMask(), stream); + } + + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - SubtractScalar op(val); if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), mask, stream); else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), WithOutMask(), stream); } - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, 
cudaStream_t stream);
-        //template void subtract_gpu<signed char, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<signed char, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<signed char, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-        //template void subtract_gpu<unsigned short, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<unsigned short, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<unsigned short, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<unsigned short, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<unsigned short, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<unsigned short, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<unsigned short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-        //template void subtract_gpu<short, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<short, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<short, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<short, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<short, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<short, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-        //template void subtract_gpu<int, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<int, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<int, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<int, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<int, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<int, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-        //template void subtract_gpu<float, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<float, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<float, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<float, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<float, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<float, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-        //template void subtract_gpu<double, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<double, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<double, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<double, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<double, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        //template void subtract_gpu<double, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-        template void subtract_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-        //////////////////////////////////////////////////////////////////////////
-        // multiply
-
-        struct multiply_8uc4_32f : binary_function<uint, float, uint>
+    template void subMat<unsigned char, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<unsigned char, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<unsigned char, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<unsigned char, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<unsigned char, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<unsigned char, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<unsigned char, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    template void subMat<signed char, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<signed char, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<signed char, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<signed char, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<signed char, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<signed char, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<signed char, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subMat<unsigned short, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<unsigned short, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<unsigned short, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<unsigned short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<unsigned short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<unsigned short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<unsigned short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subMat<short, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<short, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<short, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subMat<int, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<int, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<int, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subMat<float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subMat<double, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<double, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<double, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
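The subMat instantiation grid above is shaped for a host-side dispatch table: one entry per (source depth, destination depth) pair, with narrowing pairs left commented out. A minimal sketch of the kind of table that would consume these symbols (the table name and wiring here are illustrative assumptions, not part of this patch):

    // Hypothetical host-side dispatch; 0 marks pairs that are not instantiated.
    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    static const func_t subTab[7][7] =
    {
        { arithm::subMat<unsigned char, unsigned char>, arithm::subMat<unsigned char, signed char>, /* ... */ },
        /* one row per source depth; 0 where the pair is commented out above */
    };
    subTab[src1.depth()][dst.depth()](src1, src2, dst, mask, stream);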
+
+//////////////////////////////////////////////////////////////////////////
+// subScalar
+
+namespace arithm
+{
+    template <typename T, typename S, typename D>
+    void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    {
+        AddScalar<T, S, D> op(-static_cast<S>(val));
+
+        if (mask.data)
+            transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
+        else
+            transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+    }
+
+    template void subScalar<unsigned char, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<unsigned char, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<unsigned char, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<unsigned char, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<unsigned char, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<unsigned char, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<unsigned char, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    template void subScalar<signed char, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<signed char, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<signed char, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<signed char, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<signed char, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<signed char, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<signed char, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subScalar<unsigned short, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<unsigned short, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<unsigned short, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<unsigned short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<unsigned short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<unsigned short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<unsigned short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subScalar<short, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<short, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subScalar<int, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subScalar<float, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subScalar<double, double, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
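Note that subScalar introduces no subtraction functor of its own: it negates the scalar once on the host and reuses AddScalar, so src - val and src + (-val) share one kernel. A usage sketch under that assumption (values and types illustrative; a default-constructed PtrStepb is assumed to carry a null data pointer, which selects the WithOutMask path):

    // dst = src - 5, computed as src + (-5)
    arithm::subScalar<unsigned char, float, unsigned char>(src, 5.0, dst, PtrStepb(), stream);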
+
+//////////////////////////////////////////////////////////////////////////
+// mulMat
+
+namespace
+{
+    struct Mul_8uc4_32f : binary_function<uint, float, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, float b) const
         {
@@ -453,890 +923,1112 @@ namespace cv { namespace gpu { namespace device
             return res;
         }
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f)
-    {
-        enum { smart_block_dim_x = 8 };
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 8 };
+
+        __device__ __forceinline__ Mul_8uc4_32f() {}
+        __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {}
     };
 
-    void multiply_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream)
-    {
-        cv::gpu::device::transform(static_cast< PtrStepSz<uint> >(src1), src2, static_cast< PtrStepSz<uint> >(dst), multiply_8uc4_32f(), WithOutMask(), stream);
-    }
-
-    struct multiply_16sc4_32f : binary_function<short4, float, short4>
+    struct Mul_16sc4_32f : binary_function<short4, float, short4>
     {
         __device__ __forceinline__ short4 operator ()(short4 a, float b) const
         {
             return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),
                                saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
         }
+
+        __device__ __forceinline__ Mul_16sc4_32f() {}
+        __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {}
     };
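Mul_8uc4_32f treats each uint as four packed 8-bit channels: a byte is unpacked with a shift and mask, scaled by the float factor, saturated, and ORed back into its lane. A host-side reference for one pixel that mirrors the per-lane arithmetic (rounding approximated; a sketch, not code from this patch):

    unsigned int mulPacked8u(unsigned int a, float b)
    {
        unsigned int res = 0;
        for (int i = 0; i < 4; ++i)
        {
            float v = ((a >> (8 * i)) & 0xffu) * b;    // unpack one byte and scale it
            unsigned int s = v <= 0.f   ? 0u
                           : v >= 255.f ? 255u
                           : (unsigned int)(v + 0.5f); // saturate to [0, 255]
            res |= s << (8 * i);                       // repack into the same lane
        }
        return res;
    }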
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f)
+    template <typename T, typename D> struct Mul : binary_function<T, T, D>
     {
-        enum { smart_block_dim_x = 8 };
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 8 };
+        __device__ __forceinline__ D operator ()(T a, T b) const
+        {
+            return saturate_cast<D>(a * b);
+        }
+
+        __device__ __forceinline__ Mul() {}
+        __device__ __forceinline__ Mul(const Mul& other) {}
     };
 
-    void multiply_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream)
+    template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
     {
-        cv::gpu::device::transform(static_cast< PtrStepSz<short4> >(src1), src2, static_cast< PtrStepSz<short4> >(dst), multiply_16sc4_32f(), WithOutMask(), stream);
-    }
+        S scale;
+
+        explicit MulScale(S scale_) : scale(scale_) {}
 
-    template <typename T, typename D> struct Multiply : binary_function<T, T, D>
-    {
-        Multiply(float scale_) : scale(scale_) {}
         __device__ __forceinline__ D operator ()(T a, T b) const
         {
             return saturate_cast<D>(scale * a * b);
         }
-        const float scale;
     };
-    template <typename T> struct Multiply<T, double> : binary_function<T, T, double>
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits<Mul_8uc4_32f> : ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+
+    template <typename T, typename D> struct TransformFunctorTraits< Mul<T, D> > : ArithmFuncTraits<sizeof(T), sizeof(D)>
     {
-        Multiply(double scale_) : scale(scale_) {}
-        __device__ __forceinline__ double operator ()(T a, T b) const
+    };
+
+    template <typename T, typename S, typename D> struct TransformFunctorTraits< MulScale<T, S, D> > : ArithmFuncTraits<sizeof(T), sizeof(D)>
+    {
+    };
+}}}
+
+namespace arithm
+{
+    void mulMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream);
+    }
+
+    void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream);
+    }
+
+    template <typename T, typename S, typename D>
+    void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
+    {
+        if (scale == 1)
         {
-            return scale * a * b;
+            Mul<T, D> op;
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
         }
-        const double scale;
-    };
-    template <> struct Multiply<int, int> : binary_function<int, int, int>
+        else
+        {
+            MulScale<T, S, D> op(static_cast<S>(scale));
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+        }
+    }
+
+    template void mulMat<unsigned char, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<unsigned char, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<unsigned char, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<unsigned char, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<unsigned char, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<unsigned char, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<unsigned char, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    template void mulMat<signed char, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<signed char, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<signed char, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<signed char, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<signed char, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<signed char, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<signed char, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void mulMat<unsigned short, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<unsigned short, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<unsigned short, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<unsigned short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<unsigned short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<unsigned short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<unsigned short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void mulMat<short, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<short, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<short, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void mulMat<int, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<int, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<int, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void mulMat<float, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<float, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<float, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void mulMat<double, double, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<double, double, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<double, double, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+}
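mulMat branches once per call, not per element: with scale == 1 it instantiates the plain Mul<T, D> functor, so the per-element work is a single multiply plus saturate; any other scale routes through MulScale<T, S, D>, which carries the factor in the working type S. Call sketch (the host wiring around these calls is assumed, not shown in this patch):

    arithm::mulMat<unsigned char, float, unsigned char>(src1, src2, dst, 1.0, stream); // Mul<uchar, uchar> path
    arithm::mulMat<unsigned char, float, unsigned char>(src1, src2, dst, 0.5, stream); // MulScale<uchar, float, uchar> path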
+
+//////////////////////////////////////////////////////////////////////////
+// mulScalar
+
+namespace
+{
+    template <typename T, typename S, typename D> struct MulScalar : unary_function<T, D>
     {
-        Multiply(double scale_) : scale(scale_) {}
-        __device__ __forceinline__ int operator ()(int a, int b) const
+        S val;
+
+        explicit MulScalar(S val_) : val(val_) {}
+
+        __device__ __forceinline__ D operator ()(T a) const
         {
-            return saturate_cast<int>(scale * a * b);
+            return saturate_cast<D>(a * val);
        }
-        const double scale;
     };
+}
 
-    template <> struct TransformFunctorTraits< Multiply<unsigned short, unsigned short> > : DefaultTransformFunctorTraits< Multiply<unsigned short, unsigned short> >
+namespace cv { namespace gpu { namespace device
+{
+    template <typename T, typename S, typename D> struct TransformFunctorTraits< MulScalar<T, S, D> > : ArithmFuncTraits<sizeof(T), sizeof(D)>
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< Multiply<short, short> > : DefaultTransformFunctorTraits< Multiply<short, short> >
+}}}
+
+namespace arithm
+{
+    template <typename T, typename S, typename D>
+    void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Multiply<int, int> > : DefaultTransformFunctorTraits< Multiply<int, int> >
+        MulScalar<T, S, D> op(static_cast<S>(val));
+        transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+    }
+
+    template void mulScalar<unsigned char, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<unsigned char, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<unsigned char, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<unsigned char, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<unsigned char, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<unsigned char, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<unsigned char, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    template void mulScalar<signed char, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<signed char, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<signed char, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<signed char, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<signed char, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<signed char, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<signed char, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar<unsigned short, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<unsigned short, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<unsigned short, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<unsigned short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<unsigned short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<unsigned short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<unsigned short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar<short, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<short, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<short, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar<int, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<int, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<int, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar<float, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<float, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<float, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar<double, double, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<double, double, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<double, double, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
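MulScalar multiplies in the working type S before saturating to D, which is why the 8- and 16-bit rows above use S = float: the product is formed without intermediate overflow and clamped only once. A worked per-element trace (illustrative values):

    // mulScalar<unsigned char, float, unsigned char>(src, 1.5, dst, stream):
    //   a = 200  ->  200 * 1.5f = 300.0f  ->  saturate_cast<uchar>(300.0f) == 255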
+
+//////////////////////////////////////////////////////////////////////////
+// divMat
+
+namespace
+{
+    struct Div_8uc4_32f : binary_function<uint, float, uint>
    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+        __device__ __forceinline__ uint operator ()(uint a, float b) const
+        {
+            uint res = 0;
+
+            if (b != 0)
+            {
+                b = 1.0f / b;
+                res |= (saturate_cast<uchar>((0xffu & (a      )) * b)      );
+                res |= (saturate_cast<uchar>((0xffu & (a >>  8)) * b) <<  8);
+                res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
+                res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
+            }
+
+            return res;
+        }
     };
-    template <> struct TransformFunctorTraits< Multiply<float, float> > : DefaultTransformFunctorTraits< Multiply<float, float> >
+
+    struct Div_16sc4_32f : binary_function<short4, float, short4>
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+        __device__ __forceinline__ short4 operator ()(short4 a, float b) const
+        {
+            return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<short>(a.y / b),
+                                        saturate_cast<short>(a.z / b), saturate_cast<short>(a.w / b))
+                          : make_short4(0,0,0,0);
+        }
     };
 
-    template <typename T, typename D> struct MultiplyCaller
+    template <typename T, typename D> struct Div : binary_function<T, T, D>
     {
-        static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+        __device__ __forceinline__ D operator ()(T a, T b) const
         {
-            Multiply<T, D> op(static_cast<float>(scale));
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+            return b != 0 ? saturate_cast<D>(a / b) : 0;
         }
+
+        __device__ __forceinline__ Div() {}
+        __device__ __forceinline__ Div(const Div& other) {}
     };
-    template <typename T> struct MultiplyCaller<T, double>
+    template <typename T> struct Div<T, float> : binary_function<T, T, float>
     {
-        static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+        __device__ __forceinline__ float operator ()(T a, T b) const
        {
-            cudaSafeCall( cudaSetDoubleForDevice(&scale) );
-            Multiply<T, double> op(scale);
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<double>)dst, op, WithOutMask(), stream);
+            return b != 0 ? static_cast<float>(a) / b : 0;
         }
+
+        __device__ __forceinline__ Div() {}
+        __device__ __forceinline__ Div(const Div& other) {}
     };
-    template <> struct MultiplyCaller<int, int>
+    template <typename T> struct Div<T, double> : binary_function<T, T, double>
     {
-        static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+        __device__ __forceinline__ double operator ()(T a, T b) const
         {
-            cudaSafeCall( cudaSetDoubleForDevice(&scale) );
-            Multiply<int, int> op(scale);
-            cv::gpu::device::transform((PtrStepSz<int>)src1, (PtrStepSz<int>)src2, (PtrStepSz<int>)dst, op, WithOutMask(), stream);
+            return b != 0 ? static_cast<double>(a) / b : 0;
         }
+
+        __device__ __forceinline__ Div() {}
+        __device__ __forceinline__ Div(const Div& other) {}
     };
 
-    template <typename T, typename D> void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+    template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
     {
-        MultiplyCaller<T, D>::call(src1, src2, dst, scale, stream);
-    }
+        S scale;
 
-    template void multiply_gpu<unsigned char, unsigned char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<unsigned char, signed char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned char, unsigned short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned char, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned char, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned char, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned char, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<signed char, unsigned char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, signed char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, unsigned short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<unsigned short, unsigned char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<unsigned short, signed char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned short, unsigned short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<unsigned short, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned short, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned short, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<short, unsigned char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<short, signed char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<short, unsigned short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<int, unsigned char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, signed char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, unsigned short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<float, unsigned char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, signed char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, unsigned short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<float, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<double, unsigned char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, signed char>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, unsigned short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    template <typename T, typename D> struct MultiplyScalar : unary_function<T, D>
-    {
-        MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {}
-        __device__ __forceinline__ D operator ()(T a) const
+        explicit DivScale(S scale_) : scale(scale_) {}
+
+        __device__ __forceinline__ D operator ()(T a, T b) const
         {
-            return saturate_cast<D>(scale * a * val);
+            return b != 0 ? saturate_cast<D>(scale * a / b) : 0;
         }
-        const double val;
-        const double scale;
     };
+}
 
-    template <> struct TransformFunctorTraits< MultiplyScalar<unsigned short, unsigned short> > : DefaultTransformFunctorTraits< MultiplyScalar<unsigned short, unsigned short> >
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits<Div_8uc4_32f> : ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< MultiplyScalar<short, short> > : DefaultTransformFunctorTraits< MultiplyScalar<short, short> >
+
+    template <typename T, typename D> struct TransformFunctorTraits< Div<T, D> > : ArithmFuncTraits<sizeof(T), sizeof(D)>
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< MultiplyScalar<int, int> > : DefaultTransformFunctorTraits< MultiplyScalar<int, int> >
+    };
+
+    template <typename T, typename S, typename D> struct TransformFunctorTraits< DivScale<T, S, D> > : ArithmFuncTraits<sizeof(T), sizeof(D)>
+    {
+    };
+}}}
+
+namespace arithm
+{
+    void divMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< MultiplyScalar<float, float> > : DefaultTransformFunctorTraits< MultiplyScalar<float, float> >
+        transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream);
+    }
+
+    void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    template <typename T, typename D> void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream)
-    {
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );
-        cudaSafeCall( cudaSetDoubleForDevice(&scale) );
-        MultiplyScalar<T, D> op(val, scale);
-        cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+        transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream);
     }
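The ArithmFuncTraits bridge replaces the hand-written enum blocks being deleted here: each TransformFunctorTraits specialization now forwards only the operand sizes, and the transform framework derives its launch shape from them. The definition lives in a part of this patch that is not shown; its expected shape is roughly the following (a sketch under that assumption, not verbatim from the patch):

    template <size_t src_size, size_t dst_size> struct ArithmFuncTraits
    {
        enum { simple_block_dim_x = 32, simple_block_dim_y = 8 };
        enum { smart_block_dim_x = 32, smart_block_dim_y = 8, smart_shift = 1 };
    };
    // specializations for 1-, 2- and 4-byte operands raise smart_shift so the
    // vectorized ("smart") transform path processes several elements per thread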
-    template void multiply_gpu<unsigned char, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<unsigned char, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned char, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned char, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned char, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned char, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned char, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<signed char, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<signed char, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<unsigned short, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<unsigned short, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned short, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<unsigned short, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned short, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned short, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<unsigned short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<short, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<short, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<short, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<int, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<float, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<float, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<double, unsigned char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, signed char>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, unsigned short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //////////////////////////////////////////////////////////////////////////
-    // divide
-
-    struct divide_8uc4_32f : binary_function<uchar4, float, uchar4>
-    {
-        __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const
-        {
-            return b != 0 ? make_uchar4(saturate_cast<uchar>(a.x / b), saturate_cast<uchar>(a.y / b),
-                                        saturate_cast<uchar>(a.z / b), saturate_cast<uchar>(a.w / b))
-                          : make_uchar4(0,0,0,0);
+    template <typename T, typename S, typename D>
+    void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
+    {
+        if (scale == 1)
+        {
+            Div<T, D> op;
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
         }
-    };
+        else
+        {
+            DivScale<T, S, D> op(static_cast<S>(scale));
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+        }
+    }
 
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f)
+    template void divMat<unsigned char, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<unsigned char, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<unsigned char, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<unsigned char, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<unsigned char, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<unsigned char, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<unsigned char, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    template void divMat<signed char, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<signed char, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<signed char, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<signed char, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<signed char, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<signed char, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<signed char, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void divMat<unsigned short, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<unsigned short, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<unsigned short, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<unsigned short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<unsigned short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<unsigned short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<unsigned short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void divMat<short, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<short, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<short, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void divMat<int, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<int, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<int, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void divMat<float, float, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<float, float, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<float, float, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void divMat<double, double, unsigned char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<double, double, signed char>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<double, double, unsigned short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+}
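Like the CPU cv::divide, every Div variant guards the divisor: b == 0 yields 0 instead of a trap or a NaN flowing into saturate_cast. A worked trace (illustrative values):

    // Div<int, int>()(10, 3)     -> saturate_cast<int>(10 / 3) == 3
    // Div<int, int>()(10, 0)     -> 0   (zero divisor short-circuits)
    // Div<uchar, float>()(10, 4) -> 2.5f; with b == 0 the result is 0.0f, not +inf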
+
+//////////////////////////////////////////////////////////////////////////
+// divScalar
+
+namespace arithm
+{
+    template <typename T, typename S, typename D>
+    void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
     {
-        enum { smart_block_dim_x = 8 };
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 8 };
-    };
+        MulScalar<T, S, D> op(static_cast<S>(1.0 / val));
+        transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+    }
 
-    void divide_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream)
+    template void divScalar<unsigned char, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<unsigned char, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<unsigned char, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<unsigned char, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<unsigned char, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<unsigned char, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<unsigned char, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    template void divScalar<signed char, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<signed char, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<signed char, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<signed char, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<signed char, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<signed char, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<signed char, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar<unsigned short, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<unsigned short, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<unsigned short, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<unsigned short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<unsigned short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<unsigned short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<unsigned short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar<short, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<short, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar<int, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar<float, float, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar<double, double, unsigned char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, signed char>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, unsigned short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
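divScalar defines no kernel of its own either: the host takes 1.0 / val once and reuses MulScalar, trading a per-element division for a multiply. val == 0 is assumed to be rejected by the host wrapper before this point. Sketch:

    // dst = src / 4, computed as src * 0.25f
    arithm::divScalar<float, float, float>(src, 4.0, dst, stream);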
saturate_cast(val / a) : 0; } }; +} - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f) +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< DivInv > : ArithmFuncTraits { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; }; +}}} - void divide_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) +namespace arithm +{ + template + void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), divide_16sc4_32f(), WithOutMask(), stream); + DivInv op(static_cast(val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - template struct Divide : binary_function + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); 
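
A note on the two division paths above. `divScalar` (matrix divided by a scalar) never issues a per-element hardware divide: the reciprocal `1.0 / val` is computed once on the host and the per-element work collapses to a single multiply via `MulScalar`. `divInv` (scalar divided by a matrix) cannot use that trick, so it keeps the divide and guards against zero elements, mirroring `a != 0 ? saturate_cast(val / a) : 0` above. A minimal stand-alone sketch of both functors, assuming illustrative names (`MulScalarSketch`, `DivInvSketch`, `apply`) that are not part of this patch:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

struct MulScalarSketch   // src / val lowered to src * (1 / val)
{
    float s;
    __device__ float operator ()(float x) const { return x * s; }
};

struct DivInvSketch      // val / src, returning 0 where src == 0
{
    float val;
    __device__ float operator ()(float x) const { return x != 0.f ? val / x : 0.f; }
};

template <typename Op>
__global__ void apply(const float* src, float* dst, int n, Op op)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = op(src[i]);
}

int main()
{
    const int n = 4;
    float h_src[n] = {0.f, 1.f, 2.f, 4.f}, h_dst[n];
    float *d_src, *d_dst;
    cudaMalloc(&d_src, n * sizeof(float));
    cudaMalloc(&d_dst, n * sizeof(float));
    cudaMemcpy(d_src, h_src, n * sizeof(float), cudaMemcpyHostToDevice);

    MulScalarSketch mul = {1.0f / 2.0f};          // src / 2 as src * 0.5
    apply<<<1, 64>>>(d_src, d_dst, n, mul);

    DivInvSketch inv = {8.0f};                    // 8 / src with a zero guard
    apply<<<1, 64>>>(d_src, d_dst, n, inv);

    cudaMemcpy(h_dst, d_dst, n * sizeof(float), cudaMemcpyDeviceToHost);
    printf("%g %g %g %g\n", h_dst[0], h_dst[1], h_dst[2], h_dst[3]); // 0 8 4 2
    cudaFree(d_src); cudaFree(d_dst);
    return 0;
}
```

The OpenCV versions additionally compute in a wider work type `S` and `saturate_cast` the result back to the destination type; the sketch keeps everything in `float` for brevity.
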
+ //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// absDiffMat + +namespace +{ + template struct VAbsDiff4; + template <> struct VAbsDiff4 : binary_function { - Divide(double scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a, T b) const + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - return b != 0 ? 
saturate_cast(a * scale / b) : 0; + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - const double scale; - }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {} }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + template <> struct VAbsDiff4 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {} }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + + //////////////////////////////////// + + template struct VAbsDiff2; + template <> struct VAbsDiff2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff2() {} + __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {} }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + template <> struct VAbsDiff2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff2() {} + __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {} }; - 
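
The `VAbsDiff4`/`VAbsDiff2` functors above exploit the PTX SIMD video instructions: on compute capability 3.0 and newer a single `vabsdiff4` produces four byte-wise absolute differences per 32-bit word (`vabsdiff2` does two halfwords), while on 2.x the same result is assembled lane by lane with scalar `vabsdiff` and byte/halfword selectors (`.b0`..`.b3`, `.h0`/`.h1`). A stand-alone sketch of the sm_30 form, with our own kernel and buffer names (compile with `-arch=sm_30` or newer):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Absolute difference of four packed unsigned bytes per 32-bit word.
__global__ void vabsdiff4_u8(const unsigned int* a, const unsigned int* b,
                             unsigned int* d, int nWords)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nWords)
    {
        unsigned int res = 0;
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;"
            : "=r"(res) : "r"(a[i]), "r"(b[i]), "r"(res));
        d[i] = res;
    }
}

int main()
{
    unsigned int ha = 0x01FF0005, hb = 0x03000002, hd = 0;
    unsigned int *da, *db, *dd;
    cudaMalloc(&da, sizeof(unsigned int));
    cudaMalloc(&db, sizeof(unsigned int));
    cudaMalloc(&dd, sizeof(unsigned int));
    cudaMemcpy(da, &ha, sizeof ha, cudaMemcpyHostToDevice);
    cudaMemcpy(db, &hb, sizeof hb, cudaMemcpyHostToDevice);
    vabsdiff4_u8<<<1, 1>>>(da, db, dd, 1);
    cudaMemcpy(&hd, dd, sizeof hd, cudaMemcpyDeviceToHost);
    printf("0x%08X\n", hd);   // 0x02FF0003: per-byte |a - b|
    cudaFree(da); cudaFree(db); cudaFree(dd);
    return 0;
}
```

Packing four lanes per register quarters the instruction count for byte data, which is why the `vabsDiff4`/`vabsDiff2` wrappers are instantiated only for the packed `uint`/`int` views; everything else goes through the scalar `AbsDiffMat` path.
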
template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + //////////////////////////////////// + + __device__ __forceinline__ int _abs(int a) + { + return ::abs(a); + } + __device__ __forceinline__ float _abs(float a) + { + return ::fabsf(a); + } + __device__ __forceinline__ double _abs(double a) { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Divide op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + return ::fabs(a); } - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - 
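
The unqualified `_abs` overloads introduced above exist so that a single templated functor can select the correct intrinsic per element type through ordinary overload resolution: `::abs` for `int`, `::fabsf` for `float`, `::fabs` for `double` (integer types narrower than `int` promote before the subtraction and are saturated back afterwards). A stand-alone sketch of the idiom with hypothetical names (`dev_abs`, `absDiffKernel`), not the patch's own code:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cuda_runtime.h>

__device__ __forceinline__ int    dev_abs(int a)    { return ::abs(a);   }
__device__ __forceinline__ float  dev_abs(float a)  { return ::fabsf(a); }
__device__ __forceinline__ double dev_abs(double a) { return ::fabs(a);  }

template <typename T>
__global__ void absDiffKernel(const T* a, const T* b, T* d, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d[i] = dev_abs(a[i] - b[i]);   // overload resolution picks the intrinsic for T
}

int main()
{
    float ha[2] = {1.5f, -2.f}, hb[2] = {3.f, 2.f}, hd[2];
    float *da, *db, *dd;
    cudaMalloc(&da, sizeof ha); cudaMalloc(&db, sizeof hb); cudaMalloc(&dd, sizeof hd);
    cudaMemcpy(da, ha, sizeof ha, cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, sizeof hb, cudaMemcpyHostToDevice);
    absDiffKernel<<<1, 32>>>(da, db, dd, 2);
    cudaMemcpy(hd, dd, sizeof hd, cudaMemcpyDeviceToHost);
    printf("%g %g\n", hd[0], hd[1]);   // 1.5 4
    cudaFree(da); cudaFree(db); cudaFree(dd);
    return 0;
}
```
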
//template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const 
PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct DivideScalar : unary_function - { - DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const + template struct AbsDiffMat : binary_function + { + __device__ __forceinline__ T operator ()(T a, T b) const { - return saturate_cast(scale * a / val); + return saturate_cast(_abs(a - b)); } - const double val; - const double scale; + + __device__ __forceinline__ AbsDiffMat() {} + __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {} }; +} - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< VAbsDiff4 > : ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + + //////////////////////////////////// + + template struct TransformFunctorTraits< VAbsDiff2 > : ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + + //////////////////////////////////// + + template struct TransformFunctorTraits< AbsDiffMat > : ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > +}}} + +namespace arithm +{ + template + void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAbsDiff4(), WithOutMask(), stream); + } - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) + template void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - DivideScalar op(val, scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAbsDiff2(), WithOutMask(), stream); } - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void 
divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t 
stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct Reciprocal : unary_function - { - Reciprocal(double scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const + template void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AbsDiffMat(), WithOutMask(), stream); + } + + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// absDiffScalar + +namespace +{ + template struct AbsDiffScalar : unary_function + { + S val; + + explicit AbsDiffScalar(S val_) : val(val_) {} + + __device__ __forceinline__ T operator ()(T a) const { - return a != 0 ? 
saturate_cast(scale / a) : 0; + abs_func f; + return saturate_cast(f(a - val)); } - const double scale; }; +} - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< AbsDiffScalar > : ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > +}}} + +namespace arithm +{ + template + void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + AbsDiffScalar op(static_cast(val)); + + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// absMat + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< abs_func > : ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream) +namespace arithm +{ + template + void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&scalar) ); - Reciprocal op(scalar); - cv::gpu::device::transform((PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src, (PtrStepSz) dst, abs_func(), WithOutMask(), stream); } - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - 
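
`AbsDiffScalar` stores the scalar in a separate work type `S` (a floating-point type in the instantiations) so the fractional part of `val` survives the subtraction, and `abs_func` from `functional.hpp` supplies the type-appropriate absolute value before the saturating cast back to `T`. This also removes the old `cudaSetDoubleForDevice` round-trip, since the host narrows the `double` once via `static_cast`. A minimal stand-alone sketch of the 8-bit case, using our own names and an inline stand-in for `saturate_cast<uchar>`:

```cpp
#include <cstdio>
#include <cmath>
#include <cuda_runtime.h>

// 8-bit source, float work type: the scalar 3.7 stays fractional during the
// subtraction instead of being truncated to 3.
struct AbsDiffScalarU8
{
    float val;
    __device__ unsigned char operator ()(unsigned char a) const
    {
        float d = fabsf((float)a - val);
        d = d > 255.f ? 255.f : d;            // clamp, standing in for saturate_cast
        return (unsigned char)(d + 0.5f);     // round to nearest, also a stand-in
    }
};

__global__ void apply(const unsigned char* src, unsigned char* dst, int n,
                      AbsDiffScalarU8 op)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dst[i] = op(src[i]);
}

int main()
{
    unsigned char hs[3] = {0, 4, 250}, hd[3];
    unsigned char *ds, *dd;
    cudaMalloc(&ds, 3); cudaMalloc(&dd, 3);
    cudaMemcpy(ds, hs, 3, cudaMemcpyHostToDevice);
    AbsDiffScalarU8 op = {3.7f};
    apply<<<1, 32>>>(ds, dd, 3, op);
    cudaMemcpy(hd, dd, 3, cudaMemcpyDeviceToHost);
    printf("%d %d %d\n", hd[0], hd[1], hd[2]);   // 4 0 246
    cudaFree(ds); cudaFree(dd);
    return 0;
}
```

Computing `|src - 3.7|` directly in `uchar` arithmetic would have truncated the scalar to 3 and changed every result; the wide work type is what keeps the scalar variants numerically consistent with the matrix-matrix path.
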
//template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, 
const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // absdiff - - template struct Absdiff : binary_function - { - static __device__ __forceinline__ int abs(int a) - { - return ::abs(a); - } - static __device__ __forceinline__ float abs(float a) - { - return ::fabsf(a); - } - static __device__ __forceinline__ double abs(double a) - { - return ::fabs(a); - } + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} - __device__ __forceinline__ T operator ()(T a, T b) const +////////////////////////////////////////////////////////////////////////// +// sqrMat + +namespace +{ + template struct Sqr : unary_function + { + __device__ __forceinline__ T operator ()(T x) const { - return saturate_cast(::abs(a - b)); + return saturate_cast(x * x); } + + __device__ __forceinline__ Sqr() {} + __device__ __forceinline__ Sqr(const Sqr& other) {} }; +} - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< Sqr > : ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > +}}} + +namespace arithm +{ + template + void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > + transform((PtrStepSz) src, (PtrStepSz) dst, Sqr(), WithOutMask(), stream); + } + + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat(PtrStepSzb src, 
PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// sqrtMat + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< sqrt_func > : ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > +}}} + +namespace arithm +{ + template + void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src, (PtrStepSz) dst, sqrt_func(), WithOutMask(), stream); + } + + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// logMat + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< log_func > : ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) +namespace arithm +{ + template + void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) { - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Absdiff(), WithOutMask(), stream); + transform((PtrStepSz) src, (PtrStepSz) dst, log_func(), WithOutMask(), stream); } - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// expMat - template struct AbsdiffScalar : unary_function +namespace +{ + template struct Exp : unary_function { - AbsdiffScalar(double val_) : val(val_) {} - __device__ __forceinline__ T operator ()(T a) const + __device__ __forceinline__ T operator ()(T x) const { - 
return saturate_cast(::fabs(a - val)); + exp_func f; + return saturate_cast(f(x)); } - double val; - }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ Exp() {} + __device__ __forceinline__ Exp(const Exp& other) {} }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< Exp > : ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) +namespace arithm +{ + template + void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AbsdiffScalar op(val); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src, (PtrStepSz) dst, Exp(), WithOutMask(), stream); } - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////////////////// - // Compare +////////////////////////////////////////////////////////////////////////////////////// +// cmpMat - template
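
Before the comparison kernels, one observation on the unary family above: `Sqr`, `sqrt_func`, `log_func`, and `Exp` all share the same shape, i.e. evaluate in a wide enough type and `saturate_cast` back, which is what keeps `sqr` and `exp` of small integer types from wrapping. A stand-alone sketch of that saturating behavior for 8-bit data, with our own `sat_u8` standing in for `saturate_cast<uchar>`:

```cpp
#include <cstdio>
#include <cmath>
#include <cuda_runtime.h>

__device__ unsigned char sat_u8(float v)   // stand-in for saturate_cast<uchar>
{
    v = v < 0.f ? 0.f : (v > 255.f ? 255.f : v);
    return (unsigned char)(v + 0.5f);
}

__global__ void sqrExp(const unsigned char* src, unsigned char* sq,
                       unsigned char* ex, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        int x = src[i];                    // widen before multiplying
        sq[i] = sat_u8((float)(x * x));    // Sqr: saturate_cast<T>(x * x)
        ex[i] = sat_u8(expf((float)x));    // Exp: saturate_cast<T>(exp(x))
    }
}

int main()
{
    unsigned char hs[3] = {3, 15, 200}, hq[3], he[3];
    unsigned char *ds, *dq, *de;
    cudaMalloc(&ds, 3); cudaMalloc(&dq, 3); cudaMalloc(&de, 3);
    cudaMemcpy(ds, hs, 3, cudaMemcpyHostToDevice);
    sqrExp<<<1, 32>>>(ds, dq, de, 3);
    cudaMemcpy(hq, dq, 3, cudaMemcpyDeviceToHost);
    cudaMemcpy(he, de, 3, cudaMemcpyDeviceToHost);
    printf("sqr: %d %d %d  exp: %d %d %d\n",
           hq[0], hq[1], hq[2], he[0], he[1], he[2]);
    // sqr: 9 225 255   exp: 20 255 255  (200*200 and exp(15) clamp at 255)
    cudaFree(ds); cudaFree(dq); cudaFree(de);
    return 0;
}
```
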