Added TransformFunctorTraits and optimized some functions that use transform

14 years ago · 5e9ae6b19f
parent 6ce2277cc7
commit 5e9ae6b19f
11 changed files with 581 additions and 302 deletions
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -47,37 +47,33 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "internal_shared.hpp"

-using namespace cv::gpu;
-using namespace cv::gpu::device;
-
-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
-
    //////////////////////////////////////////////////////////////////////////////////////
    // Compare

-    template <typename T1, typename T2> struct NotEqual : binary_function<T1, T2, uchar>
+    template <typename T> struct NotEqual : binary_function<T, T, uchar>
    {
-        __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2) const
+        __device__ __forceinline__ uchar operator()(T src1, T src2) const
        {
            return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
        }
    };

-    template <typename T1, typename T2>
+    template <typename T>
    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
-        NotEqual<T1, T2> op;
-        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, stream);
+        NotEqual<T> op;
+        transform(static_cast< DevMem2D_<T> >(src1), static_cast< DevMem2D_<T> >(src2), dst, op, stream);
    }

    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
-        compare_ne<uint, uint>(src1, src2, dst, stream);
+        compare_ne<uint>(src1, src2, dst, stream);
    }
    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
-        compare_ne<float, float>(src1, src2, dst, stream);
+        compare_ne<float>(src1, src2, dst, stream);
    }


@@ -354,6 +350,35 @@ namespace cv { namespace gpu { namespace mathfunc

    //////////////////////////////////////////////////////////////////////////
    // min/max
+
+    namespace detail
+    {
+        template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>
+        {
+        };
+        template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 4 };
+            enum { smart_shift = 4 };
+        };
+    }
+
+    template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > >
+    {
+    };
    
    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
@@ -413,7 +438,39 @@ namespace cv { namespace gpu { namespace mathfunc

    
    //////////////////////////////////////////////////////////////////////////
-    // threshold  
+    // threshold
+
+    namespace detail
+    {
+        template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>
+        {
+        };
+        template <typename F> struct ThresholdTraits<2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ThresholdTraits<4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 4 };
+            enum { smart_shift = 4 };
+        };
+    }
+
+    template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_inv_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_trunc_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> >
+    {
+    };

    template <template <typename> class Op, typename T>
    void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, 
@@ -454,8 +511,13 @@ namespace cv { namespace gpu { namespace mathfunc
    //////////////////////////////////////////////////////////////////////////
    // subtract

-    template <typename T>
-    void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
+    template <> struct TransformFunctorTraits< minus<short> > : DefaultTransformFunctorTraits< minus<short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    template <typename T> void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
    {
        transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, minus<T>(), stream);
    }
@@ -499,10 +561,35 @@ namespace cv { namespace gpu { namespace mathfunc

        __device__ __forceinline__ float operator()(const float& e) const
        {
-            return __powf(fabs(e), power);
+            return __powf(::fabs(e), power);
        }
    };

+    namespace detail
+    {
+        template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+        };
+        template <typename T> struct PowOpTraits<1, T> : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 8 };
+        };
+        template <typename T> struct PowOpTraits<2, T> : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename T> struct PowOpTraits<4, T> : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+            enum { smart_block_dim_y = 4 };
+            enum { smart_shift = 4 };
+        };
+    }
+
+    template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T>
+    {
+    };
+
    template<typename T>
    void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream)
    {
@@ -514,6 +601,5 @@ namespace cv { namespace gpu { namespace mathfunc
    template void pow_caller<short>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
    template void pow_caller<ushort>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
    template void pow_caller<int>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
-    template void pow_caller<uint>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
    template void pow_caller<float>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
 }}}
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -40,14 +40,9 @@
 //
 //M*/

-#include "opencv2/gpu/device/limits.hpp"
-#include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vec_math.hpp"
-#include "opencv2/gpu/device/transform.hpp"
 #include "internal_shared.hpp"

 using namespace cv::gpu;
-using namespace cv::gpu::device;

 #ifndef CV_PI
 #define CV_PI   3.1415926535897932384626433832795f
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -45,9 +45,7 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"

-using namespace cv::gpu::device;
-
-namespace cv { namespace gpu { namespace matrix_operations {
+namespace cv { namespace gpu { namespace device {

    template <typename T> struct shift_and_sizeof;
    template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
@@ -249,7 +247,55 @@ namespace cv { namespace gpu { namespace matrix_operations {

        const double alpha, beta;
    };
-    
+
+    namespace detail
+    {
+        template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
+        {
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 8 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 4 };
+        };
+
+        template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 2 };
+        };
+
+        template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 2 };
+        };
+
+        template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
+        {
+        };
+    }
+
+    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
+    {
+    };
+        
    template<typename T, typename D>
    void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream)
    {
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -71,23 +71,16 @@ cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }

 #include "opencv2/gpu/stream_accessor.hpp"

-namespace cv 
-{
-    namespace gpu
-    {
-        namespace matrix_operations
-        {            
-            void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
+namespace cv { namespace gpu { namespace device {            
+    void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);

-            template <typename T>
-            void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
-            template <typename T>
-            void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
+    template <typename T>
+    void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
+    template <typename T>
+    void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);

-            void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
-        }
-    }
-}
+    void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
+}}}

 struct Stream::Impl
 {
@@ -108,14 +101,14 @@ namespace
    void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, src.channels(), stream);
+        device::set_to_gpu(src, sf.val, src.channels(), stream);
    }

    template <typename T>
    void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), stream);
+        device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
    }
 }

@@ -262,7 +255,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
        psrc = &(temp = src);

    dst.create( src.size(), rtype );
-    matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
+    device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
 }

 cv::gpu::Stream::operator bool() const
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -67,7 +67,6 @@ void cv::gpu::min(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::max(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
 double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, Stream&) {throw_nogpu(); return 0.0;}
-
 void cv::gpu::pow(const GpuMat&, double, GpuMat&, Stream&)  { throw_nogpu(); }

 #else
@@ -180,7 +179,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
    nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R, StreamAccessor::getStream(stream));
 }

-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
    template <typename T>
    void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
@@ -192,7 +191,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
    {
        CV_Assert(src1.size() == src2.size());
        dst.create(src1.size(), src1.type());
-        mathfunc::subtractCaller<short>(src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+        device::subtractCaller<short>(src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
    }
    else
        nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, StreamAccessor::getStream(stream));
@@ -338,7 +337,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
 //////////////////////////////////////////////////////////////////////////////
 // Comparison of two matrixes

-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
@@ -375,7 +374,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
        }
        else
        {
-            mathfunc::compare_ne_8uc4(src1, src2, dst, stream);
+            device::compare_ne_8uc4(src1, src2, dst, stream);
        }
    }
    else
@@ -393,7 +392,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
        }
        else
        {
-            mathfunc::compare_ne_32f(src1, src2, dst, stream);
+            device::compare_ne_32f(src1, src2, dst, stream);
        }
    }
 }
@@ -402,7 +401,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
 //////////////////////////////////////////////////////////////////////////////
 // Unary bitwise logical operations

-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
    void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream);

@@ -416,7 +415,7 @@ namespace
    {
        dst.create(src.size(), src.type());

-        cv::gpu::mathfunc::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), 
+        cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), 
                                              dst.channels(), src, dst, stream);
    }

@@ -426,10 +425,10 @@ namespace
        using namespace cv::gpu;

        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-        static Caller callers[] = {mathfunc::bitwiseMaskNotCaller<unsigned char>, mathfunc::bitwiseMaskNotCaller<unsigned char>, 
-                                   mathfunc::bitwiseMaskNotCaller<unsigned short>, mathfunc::bitwiseMaskNotCaller<unsigned short>,
-                                   mathfunc::bitwiseMaskNotCaller<unsigned int>, mathfunc::bitwiseMaskNotCaller<unsigned int>,
-                                   mathfunc::bitwiseMaskNotCaller<unsigned int>};
+        static Caller callers[] = {device::bitwiseMaskNotCaller<unsigned char>, device::bitwiseMaskNotCaller<unsigned char>, 
+                                   device::bitwiseMaskNotCaller<unsigned short>, device::bitwiseMaskNotCaller<unsigned short>,
+                                   device::bitwiseMaskNotCaller<unsigned int>, device::bitwiseMaskNotCaller<unsigned int>,
+                                   device::bitwiseMaskNotCaller<unsigned int>};

        CV_Assert(mask.type() == CV_8U && mask.size() == src.size());
        dst.create(src.size(), src.type());
@@ -456,7 +455,7 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St
 //////////////////////////////////////////////////////////////////////////////
 // Binary bitwise logical operations

-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
    void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);

@@ -482,7 +481,7 @@ namespace
        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());

-        cv::gpu::mathfunc::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), 
+        cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), 
                                             dst.channels(), src1, src2, dst, stream);
    }

@@ -492,10 +491,10 @@ namespace
        using namespace cv::gpu;

        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-        static Caller callers[] = {mathfunc::bitwiseMaskOrCaller<unsigned char>, mathfunc::bitwiseMaskOrCaller<unsigned char>, 
-                                   mathfunc::bitwiseMaskOrCaller<unsigned short>, mathfunc::bitwiseMaskOrCaller<unsigned short>,
-                                   mathfunc::bitwiseMaskOrCaller<unsigned int>, mathfunc::bitwiseMaskOrCaller<unsigned int>,
-                                   mathfunc::bitwiseMaskOrCaller<unsigned int>};
+        static Caller callers[] = {device::bitwiseMaskOrCaller<unsigned char>, device::bitwiseMaskOrCaller<unsigned char>, 
+                                   device::bitwiseMaskOrCaller<unsigned short>, device::bitwiseMaskOrCaller<unsigned short>,
+                                   device::bitwiseMaskOrCaller<unsigned int>, device::bitwiseMaskOrCaller<unsigned int>,
+                                   device::bitwiseMaskOrCaller<unsigned int>};

        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());
@@ -513,7 +512,7 @@ namespace
        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());

-        cv::gpu::mathfunc::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), 
+        cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), 
                                              dst.channels(), src1, src2, dst, stream);
    }

@@ -523,10 +522,10 @@ namespace
        using namespace cv::gpu;

        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-        static Caller callers[] = {mathfunc::bitwiseMaskAndCaller<unsigned char>, mathfunc::bitwiseMaskAndCaller<unsigned char>, 
-                                   mathfunc::bitwiseMaskAndCaller<unsigned short>, mathfunc::bitwiseMaskAndCaller<unsigned short>,
-                                   mathfunc::bitwiseMaskAndCaller<unsigned int>, mathfunc::bitwiseMaskAndCaller<unsigned int>,
-                                   mathfunc::bitwiseMaskAndCaller<unsigned int>};
+        static Caller callers[] = {device::bitwiseMaskAndCaller<unsigned char>, device::bitwiseMaskAndCaller<unsigned char>, 
+                                   device::bitwiseMaskAndCaller<unsigned short>, device::bitwiseMaskAndCaller<unsigned short>,
+                                   device::bitwiseMaskAndCaller<unsigned int>, device::bitwiseMaskAndCaller<unsigned int>,
+                                   device::bitwiseMaskAndCaller<unsigned int>};

        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());
@@ -544,7 +543,7 @@ namespace
        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());

-        cv::gpu::mathfunc::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), 
+        cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), 
                                              dst.channels(), src1, src2, dst, stream);
    }

@@ -554,10 +553,10 @@ namespace
        using namespace cv::gpu;

        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-        static Caller callers[] = {mathfunc::bitwiseMaskXorCaller<unsigned char>, mathfunc::bitwiseMaskXorCaller<unsigned char>, 
-                                   mathfunc::bitwiseMaskXorCaller<unsigned short>, mathfunc::bitwiseMaskXorCaller<unsigned short>,
-                                   mathfunc::bitwiseMaskXorCaller<unsigned int>, mathfunc::bitwiseMaskXorCaller<unsigned int>,
-                                   mathfunc::bitwiseMaskXorCaller<unsigned int>};
+        static Caller callers[] = {device::bitwiseMaskXorCaller<unsigned char>, device::bitwiseMaskXorCaller<unsigned char>, 
+                                   device::bitwiseMaskXorCaller<unsigned short>, device::bitwiseMaskXorCaller<unsigned short>,
+                                   device::bitwiseMaskXorCaller<unsigned int>, device::bitwiseMaskXorCaller<unsigned int>,
+                                   device::bitwiseMaskXorCaller<unsigned int>};

        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());
@@ -601,7 +600,7 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
 //////////////////////////////////////////////////////////////////////////////
 // Minimum and maximum operations

-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
@@ -623,14 +622,14 @@ namespace
    {
        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());
-        mathfunc::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
+        device::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
    }

    template <typename T>
    void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
    {
        dst.create(src1.size(), src1.type());
-        mathfunc::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
+        device::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
    }
    
    template <typename T>
@@ -638,14 +637,14 @@ namespace
    {
        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());
-        mathfunc::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
+        device::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
    }

    template <typename T>
    void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
    {
        dst.create(src1.size(), src1.type());
-        mathfunc::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
+        device::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
    }
 }

@@ -709,7 +708,7 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
 ////////////////////////////////////////////////////////////////////////
 // threshold

-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
    template <typename T>
    void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, T thresh, T maxVal, int type,
@@ -718,24 +717,25 @@ namespace cv { namespace gpu { namespace mathfunc

 namespace
 {
-    template <typename T>
-    void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, 
-        cudaStream_t stream)
+    template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream)
    {
-        mathfunc::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
+        device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
    }
 }

 double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& s)
 {
+    CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
+    CV_Assert(type <= THRESH_TOZERO_INV);
+
+    dst.create(src.size(), src.type());
+
    cudaStream_t stream = StreamAccessor::getStream(s);

    if (src.type() == CV_32FC1 && type == THRESH_TRUNC)
    {
        NppStreamHandler h(stream);

-        dst.create(src.size(), src.type());
-
        NppiSize sz;
        sz.width  = src.cols;
        sz.height = src.rows;
@@ -761,12 +761,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
            threshold_caller<int>, threshold_caller<float>, threshold_caller<double>
        };

-        CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
-        CV_Assert(type <= THRESH_TOZERO_INV);
-
-        dst.create(src.size(), src.type());
-
-        if (src.depth() != CV_32F)
+        if (src.depth() != CV_32F && src.depth() != CV_64F)
        {
            thresh = cvFloor(thresh);
            maxVal = cvRound(maxVal);
@@ -781,17 +776,11 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
 ////////////////////////////////////////////////////////////////////////
 // pow

-namespace cv
+namespace cv { namespace gpu { namespace device
 {
-    namespace gpu
-    {
-        namespace mathfunc
-        {
-            template<typename T>
-            void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
-        }
-    }
-}
+    template<typename T>
+    void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
+}}}

 void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
 {    
@@ -802,9 +791,9 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)

    static const caller_t callers[] = 
    {
-        mathfunc::pow_caller<unsigned char>,  mathfunc::pow_caller<signed char>, 
-        mathfunc::pow_caller<unsigned short>, mathfunc::pow_caller<short>, 
-        mathfunc::pow_caller<int>, mathfunc::pow_caller<float>
+        device::pow_caller<unsigned char>,  device::pow_caller<signed char>, 
+        device::pow_caller<unsigned short>, device::pow_caller<short>, 
+        device::pow_caller<int>, device::pow_caller<float>
    };

    callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream));    
--- a/modules/gpu/src/gpumat.cpp
+++ b/modules/gpu/src/gpumat.cpp
@@ -393,7 +393,7 @@ void cv::gpu::ensureSizeIsEnough(int, int, int, GpuMat&) { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */

-namespace cv { namespace gpu { namespace matrix_operations
+namespace cv { namespace gpu { namespace device
 {
    void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);

@@ -449,7 +449,7 @@ void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const
    else
    {
        mat.create(size(), type());
-        cv::gpu::matrix_operations::copy_to_with_mask(*this, mat, depth(), mask, channels());
+        device::copy_to_with_mask(*this, mat, depth(), mask, channels());
    }
 }

@@ -508,7 +508,7 @@ namespace

    void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
    {
-        matrix_operations::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
+        device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
    }
 }

@@ -540,7 +540,7 @@ void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double be
    dst.create( size(), rtype );

    if (!noScale)
-        matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta);
+        device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta);
    else
    {
        typedef void (*convert_caller_t)(const GpuMat& src, GpuMat& dst);
@@ -681,7 +681,7 @@ namespace
    void kernelSet(GpuMat& src, const Scalar& s)
    {
        Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, src.channels(), 0);
+        device::set_to_gpu(src, sf.val, src.channels(), 0);
    }

    template<int SDEPTH, int SCN> struct NppSetMaskFunc
@@ -732,7 +732,7 @@ namespace
    void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
    {
        Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), 0);
+        device::set_to_gpu(src, sf.val, mask, src.channels(), 0);
    }
 }

--- a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
@@ -45,6 +45,7 @@

 #include "internal_shared.hpp"
 #include "../vec_traits.hpp"
+#include "../functional.hpp"

 namespace cv { namespace gpu { namespace device
 {
@ -68,51 +69,17 @@ namespace cv { namespace gpu { namespace device

        //! Read Write Traits

-        template <size_t src_elem_size, size_t dst_elem_size>
-        struct UnReadWriteTraits_
+        template <typename T, typename D, int shift> struct UnaryReadWriteTraits // Names the TypeVec vector types that hold `shift` elements of T (read) and of D (write); shift is a template argument here rather than being derived from sizeof as in the removed UnReadWriteTraits.
        {
-            enum { shift = 1 };
-        };
-        template <size_t src_elem_size>
-        struct UnReadWriteTraits_<src_elem_size, 1>
-        {
-            enum { shift = 4 };
-        };
-        template <size_t src_elem_size>
-        struct UnReadWriteTraits_<src_elem_size, 2>
-        {
-            enum { shift = 2 };
-        };
-        template <typename T, typename D> struct UnReadWriteTraits
-        {
-            enum { shift = UnReadWriteTraits_<sizeof(T), sizeof(D)>::shift };
-            
            typedef typename TypeVec<T, shift>::vec_type read_type; // Vector of `shift` source elements.
            typedef typename TypeVec<D, shift>::vec_type write_type; // Vector of `shift` destination elements.
        };

-        template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size>
-        struct BinReadWriteTraits_
+        template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits // Two-source counterpart of UnaryReadWriteTraits: names the TypeVec types holding `shift` elements of T1, T2 and D; shift is supplied by the caller.
        {
-            enum { shift = 1 };
-        };
-        template <size_t src_elem_size1, size_t src_elem_size2>
-        struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 1>
-        {
-            enum { shift = 4 };
-        };
-        template <size_t src_elem_size1, size_t src_elem_size2>
-        struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 2>
-        {
-            enum { shift = 2 };
-        };
-        template <typename T1, typename T2, typename D> struct BinReadWriteTraits
-        {
-            enum {shift = BinReadWriteTraits_<sizeof(T1), sizeof(T2), sizeof(D)>::shift};
-
            typedef typename TypeVec<T1, shift>::vec_type read_type1; // Vector of `shift` elements of the first source type.
            typedef typename TypeVec<T2, shift>::vec_type read_type2; // Vector of `shift` elements of the second source type.
-            typedef typename TypeVec<D , shift>::vec_type write_type;
+            typedef typename TypeVec<D, shift>::vec_type write_type; // Vector of `shift` destination elements.
        };

        //! Transform kernels
@ -206,29 +173,73 @@ namespace cv { namespace gpu { namespace device
                    dst.w = op(src1.w, src2.w);
            }
        };
+        template <> struct OpUnroller<8> // Eight-lane specialization: applies op member-wise to a0..a7, testing the mask once per lane.
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y) // Unary form: dst.aK = op(src.aK) for each K in 0..7 where mask(y, x_shifted + K) is true.
+            {
+                if (mask(y, x_shifted)) // Lane 0 is tested at column x_shifted; lane K at x_shifted + K.
+                    dst.a0 = op(src.a0);
+                if (mask(y, x_shifted + 1))
+                    dst.a1 = op(src.a1);
+                if (mask(y, x_shifted + 2))
+                    dst.a2 = op(src.a2);
+                if (mask(y, x_shifted + 3))
+                    dst.a3 = op(src.a3);
+                if (mask(y, x_shifted + 4))
+                    dst.a4 = op(src.a4);
+                if (mask(y, x_shifted + 5))
+                    dst.a5 = op(src.a5);
+                if (mask(y, x_shifted + 6))
+                    dst.a6 = op(src.a6);
+                if (mask(y, x_shifted + 7))
+                    dst.a7 = op(src.a7); // NOTE(review): a lane whose mask test fails is never assigned, so dst keeps whatever the caller put there for that lane - confirm callers pre-load dst.
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y) // Binary form: dst.aK = op(src1.aK, src2.aK) for each K in 0..7 where mask(y, x_shifted + K) is true.
+            {
+                if (mask(y, x_shifted)) // Same lane-to-column mapping as the unary overload.
+                    dst.a0 = op(src1.a0, src2.a0);
+                if (mask(y, x_shifted + 1))
+                    dst.a1 = op(src1.a1, src2.a1);
+                if (mask(y, x_shifted + 2))
+                    dst.a2 = op(src1.a2, src2.a2);
+                if (mask(y, x_shifted + 3))
+                    dst.a3 = op(src1.a3, src2.a3);
+                if (mask(y, x_shifted + 4))
+                    dst.a4 = op(src1.a4, src2.a4);
+                if (mask(y, x_shifted + 5))
+                    dst.a5 = op(src1.a5, src2.a5);
+                if (mask(y, x_shifted + 6))
+                    dst.a6 = op(src1.a6, src2.a6);
+                if (mask(y, x_shifted + 7))
+                    dst.a7 = op(src1.a7, src2.a7); // NOTE(review): as in the unary overload, masked-off lanes of dst are left unassigned.
+            }
+        };

        template <typename T, typename D, typename UnOp, typename Mask>
        __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, const UnOp op)
        {
-            typedef typename UnReadWriteTraits<T, D>::read_type read_type;
-            typedef typename UnReadWriteTraits<T, D>::write_type write_type;
-            const int shift = UnReadWriteTraits<T, D>::shift;
+            typedef TransformFunctorTraits<UnOp> ft;
+            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
+            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
-            const int x_shifted = x * shift;
+            const int x_shifted = x * ft::smart_shift;

            if (y < src_.rows)
            {
                const T* src = src_.ptr(y);
                D* dst = dst_.ptr(y);

-                if (x_shifted + shift - 1 < src_.cols)
+                if (x_shifted + ft::smart_shift - 1 < src_.cols)
                {
                    const read_type src_n_el = ((const read_type*)src)[x];
                    write_type dst_n_el;

-                    OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
+                    OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
@ -259,14 +270,14 @@ namespace cv { namespace gpu { namespace device
        __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_, 
            const Mask mask, const BinOp op)
        {
-            typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
-            typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
-            typedef typename BinReadWriteTraits<T1, T2, D>::write_type write_type;
-            const int shift = BinReadWriteTraits<T1, T2, D>::shift;
+            typedef TransformFunctorTraits<BinOp> ft;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
-            const int x_shifted = x * shift;
+            const int x_shifted = x * ft::smart_shift;

            if (y < src1_.rows)
            {
@ -274,13 +285,13 @@ namespace cv { namespace gpu { namespace device
                const T2* src2 = src2_.ptr(y);
                D* dst = dst_.ptr(y);

-                if (x_shifted + shift - 1 < src1_.cols)
+                if (x_shifted + ft::smart_shift - 1 < src1_.cols)
                {
                    const read_type1 src1_n_el = ((const read_type1*)src1)[x];
                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];
                    write_type dst_n_el;
                    
-                    OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
+                    OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
@ -308,7 +319,7 @@ namespace cv { namespace gpu { namespace device
                const T2 src2_data = src2.ptr(y)[x];
                dst.ptr(y)[x] = op(src1_data, src2_data);
            }
-        }        
+        }

        template <bool UseSmart> struct TransformDispatcher;
        template<> struct TransformDispatcher<false>
@ -316,11 +327,10 @@ namespace cv { namespace gpu { namespace device
            template <typename T, typename D, typename UnOp, typename Mask>
            static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
            {
-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);
+                typedef TransformFunctorTraits<UnOp> ft;

-                grid.x = divUp(src.cols, threads.x);
-                grid.y = divUp(src.rows, threads.y);        
+                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
+                const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);     

                transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );
@ -332,11 +342,10 @@ namespace cv { namespace gpu { namespace device
            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
            {
-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);
+                typedef TransformFunctorTraits<BinOp> ft;

-                grid.x = divUp(src1.cols, threads.x);
-                grid.y = divUp(src1.rows, threads.y);        
+                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
+                const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);     

                transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );
@ -350,13 +359,12 @@ namespace cv { namespace gpu { namespace device
            template <typename T, typename D, typename UnOp, typename Mask>
            static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
            {
-                const int shift = UnReadWriteTraits<T, D>::shift;
+                typedef TransformFunctorTraits<UnOp> ft;

-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);            
+                StaticAssert<ft::smart_shift != 1>::check();

-                grid.x = divUp(src.cols, threads.x * shift);
-                grid.y = divUp(src.rows, threads.y);        
+                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
+                const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);      

                transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );
@ -368,13 +376,12 @@ namespace cv { namespace gpu { namespace device
            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
            {
-                const int shift = BinReadWriteTraits<T1, T2, D>::shift;
+                typedef TransformFunctorTraits<BinOp> ft;

-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);
+                StaticAssert<ft::smart_shift != 1>::check();

-                grid.x = divUp(src1.cols, threads.x * shift);
-                grid.y = divUp(src1.rows, threads.y);        
+                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
+                const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);    

                transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );
@ -382,44 +389,20 @@ namespace cv { namespace gpu { namespace device
                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );            
            }
-        };
-
-        template <typename T, typename D, int scn, int dcn> struct UseSmartUn_
-        {
-            static const bool value = false;
-        };
-        template <typename T, typename D> struct UseSmartUn_<T, D, 1, 1>
-        {
-            static const bool value = UnReadWriteTraits<T, D>::shift != 1;
-        };
-        template <typename T, typename D> struct UseSmartUn
-        {
-            static const bool value = UseSmartUn_<T, D, VecTraits<T>::cn, VecTraits<D>::cn>::value;
-        };
-
-        template <typename T1, typename T2, typename D, int src1cn, int src2cn, int dstcn> struct UseSmartBin_
-        {
-            static const bool value = false;
-        };
-        template <typename T1, typename T2, typename D> struct UseSmartBin_<T1, T2, D, 1, 1, 1>
-        {
-            static const bool value = BinReadWriteTraits<T1, T2, D>::shift != 1;
-        };
-        template <typename T1, typename T2, typename D> struct UseSmartBin
-        {
-            static const bool value = UseSmartBin_<T1, T2, D, VecTraits<T1>::cn, VecTraits<T2>::cn, VecTraits<D>::cn>::value;
-        };
+        };        

        template <typename T, typename D, typename UnOp, typename Mask> // Unary entry point: forwards to the TransformDispatcher specialization selected at compile time.
        static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
        {
-            TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
+            typedef TransformFunctorTraits<UnOp> ft; // Per-functor traits; only smart_shift is read here.
+            TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream); // Flag is true only when both T and D have VecTraits cn == 1 and the functor's smart_shift is not 1.
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask> // Binary entry point: forwards to the TransformDispatcher specialization selected at compile time.
        static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
        {
-            TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
+            typedef TransformFunctorTraits<BinOp> ft; // Per-functor traits; only smart_shift is read here.
+            TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream); // Flag is true only when T1, T2 and D all have VecTraits cn == 1 and the functor's smart_shift is not 1.
        }
    }
 }}}
--- a/modules/gpu/src/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp
@ -46,18 +46,25 @@
 #include <thrust/functional.h>
 #include "internal_shared.hpp"
 #include "saturate_cast.hpp"
+#include "vec_traits.hpp"

 namespace cv { namespace gpu { namespace device
 {
+    // Function Objects
+
    using thrust::unary_function;
    using thrust::binary_function;

+    // Arithmetic Operations
+
    using thrust::plus;
    using thrust::minus;
    using thrust::multiplies;
    using thrust::divides;
    using thrust::modulus;
    using thrust::negate;
+
+    // Comparison Operations
    
    using thrust::equal_to;
    using thrust::not_equal_to;
@ -65,11 +72,15 @@ namespace cv { namespace gpu { namespace device
    using thrust::less;
    using thrust::greater_equal;
    using thrust::less_equal;
+
+    // Logical Operations
    
    using thrust::logical_and;
    using thrust::logical_or;
    using thrust::logical_not;

+    // Bitwise Operations
+
    using thrust::bit_and;
    using thrust::bit_or;
    using thrust::bit_xor;
@ -78,7 +89,13 @@ namespace cv { namespace gpu { namespace device
        __forceinline__ __device__ T operator ()(const T& v) const {return ~v;}
    };

-    using thrust::identity;
+    // Generalized Identity Operations
+
+    using thrust::identity;    
+    using thrust::project1st;
+    using thrust::project2nd;
+
+    // Min/Max Operations

 #define OPENCV_GPU_IMPLEMENT_MINMAX(name, type, op) \
    template <> struct name<type> : binary_function<type, type, type> \
@ -115,15 +132,8 @@ namespace cv { namespace gpu { namespace device
    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, fmin)

 #undef OPENCV_GPU_IMPLEMENT_MINMAX
-    
-    using thrust::project1st;
-    using thrust::project2nd;
-
-    using thrust::unary_negate;
-    using thrust::not1;

-    using thrust::binary_negate;
-    using thrust::not2;
+    // Math functions

 #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(func) \
    template <typename T> struct func ## _func : unary_function<T, float> \
@ -192,6 +202,8 @@ namespace cv { namespace gpu { namespace device
        }
    };

+    // Saturate Cast Functor
+
    template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
    {
        __forceinline__ __device__ D operator ()(const T& v) const
@ -200,6 +212,8 @@ namespace cv { namespace gpu { namespace device
        }
    };

+    // Threshold Functors
+
    template <typename T> struct thresh_binary_func : unary_function<T, T>
    {
        __forceinline__ __host__ __device__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
@ -256,7 +270,15 @@ namespace cv { namespace gpu { namespace device
        }

        const T thresh;
-    };
+    };    
+
+    // Function Object Adaptors
+
+    using thrust::unary_negate;
+    using thrust::not1;
+
+    using thrust::binary_negate;
+    using thrust::not2;

    template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> 
    {
@ -291,46 +313,77 @@ namespace cv { namespace gpu { namespace device
        return binder2nd<Op>(op, typename Op::second_argument_type(x));
    }

-    template <typename T1, typename T2> struct BinOpTraits
-    {
-        typedef int argument_type;
-    };
-    template <typename T> struct BinOpTraits<T, T>
-    {
-        typedef T argument_type;
-    };
-    template <typename T> struct BinOpTraits<T, double>
-    {
-        typedef double argument_type;
-    };
-    template <typename T> struct BinOpTraits<double, T>
-    {
-        typedef double argument_type;
-    };
-    template <> struct BinOpTraits<double, double>
-    {
-        typedef double argument_type;
-    };
-    template <typename T> struct BinOpTraits<T, float>
+    // Functor Traits
+
+    template <typename F> struct IsUnaryFunction // Compile-time test: value is 1 when F* converts to unary_function<T, D>* for some T and D, else 0.
    {
-        typedef float argument_type;
+        struct Yes {}; // Return tag of the matching overload.
+        struct No {Yes a[2];}; // Holds two Yes objects so that sizeof(No) differs from sizeof(Yes).
+
+        template <typename T, typename D> static Yes check(unary_function<T, D>*); // Declared only; selected when T and D can be deduced from F*.
+        static No check(...); // Fallback chosen for every other pointer type.
+
+        enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) }; // check is never called; only the size of its return type is inspected.
    };
-    template <typename T> struct BinOpTraits<float, T>
+
+    template <typename F> struct IsBinaryFunction // Compile-time test: value is 1 when F* converts to binary_function<T1, T2, D>* for some T1, T2 and D, else 0.
    {
-        typedef float argument_type;
+        struct Yes {}; // Return tag of the matching overload.
+        struct No {Yes a[2];}; // Holds two Yes objects so that sizeof(No) differs from sizeof(Yes).
+
+        template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>*); // Declared only; selected when T1, T2 and D can be deduced from F*.
+        static No check(...); // Fallback chosen for every other pointer type.
+
+        enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) }; // check is never called; only the size of its return type is inspected.
    };
-    template <> struct BinOpTraits<float, float>
+
+    namespace detail // Helpers that compute the default `shift` constant for a functor from its argument and result typedefs.
    {
-        typedef float argument_type;
-    };
-    template <> struct BinOpTraits<double, float>
+        template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; }; // Primary template: shift 1 unless a specialization below matches.
+        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; }; // Destination element size 1 -> shift 4; the source size is not constrained.
+        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; }; // Destination element size 2 -> shift 2; the source size is not constrained.
+
+        template <typename T, typename D> struct DefaultUnaryShift // Applies UnOpShift to sizeof(T) and sizeof(D).
+        {
+            enum { shift = detail::UnOpShift<sizeof(T), sizeof(D)>::shift };
+        };
+        
+        template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; }; // Primary template: shift 1 unless a specialization below matches.
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; }; // Destination element size 1 -> shift 4; source sizes are not constrained.
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; }; // Destination element size 2 -> shift 2; source sizes are not constrained.
+
+        template <typename T1, typename T2, typename D> struct DefaultBinaryShift // Applies BinOpShift to sizeof(T1), sizeof(T2) and sizeof(D).
+        {
+            enum { shift = detail::BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
+        };
+
+        template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher; // Declared only; the bool defaults to IsUnaryFunction<Func>::value and picks one of the two specializations.
+        template <typename Func> struct ShiftDispatcher<Func, true> // Unary functor: uses Func::argument_type and Func::result_type.
+        {
+            enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };
+        };
+        template <typename Func> struct ShiftDispatcher<Func, false> // Anything not detected as unary is treated as binary: requires first_argument_type, second_argument_type and result_type.
+        {
+            enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
+        };
+    }
+
+    template <typename Func> struct DefaultTransformShift // Exposes detail::ShiftDispatcher<Func>::shift as the default shift for functor Func.
    {
-        typedef double argument_type;
+        enum { shift = detail::ShiftDispatcher<Func>::shift }; // The unary/binary choice is made inside ShiftDispatcher via its defaulted bool parameter.
    };
-    template <> struct BinOpTraits<float, double>
+
+    template <typename Func> struct DefaultTransformFunctorTraits // Default compile-time launch parameters for a transform functor: 16x16 block dimensions for both variants plus the default shift.
    {
-        typedef double argument_type;
+        enum { simple_block_dim_x = 16 }; // Block width read by the transformSimple launch code.
+        enum { simple_block_dim_y = 16 }; // Block height read by the transformSimple launch code.
+
+        enum { smart_block_dim_x = 16 }; // Block width read by the transformSmart launch code.
+        enum { smart_block_dim_y = 16 }; // Block height read by the transformSmart launch code.
+        enum { smart_shift = DefaultTransformShift<Func>::shift }; // Elements handled per thread in transformSmart; transform_caller takes the simple path when this is 1.
    };
+
+    template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {}; // Traits type queried by the transform code; inherits every default unchanged. NOTE(review): presumably meant to be specialized per functor to override the defaults - confirm against the .cu files.
 }}}

 #endif // __OPENCV_GPU_FUNCTIONAL_HPP__
--- a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
@ -150,6 +150,50 @@ namespace cv {  namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
    }

+    namespace detail // Holds BinOpTraits, which the vector-with-scalar operator overloads use to choose the functor's instantiation type.
+    {    
+        template <typename T1, typename T2> struct BinOpTraits // Primary template: two different types, neither float nor double -> int.
+        {
+            typedef int argument_type;
+        };
+        template <typename T> struct BinOpTraits<T, T> // Same type on both sides -> that type.
+        {
+            typedef T argument_type;
+        };
+        template <typename T> struct BinOpTraits<T, double> // double on the right -> double.
+        {
+            typedef double argument_type;
+        };
+        template <typename T> struct BinOpTraits<double, T> // double on the left -> double.
+        {
+            typedef double argument_type;
+        };
+        template <> struct BinOpTraits<double, double> // Full specialization: this pair matches several partial specializations above, none more specialized than the rest.
+        {
+            typedef double argument_type;
+        };
+        template <typename T> struct BinOpTraits<T, float> // float on the right -> float.
+        {
+            typedef float argument_type;
+        };
+        template <typename T> struct BinOpTraits<float, T> // float on the left -> float.
+        {
+            typedef float argument_type;
+        };
+        template <> struct BinOpTraits<float, float> // Full specialization for the same reason as <double, double>.
+        {
+            typedef float argument_type;
+        };
+        template <> struct BinOpTraits<double, float> // Mixed pair: double wins over float.
+        {
+            typedef double argument_type;
+        };
+        template <> struct BinOpTraits<float, double> // Mixed pair: double wins over float.
+        {
+            typedef double argument_type;
+        };
+    }
+
 #define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
    static __device__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
    { \
@ -157,16 +201,16 @@ namespace cv {  namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
    } \
    static __device__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
    { \
@ -174,16 +218,16 @@ namespace cv {  namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
    } \
    static __device__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
    { \
@ -191,16 +235,16 @@ namespace cv {  namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
    } \
    static __device__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
    { \
@ -208,16 +252,16 @@ namespace cv {  namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
    { \
-        func<typename BinOpTraits<T, type>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
+        func<typename detail::BinOpTraits<T, type>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
    }

 #define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
--- a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
@ -49,6 +49,79 @@ namespace cv { namespace gpu { namespace device
 {
    template<typename T, int N> struct TypeVec;

+    struct __align__(8) uchar8 // 8-component uchar vector; __align__(8) requests 8-byte alignment
+    {
+        uchar a0, a1, a2, a3, a4, a5, a6, a7; // the eight components, in order
+    };
+    static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7) // builds a uchar8 from its components; usable from host and device code
+    {
+        uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7}; // aggregate initialization: a0..a7 fill the members in declaration order
+        return val;
+    }
+    struct __align__(8) char8 // 8-component vector named char8 whose members are declared as schar; __align__(8) requests 8-byte alignment
+    {
+        schar a0, a1, a2, a3, a4, a5, a6, a7; // the eight components, in order
+    };
+    static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) // builds a char8 from its components; usable from host and device code
+    {
+        char8 val = {a0, a1, a2, a3, a4, a5, a6, a7}; // aggregate initialization: a0..a7 fill the members in declaration order
+        return val;
+    }
+    struct __align__(16) ushort8 // 8-component ushort vector; __align__(16) requests 16-byte alignment
+    {
+        ushort a0, a1, a2, a3, a4, a5, a6, a7; // the eight components, in order
+    };
+    static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7) // builds a ushort8 from its components; usable from host and device code
+    {
+        ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7}; // aggregate initialization: a0..a7 fill the members in declaration order
+        return val;
+    }
+    struct __align__(16) short8 // 8-component short vector; __align__(16) requests 16-byte alignment
+    {
+        short a0, a1, a2, a3, a4, a5, a6, a7; // the eight components, in order
+    };
+    static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7) // builds a short8 from its components; usable from host and device code
+    {
+        short8 val = {a0, a1, a2, a3, a4, a5, a6, a7}; // aggregate initialization: a0..a7 fill the members in declaration order
+        return val;
+    }
+    struct __align__(32) uint8 // 8-component uint vector (a vector type, not an 8-bit integer); __align__(32) requests 32-byte alignment
+    {
+        uint a0, a1, a2, a3, a4, a5, a6, a7; // the eight components, in order
+    };
+    static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7) // builds a uint8 from its components; usable from host and device code
+    {
+        uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7}; // aggregate initialization: a0..a7 fill the members in declaration order
+        return val;
+    }
+    struct __align__(32) int8 // 8-component int vector (a vector type, not an 8-bit integer); __align__(32) requests 32-byte alignment
+    {
+        int a0, a1, a2, a3, a4, a5, a6, a7; // the eight components, in order
+    };
+    static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7) // builds an int8 from its components; usable from host and device code
+    {
+        int8 val = {a0, a1, a2, a3, a4, a5, a6, a7}; // aggregate initialization: a0..a7 fill the members in declaration order
+        return val;
+    }
+    struct __align__(32) float8 // 8-component float vector; __align__(32) requests 32-byte alignment
+    {
+        float a0, a1, a2, a3, a4, a5, a6, a7; // the eight components, in order
+    };
+    static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7) // builds a float8 from its components; usable from host and device code
+    {
+        float8 val = {a0, a1, a2, a3, a4, a5, a6, a7}; // aggregate initialization: a0..a7 fill the members in declaration order
+        return val;
+    }
+    struct double8 // 8-component double vector. NOTE(review): the only 8-component struct in this group declared without __align__ -- confirm that is intentional
+    {
+        double a0, a1, a2, a3, a4, a5, a6, a7; // the eight components, in order
+    };
+    static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7) // builds a double8 from its components; usable from host and device code
+    {
+        double8 val = {a0, a1, a2, a3, a4, a5, a6, a7}; // aggregate initialization: a0..a7 fill the members in declaration order
+        return val;
+    }
+
 #define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
    template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
    template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
@ -57,7 +130,9 @@ namespace cv { namespace gpu { namespace device
    template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
    template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
    template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
-    template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; };
+    template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; }; \
+    template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
+    template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };

    OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
    OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
@ -74,11 +149,13 @@ namespace cv { namespace gpu { namespace device
    template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
    template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
    template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
+    template<> struct TypeVec<schar, 8> { typedef char8 vec_type; }; // 8-component case: schar maps to char8, following the char2/char3/char4 mappings above

    template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
    template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
    template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
    template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
+    template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; }; // 8-component case: bool maps to uchar8, like the uchar-based 1..4 component mappings above

    template<typename T> struct VecTraits;

@ -87,36 +164,43 @@ namespace cv { namespace gpu { namespace device
    { \
        typedef type elem_type; \
        enum {cn=1}; \
-        static __device__ __host__ type all(type v) {return v;} \
-        static __device__ __host__ type make(type x) {return x;} \
+        static __device__ __host__ __forceinline__ type all(type v) {return v;} \
+        static __device__ __host__ __forceinline__ type make(type x) {return x;} \
    }; \
    template<> struct VecTraits<type ## 1> \
    { \
        typedef type elem_type; \
        enum {cn=1}; \
-        static __device__ __host__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
-        static __device__ __host__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
+        static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
+        static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
    }; \
    template<> struct VecTraits<type ## 2> \
    { \
        typedef type elem_type; \
        enum {cn=2}; \
-        static __device__ __host__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
-        static __device__ __host__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
+        static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
+        static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
    }; \
    template<> struct VecTraits<type ## 3> \
    { \
        typedef type elem_type; \
        enum {cn=3}; \
-        static __device__ __host__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
-        static __device__ __host__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
+        static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
    }; \
    template<> struct VecTraits<type ## 4> \
    { \
        typedef type elem_type; \
        enum {cn=4}; \
-        static __device__ __host__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
-        static __device__ __host__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
+        static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
+    }; \
+    template<> struct VecTraits<type ## 8> \
+    { \
+        typedef type elem_type; \
+        enum {cn=8}; \
+        static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
    };

    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
@ -134,8 +218,8 @@ namespace cv { namespace gpu { namespace device
    { 
        typedef schar elem_type; 
        enum {cn=1}; 
-        static __device__ __host__ schar all(schar v) {return v;}
-        static __device__ __host__ schar make(schar x) {return x;}
+        static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
+        static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
    };
 }}}

--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@ -286,7 +286,7 @@ TEST(BruteForceMatcher)
 {
    // Init CPU matcher

-    int desc_len = 64;
+    int desc_len = 128;

    BruteForceMatcher< L2<float> > matcher;

@ -328,7 +328,7 @@ TEST(BruteForceMatcher)
    d_matcher.knnMatch(d_query, d_train, d_matches, knn);
    GPU_OFF;

-    /*SUBTEST << "radiusMatch";
+    SUBTEST << "radiusMatch";
    float max_distance = 3.8f;

    CPU_ON;
@ -337,7 +337,7 @@ TEST(BruteForceMatcher)

    GPU_ON;
    d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
-    GPU_OFF;*/
+    GPU_OFF;
 }


@ -689,60 +689,66 @@ TEST(threshold)
    Mat src, dst;
    gpu::GpuMat d_src, d_dst;

-    for (int size = 2000; size <= 4000; size += 1000)
+    for (int size = 1000; size <= 4000; size += 1000)
    {
-        SUBTEST << "size " << size << ", 8U, THRESH_TRUNC";
+        SUBTEST << "size " << size << ", 8U, THRESH_BINARY";

        gen(src, size, size, CV_8U, 0, 100);
        dst.create(size, size, CV_8U);

        CPU_ON; 
-        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
+        threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
        CPU_OFF;

        d_src = src;
        d_dst.create(size, size, CV_8U);

        GPU_ON;
-        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
        GPU_OFF;
    }

-    for (int size = 2000; size <= 4000; size += 1000)
+    for (int size = 1000; size <= 4000; size += 1000)
    {
-        SUBTEST << "size " << size << ", 8U, THRESH_BINARY";
+        SUBTEST << "size " << size << ", 32F, THRESH_BINARY";

-        gen(src, size, size, CV_8U, 0, 100);
-        dst.create(size, size, CV_8U);
+        gen(src, size, size, CV_32F, 0, 100);
+        dst.create(size, size, CV_32F);

        CPU_ON; 
        threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
        CPU_OFF;

        d_src = src;
-        d_dst.create(size, size, CV_8U);
+        d_dst.create(size, size, CV_32F);

        GPU_ON;
        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
        GPU_OFF;
    }
+}

-    for (int size = 2000; size <= 4000; size += 1000)
+TEST(pow) // benchmark case: runs pow() on a Mat and gpu::pow() on a GpuMat for the same input
+{
+    Mat src, dst;
+    gpu::GpuMat d_src, d_dst;
+
+    for (int size = 1000; size <= 4000; size += 1000) // size x size inputs, from 1000 to 4000 in steps of 1000
    {
-        SUBTEST << "size " << size << ", 32F, THRESH_TRUNC";
+        SUBTEST << "size " << size << ", 32F"; // label for this sub-case

        gen(src, size, size, CV_32F, 0, 100);
        dst.create(size, size, CV_32F);

-        CPU_ON; 
-        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
+        CPU_ON; // presumably starts the CPU timing region closed by CPU_OFF -- confirm against the macro definitions
+        pow(src, -2.0, dst); // host-side call: exponent -2.0, input src, output dst
        CPU_OFF;

        d_src = src;
        d_dst.create(size, size, CV_32F);

        GPU_ON;
-        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        gpu::pow(d_src, -2.0, d_dst); // GPU-side call with the same exponent, on d_src (assigned from src above)
        GPU_OFF;
    }
 }