diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index d6bbb7eb67..f4f53fc0da 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -78,60 +78,53 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector
 
 namespace cv { namespace gpu { namespace bfmatcher
 {
-    template <typename T>
-    void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
-        bool cc_12, cudaStream_t stream);
-    template <typename T>
-    void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
-        bool cc_12, cudaStream_t stream);
-    template <typename T>
-    void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, 
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
-        bool cc_12, cudaStream_t stream);
-    template <typename T>
-    void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
-        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
-        bool cc_12, cudaStream_t stream);
-    template <typename T>
-    void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
-        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
-        bool cc_12, cudaStream_t stream);
-    template <typename T>
-    void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, 
-        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        bool cc_12, cudaStream_t stream);
-
-    template <typename T>
-    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template <typename T>
-    void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template <typename T>
-    void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-
-    template <typename T>
-    void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template <typename T>
-    void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template <typename T>
-    void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream);
+    template <typename T> void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream);
+    template <typename T> void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream);
+
+    template <typename T> void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, 
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream);
+    template <typename T> void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, 
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream);
+    template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, 
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+        int cc, cudaStream_t stream);
+
+    template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, 
+        int cc, cudaStream_t stream);
+    template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, 
+        int cc, cudaStream_t stream);
+    template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, 
+        int cc, cudaStream_t stream);
+
+    template <typename T> void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, 
+        cudaStream_t stream);
+    template <typename T> void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, 
+        cudaStream_t stream);
+    template <typename T> void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, 
+        cudaStream_t stream);
 }}}
 
 namespace
 {
-    class ImgIdxSetter
+    struct ImgIdxSetter
     {
-    public:
-        ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
-        void operator()(DMatch& m) const {m.imgIdx = imgIdx;}
-    private:
+        explicit inline ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
+        inline void operator()(DMatch& m) const {m.imgIdx = imgIdx;}
         int imgIdx;
     };
 }
@@ -179,9 +172,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
 
     using namespace cv::gpu::bfmatcher;
 
-    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
-        bool cc_12, cudaStream_t stream);
+    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream);
 
     static const match_caller_t match_callers[3][8] =
     {
@@ -213,11 +206,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
     match_caller_t func = match_callers[distType][queryDescs.depth()];
     CV_Assert(func != 0);
 
-    bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
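+    // Encode the device's compute capability as major * 10 + minor (e.g. 1.2 -> 12).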
+    DeviceInfo info;
+    int cc = info.majorVersion() * 10 + info.minorVersion();
 
-    // For single train there is no need to save imgIdx, so we just save imgIdx to trainIdx.
-    // trainIdx store after imgIdx, so we doesn't lose it value.
-    func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12, StreamAccessor::getStream(stream));
+    func(queryDescs, trainDescs, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
 }
 
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
@@ -319,9 +311,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
 
     using namespace cv::gpu::bfmatcher;
 
-    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
-        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-        const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, 
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream);
 
     static const match_caller_t match_callers[3][8] =
     {
@@ -353,9 +345,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
     match_caller_t func = match_callers[distType][queryDescs.depth()];
     CV_Assert(func != 0);
 
-    bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
+    DeviceInfo info;
+    int cc = info.majorVersion() * 10 + info.minorVersion();
 
-    func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12, StreamAccessor::getStream(stream));
+    func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
 }
 
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
@@ -427,8 +420,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
 
     using namespace cv::gpu::bfmatcher;
 
-    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, 
+        int cc, cudaStream_t stream);
 
     static const match_caller_t match_callers[3][8] =
     {
@@ -473,9 +467,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     match_caller_t func = match_callers[distType][queryDescs.depth()];
     CV_Assert(func != 0);
     
-    bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
+    DeviceInfo info;
+    int cc = info.majorVersion() * 10 + info.minorVersion();
 
-    func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, cc_12, StreamAccessor::getStream(stream));
+    func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
 }
 
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
@@ -563,7 +558,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs,
             vector<DMatch>& localMatch = curMatches[queryIdx];
             vector<DMatch>& globalMatch = matches[queryIdx];
 
-            for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx));
+            for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast<int>(imgIdx)));
 
             temp.clear();
             merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp));
@@ -593,8 +588,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
 
     using namespace cv::gpu::bfmatcher;
 
-    typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, 
+        cudaStream_t stream);
 
     static const radiusMatch_caller_t radiusMatch_callers[3][8] =
     {
@@ -636,7 +632,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
     radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()];
     CV_Assert(func != 0);
 
-    func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches.ptr<unsigned int>(), distance, StreamAccessor::getStream(stream));
+    func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance, StreamAccessor::getStream(stream));
 }
 
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches,
@@ -728,7 +724,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
             vector<DMatch>& localMatch = curMatches[queryIdx];
             vector<DMatch>& globalMatch = matches[queryIdx];
 
-            for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx));
+            for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast<int>(imgIdx)));
 
             const size_t oldSize = globalMatch.size();
 
diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu
index 4cd1142d57..c2c7317b1a 100644
--- a/modules/gpu/src/cuda/brute_force_matcher.cu
+++ b/modules/gpu/src/cuda/brute_force_matcher.cu
@@ -42,461 +42,83 @@
 
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/limits.hpp"
-#include "opencv2/gpu/device/datamov_utils.hpp"
+#include "opencv2/gpu/device/utility.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
 
 namespace cv { namespace gpu { namespace bfmatcher
 {
-///////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////// General funcs //////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // Mask strategy
-
-    struct SingleMask
-    {
-        explicit SingleMask(const PtrStep& mask_) : mask(mask_) {}
-        
-        __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
-        {            
-            return mask.ptr(queryIdx)[trainIdx] != 0;
-        }
-
-        const PtrStep mask;
-    };
-
-    struct MaskCollection
-    {
-        explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {}
-
-        __device__ __forceinline__ void nextMask()
-        {
-            curMask = *maskCollection++;
-        }
-        
-        __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
-        {
-            uchar val;
-            return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(queryIdx), trainIdx, val), (val != 0));
-        }
-
-        const PtrStep* maskCollection;
-        PtrStep curMask;
-    };
-
-    struct WithOutMask
-    {
-        __device__ __forceinline__ void nextMask() const
-        {
-        }
-        __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
-        {
-            return true;
-        }
-    };
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // Reduce Sum
-
-    template <int BLOCK_DIM_X> struct SumReductor;
-    template <> struct SumReductor<16>
-    {
-        template <typename T> static __device__ void reduce(volatile T* sdiff_row, T& mySum)
-        {
-            sdiff_row[threadIdx.x] = mySum;
-            
-            if (threadIdx.x < 8) 
-            {
-                sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 8]; 
-                sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 4]; 
-                sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 2];
-                sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 1];  
-            }
-        }
-    };
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // Distance
-
-    template <typename T> struct L1Dist
-    {
-        typedef int ResultType;
-        typedef int ValueType;
-
-        __device__ __forceinline__ L1Dist() : mySum(0) {}
-
-        __device__ __forceinline__ void reduceIter(int val1, int val2)
-        {
-            mySum = __sad(val1, val2, mySum);
-        }
-
-        template <int BLOCK_DIM_X> __device__ __forceinline__ void reduceAll(int* sdiff_row)
-        {
-            SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
-        }
-
-        __device__ __forceinline__ operator int() const
-        {
-            return mySum;
-        }
-
-        int mySum;
-    };
-    template <> struct L1Dist<float>
-    {
-        typedef float ResultType;
-        typedef float ValueType;
-
-        __device__ __forceinline__ L1Dist() : mySum(0.0f) {}
-
-        __device__ __forceinline__ void reduceIter(float val1, float val2)
-        {
-            mySum += fabs(val1 - val2);
-        }
-
-        template <int BLOCK_DIM_X> __device__ __forceinline__ void reduceAll(float* sdiff_row)
-        {
-            SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
-        }
-
-        __device__ __forceinline__ operator float() const
-        {
-            return mySum;
-        }
-
-        float mySum;
-    };
-
-    struct L2Dist
-    {
-        typedef float ResultType;
-        typedef float ValueType;
-
-        __device__ __forceinline__ L2Dist() : mySum(0.0f) {}
-
-        __device__ __forceinline__ void reduceIter(float val1, float val2)
-        {
-            float reg = val1 - val2;
-            mySum += reg * reg;
-        }
-
-        template <int BLOCK_DIM_X> __device__ __forceinline__ void reduceAll(float* sdiff_row)
-        {
-            SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
-        }
-
-        __device__ __forceinline__ operator float() const
-        {
-            return sqrtf(mySum);
-        }
-
-        float mySum;
-    };
-
-    struct HammingDist
-    {
-        typedef int ResultType;
-        typedef int ValueType;
-
-        __device__ __forceinline__ HammingDist() : mySum(0) {}
-
-        __device__ __forceinline__ void reduceIter(int val1, int val2)
-        {
-            mySum += __popc(val1 ^ val2);
-        }
-
-        template <int BLOCK_DIM_X> __device__ __forceinline__ void reduceAll(int* sdiff_row)
-        {
-            SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
-        }
-
-        __device__ __forceinline__ operator int() const
-        {
-            return mySum;
-        }
-
-        int mySum;
-    };
-    
-    ///////////////////////////////////////////////////////////////////////////////
-    // reduceDescDiff
-
-    template <int BLOCK_DIM_X, typename Dist, typename T> 
-    __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row)
-    {
-        for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X)
-        {
-            T trainVal;
-            ForceGlob<T>::Load(trainDescs, i, trainVal);
-            dist.reduceIter(queryDescs[i], trainVal);
-        }
-
-        dist.reduceAll<BLOCK_DIM_X>(sdiff_row);
-    }
 
 ///////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////// Match //////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////
-    
-    ///////////////////////////////////////////////////////////////////////////////
-    // loadDescsVals
-
-    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, typename T, typename U> 
-    __device__ void loadDescsVals(const T* descs, int desc_len, U* queryVals, U* smem)
-    {
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        if (tid < desc_len)
-        {
-            smem[tid] = descs[tid];
-        }
-        __syncthreads();
-
-        #pragma unroll
-        for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X)
-        {
-            *queryVals = smem[i];
-            ++queryVals;
-        }
-    }
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // reduceDescDiffCached
-
-    template <int N> struct UnrollDescDiff
-    {
-        template <typename Dist, typename T>
-        static __device__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, int ind)
-        {
-            if (ind < desc_len)
-            {
-                T trainVal;
-                ForceGlob<T>::Load(trainDescs, ind, trainVal);
-                dist.reduceIter(*queryVals, trainVal);
-
-                ++queryVals;
-
-                UnrollDescDiff<N - 1>::calcCheck(queryVals, trainDescs, desc_len, dist, ind + blockDim.x);
-            }
-        }
-
-        template <typename Dist, typename T>
-        static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist)
-        {
-            T trainVal;
-            ForceGlob<T>::Load(trainDescs, 0, trainVal);
-            dist.reduceIter(*queryVals, trainVal);
-
-            ++queryVals;
-            trainDescs += blockDim.x;
-
-            UnrollDescDiff<N - 1>::calcWithoutCheck(queryVals, trainDescs, dist);
-        }
-    };
-    template <> struct UnrollDescDiff<0>
-    {
-        template <typename Dist, typename T>
-        static __device__ __forceinline__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, 
-            Dist& dist, int ind)
-        {
-        }
-
-        template <typename Dist, typename T>
-        static __device__ __forceinline__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist)
-        {
-        }
-    };
-
-    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool WITH_OUT_CHECK> struct DescDiffCalculator;
-    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN> 
-    struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, false>
-    {
-        template <typename Dist, typename T>
-        static __device__ __forceinline__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist)
-        {
-            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(queryVals, trainDescs, desc_len, dist, threadIdx.x);
-        }
-    };
-    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN> 
-    struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, true>
-    {
-        template <typename Dist, typename T>
-        static __device__ __forceinline__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist)
-        {
-            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(queryVals, trainDescs + threadIdx.x, dist);
-        }
-    };
-
-    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>
-    __device__ __forceinline__ void reduceDescDiffCached(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row)
-    {        
-        DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(queryVals, trainDescs, desc_len, dist);
-        
-        dist.reduceAll<BLOCK_DIM_X>(sdiff_row);
-    }
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // warpReduceMinIdxIdx
-
-    template <int BLOCK_DIM_Y> struct MinIdxIdxWarpReductor;    
-    template <> struct MinIdxIdxWarpReductor<16>
-    {
-        template <typename T> 
-        static __device__ void reduce(T& myMin, int& myBestTrainIdx, int& myBestImgIdx, volatile T* smin, volatile int* strainIdx, volatile int* simgIdx)
-        {
-            const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-            if (tid < 8)
-            {
-                myMin = smin[tid];
-                myBestTrainIdx = strainIdx[tid];
-                myBestImgIdx = simgIdx[tid];
-
-                float reg = smin[tid + 8];
-                if (reg < myMin)
-                {
-                    smin[tid] = myMin = reg;
-                    strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8];
-                    simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8];
-                }
-
-                reg = smin[tid + 4];
-                if (reg < myMin)
-                {
-                    smin[tid] = myMin = reg;
-                    strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4];
-                    simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4];
-                }
-            
-                reg = smin[tid + 2];
-                if (reg < myMin)
-                {
-                    smin[tid] = myMin = reg;
-                    strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2];
-                    simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2];
-                }
-            
-                reg = smin[tid + 1];
-                if (reg < myMin)
-                {
-                    smin[tid] = myMin = reg;
-                    strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1];
-                    simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1];
-                }
-            }
-        }
-    };
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // findBestMatch
 
     template <int BLOCK_DIM_Y, typename T>
-    __device__ void findBestMatch(T& myMin, int& myBestTrainIdx, int& myBestImgIdx, T* smin, int* strainIdx, int* simgIdx)
+    __device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx)
     {
         if (threadIdx.x == 0)
         {
-            smin[threadIdx.y] = myMin;
-            strainIdx[threadIdx.y] = myBestTrainIdx;
-            simgIdx[threadIdx.y] = myBestImgIdx;
+            smin[threadIdx.y] = myDist;
+            sIdx[threadIdx.y] = myIdx;
         }
         __syncthreads();
 
-        MinIdxIdxWarpReductor<BLOCK_DIM_Y>::reduce(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx);
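+        // reducePredVal keeps the smallest distance (per the less<> predicate) together with its (trainIdx, imgIdx) pair.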
+        reducePredVal<BLOCK_DIM_Y>(smin, myDist, sIdx, myIdx, threadIdx.y * blockDim.x + threadIdx.x, less<volatile T>());
     }
-    
-    ///////////////////////////////////////////////////////////////////////////////
-    // ReduceDescCalculator
-
-    template <int BLOCK_DIM_X, typename T> struct ReduceDescCalculatorSimple
-    {
-        __device__ __forceinline__ void prepare(const T* queryDescs_, int, void*)
-        {
-            queryDescs = queryDescs_;
-        }
-
-        template <typename Dist>
-        __device__ __forceinline__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const
-        {
-            reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, desc_len, dist, sdiff_row);
-        }
-
-        const T* queryDescs;
-    };
 
-    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename T, typename U>
-    struct ReduceDescCalculatorCached
+    template <typename Dist, typename VecDiff, typename T, typename Mask>
+    __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& train, const Mask& m, const VecDiff& vecDiff,
+        typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row)
     {
-        __device__ __forceinline__ void prepare(const T* queryDescs, int desc_len, U* smem)
-        {
-            loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem);
-            __syncthreads();
-        }
-
-        template <typename Dist>
-        __device__ __forceinline__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const
-        {
-            reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs, desc_len, dist, sdiff_row);
-        }
-
-        U queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];
-    };
-    
-    ///////////////////////////////////////////////////////////////////////////////
-    // matchDescs loop
-
-    template <typename Dist, typename ReduceDescCalculator, typename T, typename Mask>
-    __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& trainDescs_,  
-        const Mask& m, const ReduceDescCalculator& reduceDescCalc,
-        typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row)
-    {
-        for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; trainIdx += blockDim.y)
+        for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
         {
             if (m(queryIdx, trainIdx))
             {
-                const T* trainDescs = trainDescs_.ptr(trainIdx);
+                const T* trainDescs = train.ptr(trainIdx);
 
                 Dist dist;
 
-                reduceDescCalc.calc(trainDescs, trainDescs_.cols, dist, sdiff_row);
+                vecDiff.calc(trainDescs, train.cols, dist, sdiff_row, threadIdx.x);
 
-                if (threadIdx.x == 0)
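+                // Assumes vecDiff.calc leaves the reduced distance visible to every thread in the row (the old code compared on thread 0 only).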
+                const typename Dist::result_type res = dist;
+
+                if (res < myDist)
                 {
-                    if (dist < myMin)
-                    {
-                        myMin = dist;
-                        myBestTrainIdx = trainIdx;
-                        myBestImgIdx = imgIdx;
-                    }
+                    myDist = res;
+                    myIdx.x = trainIdx;
+                    myIdx.y = imgIdx;
                 }
             }
         }
     }
 
-    ///////////////////////////////////////////////////////////////////////////////
-    // Train collection loop strategy
-
     template <typename T> struct SingleTrain
     {
-        explicit SingleTrain(const DevMem2D_<T>& trainDescs_) : trainDescs(trainDescs_)
+        explicit SingleTrain(const DevMem2D_<T>& train_) : train(train_)
         {
         }
 
-        template <typename Dist, typename ReduceDescCalculator, typename Mask>
-        __device__ __forceinline__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, 
-            typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const
+        template <typename Dist, typename VecDiff, typename Mask>
+        __device__ __forceinline__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, 
+            typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const
         {
-            matchDescs<Dist>(queryIdx, 0, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
+            matchDescs<Dist>(queryIdx, 0, train, m, vecDiff, myDist, myIdx, sdiff_row);
         }
 
         __device__ __forceinline__ int desc_len() const
         {
-            return trainDescs.cols;
+            return train.cols;
         }
 
-        const DevMem2D_<T> trainDescs;
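+        // Single train image: there is no imgIdx to record, so only trainIdx and distance are written.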
+        static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, 
+            float myDist, const int2& myIdx, int queryIdx)
+        {
+            trainIdx[queryIdx] = myIdx.x;
+            distance[queryIdx] = myDist;
+        }
+
+        const DevMem2D_<T> train;
     };
 
     template <typename T> struct TrainCollection
@@ -506,15 +128,15 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
         }
 
-        template <typename Dist, typename ReduceDescCalculator, typename Mask>
-        __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, 
-            typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const
+        template <typename Dist, typename VecDiff, typename Mask>
+        __device__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, 
+            typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const
         {
             for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
             {
-                const DevMem2D_<T> trainDescs = trainCollection[imgIdx];
-                m.nextMask();
-                matchDescs<Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
+                const DevMem2D_<T> train = trainCollection[imgIdx];
+                m.next();
+                matchDescs<Dist>(queryIdx, imgIdx, train, m, vecDiff, myDist, myIdx, sdiff_row);
             }
         }
 
@@ -523,84 +145,93 @@ namespace cv { namespace gpu { namespace bfmatcher
             return desclen;
         }
 
+        static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, 
+            float myDist, const int2& myIdx, int queryIdx)
+        {
+            trainIdx[queryIdx] = myIdx.x;
+            imgIdx[queryIdx] = myIdx.y;
+            distance[queryIdx] = myDist;
+        }
+
         const DevMem2D_<T>* trainCollection;
-        int nImg;
-        int desclen;
+        const int nImg;
+        const int desclen;
     };
 
-    ///////////////////////////////////////////////////////////////////////////////
-    // Match kernel
-
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T, typename Train, typename Mask>
-    __global__ void match(const PtrStep_<T> queryDescs_, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance)
+    template <typename VecDiff, typename Dist, typename T, typename Train, typename Mask>
+    __device__ void distanceCalcLoop(const PtrStep_<T>& query, const Train& train, const Mask& mask, int queryIdx, 
+        typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* smem)
     {
-        __shared__ typename Dist::ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y];        
-        
-        const int queryIdx = blockIdx.x;
-        
-        int myBestTrainIdx = -1;
-        int myBestImgIdx = -1;
-        typename Dist::ResultType myMin = numeric_limits<typename Dist::ResultType>::max();
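+        // VecDiff caches the query descriptor: VecDiffGlobal streams it from global memory, VecDiffCachedRegister preloads it into registers.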
+        const VecDiff vecDiff(query.ptr(queryIdx), train.desc_len(), (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);
+    
+        typename Dist::result_type* sdiff_row = smem + blockDim.x * threadIdx.y;
 
-        {
-            typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
+        Mask m = mask;
 
-            Mask m = mask;
+        myIdx.x = -1;
+        myIdx.y = -1;
+        myDist = numeric_limits<typename Dist::result_type>::max();
 
-            ReduceDescCalculator reduceDescCalc;
+        train.template loop<Dist>(queryIdx, m, vecDiff, myDist, myIdx, sdiff_row);
+    }
 
-            reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), (typename Dist::ValueType*)smem);
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Train, typename Mask>
+    __global__ void match(const PtrStep_<T> query, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance)
+    {
+        __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];        
         
-            train.template loop<Dist>(queryIdx, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
-        }
+        const int queryIdx = blockIdx.x;
+        
+        int2 myIdx;
+        typename Dist::result_type myDist;
+
+        distanceCalcLoop<VecDiff, Dist>(query, train, mask, queryIdx, myDist, myIdx, smem);
         __syncthreads();
 
-        typename Dist::ResultType* smin = smem;
-        int* strainIdx = (int*)(smin + BLOCK_DIM_Y);
-        int* simgIdx = strainIdx + BLOCK_DIM_Y;
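+        // Reuse the distance shared-memory buffer for the per-row minima and their index pairs.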
+        typename Dist::result_type* smin = smem;
+        int2* sIdx = (int2*)(smin + BLOCK_DIM_Y);
 
-        findBestMatch<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx);
+        findBestMatch<BLOCK_DIM_Y>(myDist, myIdx, smin, sIdx);
 
         if (threadIdx.x == 0 && threadIdx.y == 0)
-        {
-            imgIdx[queryIdx] = myBestImgIdx;
-            trainIdx[queryIdx] = myBestTrainIdx;
-            distance[queryIdx] = myMin;
-        }
+            Train::storeResult(distance, trainIdx, imgIdx, myDist, myIdx, queryIdx);
     }
-    
+
     ///////////////////////////////////////////////////////////////////////////////
-    // Match kernel callers
+    // Match kernel caller
 
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask>
-    void matchSimple_caller(const DevMem2D_<T>& queryDescs, const Train& train, 
-        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream)
+    void matchSimple_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask, 
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
+        cudaStream_t stream)
     {
         StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp
 
-        dim3 grid(queryDescs.rows, 1, 1);
-        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+        const dim3 grid(query.rows, 1, 1);
+        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
 
-        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>
-            <<<grid, threads, 0, stream>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
+        match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
+            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>
-    void matchCached_caller(const DevMem2D_<T>& queryDescs, const Train& train, 
-        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream)
+
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>
+    void matchCached_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask, 
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
+        cudaStream_t stream)
     {
-        StaticAssert<BLOCK_DIM_Y <= 64>::check();                                // blockDimY vals must reduce by warp
-        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_DESCRIPTORS_LEN>::check(); // block size must be greter than descriptors length
-        StaticAssert<MAX_DESCRIPTORS_LEN % BLOCK_DIM_X == 0>::check();           // max descriptors length must divide to blockDimX
+        StaticAssert<BLOCK_DIM_Y <= 64>::check();                    // BLOCK_DIM_Y values must be reducible within a warp
+        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be at least the descriptor length
+        StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();           // max descriptor length must be divisible by BLOCK_DIM_X
 
-        dim3 grid(queryDescs.rows, 1, 1);
-        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+        const dim3 grid(query.rows, 1, 1);
+        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
 
-        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T, typename Dist::ValueType>, Dist, T>
-              <<<grid, threads, 0, stream>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
+        match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
+              <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
@@ -608,187 +239,193 @@ namespace cv { namespace gpu { namespace bfmatcher
     }
     
     ///////////////////////////////////////////////////////////////////////////////
-    // Match caller
+    // Match Dispatcher
 
     template <typename Dist, typename T, typename Train, typename Mask>
-    void matchDispatcher(const DevMem2D_<T>& queryDescs, const Train& train, 
-        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        bool cc_12, cudaStream_t stream)
+    void matchDispatcher(const DevMem2D_<T>& query, const Train& train, const Mask& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+        int cc, cudaStream_t stream)
     {
-        if (queryDescs.cols < 64)
-            matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
-        else if (queryDescs.cols == 64)
-            matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
-        else if (queryDescs.cols < 128)
-            matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
-        else if (queryDescs.cols == 128 && cc_12)
-            matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
-        else if (queryDescs.cols < 256 && cc_12)
-            matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
-        else if (queryDescs.cols == 256 && cc_12)
-            matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
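+        // Dispatch on descriptor length; the 128- and 256-element cached kernels require compute capability >= 1.2.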
+        if (query.cols < 64)
+        {
+            matchCached_caller<16, 16, 64, false, Dist>(
+                query, train, mask, 
+                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), 
+                stream);
+        }
+        else if (query.cols == 64)
+        {
+            matchCached_caller<16, 16, 64, true, Dist>(
+                query, train, mask, 
+                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), 
+                stream);
+        }
+        else if (query.cols < 128)
+        {
+            matchCached_caller<16, 16, 128, false, Dist>(
+                query, train, mask, 
+                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), 
+                stream);
+        }
+        else if (query.cols == 128 && cc >= 12)
+        {
+            matchCached_caller<16, 16, 128, true, Dist>(
+                query, train, mask, 
+                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), 
+                stream);
+        }
+        else if (query.cols < 256 && cc >= 12)
+        {
+            matchCached_caller<16, 16, 256, false, Dist>(
+                query, train, mask, 
+                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), 
+                stream);
+        }
+        else if (query.cols == 256 && cc >= 12)
+        {
+            matchCached_caller<16, 16, 256, true, Dist>(
+                query, train, mask, 
+                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), 
+                stream);
+        }
         else
-            matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
+        {
+            matchSimple_caller<16, 16, Dist>(
+                query, train, mask, 
+                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), 
+                stream);
+        }
     }
+    
+    ///////////////////////////////////////////////////////////////////////////////
+    // Match caller
 
-    template <typename T>
-    void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, 
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        bool cc_12, cudaStream_t stream)
+    template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance,
+        int cc, cudaStream_t stream)
     {
-        SingleTrain<T> train((DevMem2D_<T>)trainDescs);
+        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
         if (mask.data)
-        {
-            SingleMask m(mask);
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
         else
-        {
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
     }
 
-    template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-
-    template <typename T>
-    void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, 
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
-        bool cc_12, cudaStream_t stream)
+    template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+
+    template <typename T> void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream)
     {
-        SingleTrain<T> train((DevMem2D_<T>)trainDescs);
+        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
         if (mask.data)
-        {
-            SingleMask m(mask);
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
         else
-        {
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
     }
 
-    template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-
-    template <typename T>
-    void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, 
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
-        bool cc_12, cudaStream_t stream)
+    template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+
+    template <typename T> void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream)
     {
-        SingleTrain<T> train((DevMem2D_<T>)trainDescs);
+        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
         if (mask.data)
-        {
-            SingleMask m(mask);
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
         else
-        {
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
     }
 
-    template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchSingleHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchSingleHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
 
-    template <typename T>
-    void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, 
-        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, 
-        const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
+    template <typename T> void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, 
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream)
     {
-        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
+        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
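+        // trainCollection holds one DevMem2D_<T> header per train image;
+        // query.cols supplies the common descriptor length.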
         if (maskCollection.data)
-        {
-            MaskCollection mask(maskCollection.data);
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
         else
-        {
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
     }
 
-    template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-
-    template <typename T>
-    void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, 
-        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, 
-        const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
+    template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+
+    template <typename T> void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, 
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream)
     {
-        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
+        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
         if (maskCollection.data)
-        {
-            MaskCollection mask(maskCollection.data);
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
         else
-        {
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
     }
 
-    template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-
-    template <typename T>
-    void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, 
-        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, 
-        const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
+    template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+
+    template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, 
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream)
     {
-        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
+        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
         if (maskCollection.data)
-        {
-            MaskCollection mask(maskCollection.data);
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
         else
-        {
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
-        }
+            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
     }
 
-    template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
-    template void matchCollectionHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
     
 ///////////////////////////////////////////////////////////////////////////////////
 //////////////////////////////////// Knn Match ////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////
 
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename ReduceDescCalculator, typename T, typename Mask>
+    template <typename VecDiff, typename Dist, typename T, typename Mask>
     __device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx,
-        typename Dist::ResultType& distMin1, typename Dist::ResultType& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, 
-        typename Dist::ResultType* smem)
+        typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, 
+        typename Dist::result_type* smem)
     {
-        ReduceDescCalculator reduceDescCalc;
-
-        reduceDescCalc.prepare(query.ptr(queryIdx), train.cols, (typename Dist::ValueType*)smem);
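+        // The VecDiff policy decides how the query row is read: VecDiffGlobal
+        // streams it from global memory, VecDiffCachedRegister caches it first
+        // (see the kernel callers below).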
+        const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);
+        
+        typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y;
         
-        typename Dist::ResultType* sdiffRow = smem + BLOCK_DIM_X * threadIdx.y;
+        distMin1 = numeric_limits<typename Dist::result_type>::max();
+        distMin2 = numeric_limits<typename Dist::result_type>::max();
 
-        for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += BLOCK_DIM_Y)
+        bestTrainIdx1 = -1;
+        bestTrainIdx2 = -1;
+
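+        // Each row of threads scans a strided subset of the train descriptors,
+        // keeping the two smallest distances and their train indices.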
+        for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
         {
             if (m(queryIdx, trainIdx))
             {
@@ -796,48 +433,44 @@ namespace cv { namespace gpu { namespace bfmatcher
 
                 const T* trainRow = train.ptr(trainIdx);
                 
-                reduceDescCalc.calc(trainRow, train.cols, dist, sdiffRow);
+                vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x);
 
-                if (threadIdx.x == 0)
-                {
-                    typename Dist::ResultType val = dist;
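+                // calc() leaves the fully reduced distance in dist for every
+                // thread of the row, so the old threadIdx.x == 0 guard around
+                // the best-match update is no longer needed.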
+                const typename Dist::result_type val = dist;
 
-                    if (val < distMin1)
-                    {
-                        distMin1 = val;
-                        bestTrainIdx1 = trainIdx;
-                    }
-                    else if (val < distMin2)
-                    {
-                        distMin2 = val;
-                        bestTrainIdx2 = trainIdx;
-                    }
+                if (val < distMin1)
+                {
+                    distMin1 = val;
+                    bestTrainIdx1 = trainIdx;
+                }
+                else if (val < distMin2)
+                {
+                    distMin2 = val;
+                    bestTrainIdx2 = trainIdx;
                 }
             }
         }
     }
 
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename ReduceDescCalculator, typename T, typename Mask>
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Mask>
     __global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, PtrStep_<int2> trainIdx, PtrStep_<float2> distance)
     {
-        typedef typename Dist::ResultType ResultType;
-        typedef typename Dist::ValueType ValueType;
+        typedef typename Dist::result_type result_type;
+        typedef typename Dist::value_type value_type;
 
-        __shared__ ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y];
+        __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
 
         const int queryIdx = blockIdx.x;
 
-        ResultType distMin1 = numeric_limits<ResultType>::max();
-        ResultType distMin2 = numeric_limits<ResultType>::max();
+        result_type distMin1;
+        result_type distMin2;
 
-        int bestTrainIdx1 = -1;
-        int bestTrainIdx2 = -1;
+        int bestTrainIdx1;
+        int bestTrainIdx2;
 
-        distanceCalcLoop<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, ReduceDescCalculator>(query, train, m, queryIdx, 
-            distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem);
+        distanceCalcLoop<VecDiff, Dist>(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem);
         __syncthreads();
 
-        volatile ResultType* sdistMinRow = smem;
+        volatile result_type* sdistMinRow = smem;
         volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y);
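+        // Per thread row, the best distance lands in the first BLOCK_DIM_Y smem
+        // slots and the second best in the next BLOCK_DIM_Y; thread (0, 0)
+        // reduces both ranges sequentially below.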
 
         if (threadIdx.x == 0)
@@ -852,8 +485,8 @@ namespace cv { namespace gpu { namespace bfmatcher
 
         if (threadIdx.x == 0 && threadIdx.y == 0)
         {
-            distMin1 = numeric_limits<ResultType>::max();
-            distMin2 = numeric_limits<ResultType>::max();
+            distMin1 = numeric_limits<result_type>::max();
+            distMin2 = numeric_limits<result_type>::max();
 
             bestTrainIdx1 = -1;
             bestTrainIdx2 = -1;
@@ -861,7 +494,7 @@ namespace cv { namespace gpu { namespace bfmatcher
             #pragma unroll
             for (int i = 0; i < BLOCK_DIM_Y; ++i)
             {
-                ResultType val = sdistMinRow[i];
+                result_type val = sdistMinRow[i];
 
                 if (val < distMin1)
                 {
@@ -878,7 +511,7 @@ namespace cv { namespace gpu { namespace bfmatcher
             #pragma unroll
             for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i)
             {
-                ResultType val = sdistMinRow[i];
+                result_type val = sdistMinRow[i];
 
                 if (val < distMin2)
                 {
@@ -892,87 +525,131 @@ namespace cv { namespace gpu { namespace bfmatcher
         }
     }
 
+    ///////////////////////////////////////////////////////////////////////////////
+    // Knn 2 Match kernel caller
+
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    void knnMatch2Simple_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, 
-        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, cudaStream_t stream)
+    void knnMatch2Simple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, 
+        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, 
+        cudaStream_t stream)
     {
-        dim3 grid(queryDescs.rows, 1, 1);
-        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+        const dim3 grid(query.rows, 1, 1);
+        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
 
-        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, T>
-            <<<grid, threads, 0, stream>>>(queryDescs, trainDescs, mask, trainIdx, distance);
+        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
+            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx, distance);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>
-    void knnMatch2Cached_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, 
-        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, cudaStream_t stream)
+
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>
+    void knnMatch2Cached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, 
+        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, 
+        cudaStream_t stream)
     {
-        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_DESCRIPTORS_LEN>::check(); // block size must be greter than descriptors length
-        StaticAssert<MAX_DESCRIPTORS_LEN % BLOCK_DIM_X == 0>::check();           // max descriptors length must divide to blockDimX
+        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be at least the max descriptor length
+        StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();           // max descriptor length must be a multiple of BLOCK_DIM_X
 
-        dim3 grid(queryDescs.rows, 1, 1);
-        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+        const dim3 grid(query.rows, 1, 1);
+        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
 
-        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T, typename Dist::ValueType>, T>
-              <<<grid, threads, 0, stream>>>(queryDescs, trainDescs, mask, trainIdx, distance);
+        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
+              <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx, distance);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Knn 2 Match Dispatcher
     
     template <typename Dist, typename T, typename Mask>
     void knnMatch2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, 
-        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, bool cc_12, cudaStream_t stream)
+        const DevMem2D& trainIdx, const DevMem2D& distance, 
+        int cc, cudaStream_t stream)
     {
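+        // Dispatch on the descriptor length: lengths below 128 always take the
+        // register-cached kernel, lengths from 128 to 256 require compute
+        // capability >= 1.2 (cc replaces the old cc_12 flag), and anything
+        // longer falls back to the plain global-memory kernel.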
         if (query.cols < 64)
-            knnMatch2Cached_caller<16, 16, 64, false, Dist>(query, train, mask, trainIdx, distance, stream);
+        {
+            knnMatch2Cached_caller<16, 16, 64, false, Dist>(
+                query, train, mask, 
+                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
+                stream);
+        }
         else if (query.cols == 64)
-            knnMatch2Cached_caller<16, 16, 64, true, Dist>(query, train, mask, trainIdx, distance, stream);
+        {
+            knnMatch2Cached_caller<16, 16, 64, true, Dist>(
+                query, train, mask, 
+                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), 
+                stream);
+        }
         else if (query.cols < 128)
-            knnMatch2Cached_caller<16, 16, 128, false, Dist>(query, train, mask, trainIdx, distance, stream);
-        else if (query.cols == 128 && cc_12)
-            knnMatch2Cached_caller<16, 16, 128, true, Dist>(query, train, mask, trainIdx, distance, stream);
-        else if (query.cols < 256 && cc_12)
-            knnMatch2Cached_caller<16, 16, 256, false, Dist>(query, train, mask, trainIdx, distance, stream);
-        else if (query.cols == 256 && cc_12)
-            knnMatch2Cached_caller<16, 16, 256, true, Dist>(query, train, mask, trainIdx, distance, stream);
+        {
+            knnMatch2Cached_caller<16, 16, 128, false, Dist>(
+                query, train, mask, 
+                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), 
+                stream);
+        }
+        else if (query.cols == 128 && cc >= 12)
+        {
+            knnMatch2Cached_caller<16, 16, 128, true, Dist>(
+                query, train, mask, 
+                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), 
+                stream);
+        }
+        else if (query.cols < 256 && cc >= 12)
+        {
+            knnMatch2Cached_caller<16, 16, 256, false, Dist>(
+                query, train, mask, 
+                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), 
+                stream);
+        }
+        else if (query.cols == 256 && cc >= 12)
+        {
+            knnMatch2Cached_caller<16, 16, 256, true, Dist>(
+                query, train, mask, 
+                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), 
+                stream);
+        }
         else
-            knnMatch2Simple_caller<16, 16, Dist>(query, train, mask, trainIdx, distance, stream);
+        {
+            knnMatch2Simple_caller<16, 16, Dist>(
+                query, train, mask, 
+                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
+                stream);
+        }
     }
     
     ///////////////////////////////////////////////////////////////////////////////
     // Calc distance kernel
 
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    __global__ void calcDistance(const PtrStep_<T> queryDescs_, const DevMem2D_<T> trainDescs_, const Mask mask, PtrStepf distance)
+    __global__ void calcDistance(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf distance)
     {
-        __shared__ typename Dist::ResultType sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
+        __shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
 
-        typename Dist::ResultType* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
+        typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
         
         const int queryIdx = blockIdx.x;
-        const T* queryDescs = queryDescs_.ptr(queryIdx);
+        const T* queryDescs = query.ptr(queryIdx);
 
         const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
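+        // blockIdx.x selects the query descriptor and blockIdx.y tiles the train
+        // set, so each thread row fills one entry of the distance matrix.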
 
-        if (trainIdx < trainDescs_.rows)
+        if (trainIdx < train.rows)
         {
-            const T* trainDescs = trainDescs_.ptr(trainIdx);
+            const T* trainDescs = train.ptr(trainIdx);
 
-            typename Dist::ResultType myDist = numeric_limits<typename Dist::ResultType>::max();
+            typename Dist::result_type myDist = numeric_limits<typename Dist::result_type>::max();
 
             if (mask(queryIdx, trainIdx))
             {
                 Dist dist;
 
-                reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, trainDescs_.cols, dist, sdiff_row);
+                calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);
 
-                if (threadIdx.x == 0)
-                    myDist = dist;
+                myDist = dist;
             }
             
             if (threadIdx.x == 0)
@@ -984,150 +661,24 @@ namespace cv { namespace gpu { namespace bfmatcher
     // Calc distance kernel caller
 
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    void calcDistance_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, 
-        const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)
+    void calcDistance_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)
     {
-        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
-        dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);
+        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+        const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);
 
-        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(
-            queryDescs, trainDescs, mask, distance);
+        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, mask, distance);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
-        
-    ///////////////////////////////////////////////////////////////////////////////
-    // warpReduceMinIdx
 
-    template <int BLOCK_SIZE, typename T> 
-    __device__ void warpReduceMinIdx(volatile T* sdist, volatile int* strainIdx, T& myMin, int tid)
+    template <typename Dist, typename T, typename Mask>
+    void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream)
     {
-        if (tid < 32)
-        {
-            if (BLOCK_SIZE >= 64) 
-            { 
-                T reg = sdist[tid + 32];
-
-                if (reg < myMin)
-                {
-                    sdist[tid] = myMin = reg;
-                    strainIdx[tid] = strainIdx[tid + 32];
-                }
-            }
-            if (BLOCK_SIZE >= 32) 
-            { 
-                T reg = sdist[tid + 16];
-
-                if (reg < myMin)
-                {
-                    sdist[tid] = myMin = reg;
-                    strainIdx[tid] = strainIdx[tid + 16];
-                }
-            }
-            if (BLOCK_SIZE >= 16) 
-            { 
-                T reg = sdist[tid + 8];
-
-                if (reg < myMin)
-                {
-                    sdist[tid] = myMin = reg;
-                    strainIdx[tid] = strainIdx[tid + 8];
-                }
-            }
-            if (BLOCK_SIZE >= 8) 
-            { 
-                T reg = sdist[tid + 4];
-
-                if (reg < myMin)
-                {
-                    sdist[tid] = myMin = reg;
-                    strainIdx[tid] = strainIdx[tid + 4];
-                }
-            }
-            if (BLOCK_SIZE >= 4) 
-            { 
-                T reg = sdist[tid + 2];
-
-                if (reg < myMin)
-                {
-                    sdist[tid] = myMin = reg;
-                    strainIdx[tid] = strainIdx[tid + 2];
-                } 
-            }
-            if (BLOCK_SIZE >= 2) 
-            { 
-                T reg = sdist[tid + 1];
-
-                if (reg < myMin)
-                {
-                    sdist[tid] = myMin = reg;
-                    strainIdx[tid] = strainIdx[tid + 1];
-                }
-            }
-        }
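+        // Fixed 16x16 launch; allDist receives the full query x train distance
+        // matrix for findBestMatch to consume.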
+        calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast<DevMem2Df>(allDist), stream);
     }
-    
-    template <int BLOCK_SIZE, typename T> 
-    __device__ void reduceMinIdx(const T* dist, int n, T* sdist, int* strainIdx)
-    {
-        const int tid = threadIdx.x;
-        
-        T myMin = numeric_limits<T>::max();
-        int myMinIdx = -1;
-
-        for (int i = tid; i < n; i += BLOCK_SIZE)
-        {
-            T reg = dist[i];
-            if (reg < myMin)
-            {
-                myMin = reg;
-                myMinIdx = i;
-            }
-        }
-
-        sdist[tid] = myMin;
-        strainIdx[tid] = myMinIdx;
-        __syncthreads();
-
-        if (BLOCK_SIZE >= 512 && tid < 256) 
-        {
-            T reg = sdist[tid + 256];
 
-            if (reg < myMin)
-            {
-                sdist[tid] = myMin = reg;
-                strainIdx[tid] = strainIdx[tid + 256];
-            }
-            __syncthreads(); 
-        }
-        if (BLOCK_SIZE >= 256 && tid < 128) 
-        {
-            T reg = sdist[tid + 128];
-
-            if (reg < myMin)
-            {
-                sdist[tid] = myMin = reg;
-                strainIdx[tid] = strainIdx[tid + 128];
-            }
-            __syncthreads(); 
-        }
-        if (BLOCK_SIZE >= 128 && tid < 64) 
-        {
-            T reg = sdist[tid + 64];
-
-            if (reg < myMin)
-            {
-                sdist[tid] = myMin = reg;
-                strainIdx[tid] = strainIdx[tid + 64];
-            }
-            __syncthreads(); 
-        }
-        
-        warpReduceMinIdx<BLOCK_SIZE>(sdist, strainIdx, myMin, tid);
-    }
-    
     ///////////////////////////////////////////////////////////////////////////////
     // find knn match kernel
 
@@ -1143,14 +694,29 @@ namespace cv { namespace gpu { namespace bfmatcher
         int* trainIdx = trainIdx_.ptr(queryIdx);
         float* distance = distance_.ptr(queryIdx);
 
-        reduceMinIdx<BLOCK_SIZE>(allDist, allDist_.cols, sdist, strainIdx);
+        float dist = numeric_limits<float>::max();
+        int bestIdx = -1;
+        
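+        // Strided scan of this query's row of allDist; the per-thread minima are
+        // then combined with the generic reducePredVal shared-memory reduction.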
+        for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE)
+        {
+            float reg = allDist[i];
+            if (reg < dist)
+            {
+                dist = reg;
+                bestIdx = i;
+            }
+        }
+
+        sdist[threadIdx.x] = dist;
+        strainIdx[threadIdx.x] = bestIdx;
+        __syncthreads();
+
+        reducePredVal<BLOCK_SIZE>(sdist, dist, strainIdx, bestIdx, threadIdx.x, less<volatile float>());
 
         if (threadIdx.x == 0)
         {
-            float dist = sdist[0];
             if (dist < numeric_limits<float>::max())
             {
-                int bestIdx = strainIdx[0];
                 allDist[bestIdx] = numeric_limits<float>::max();
                 trainIdx[i] = bestIdx;
                 distance[i] = dist;
@@ -1161,13 +727,12 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // find knn match kernel caller
 
-    template <int BLOCK_SIZE>
-    void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
+    template <int BLOCK_SIZE> void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
     {
-        dim3 threads(BLOCK_SIZE, 1, 1);
-        dim3 grid(trainIdx.rows, 1, 1);
+        const dim3 threads(BLOCK_SIZE, 1, 1);
+        const dim3 grid(trainIdx.rows, 1, 1);
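+        // Launch the kernel k times: each pass extracts the current best match,
+        // and findBestMatch masks its allDist slot with FLT_MAX so the next pass
+        // yields the runner-up.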
 
-        for (int i = 0; i < knn; ++i)
+        for (int i = 0; i < k; ++i)
         {
             findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);
             cudaSafeCall( cudaGetLastError() );
@@ -1176,121 +741,116 @@ namespace cv { namespace gpu { namespace bfmatcher
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
-    
-    ///////////////////////////////////////////////////////////////////////////////
-    // knn match caller
-
-    template <typename Dist, typename T, typename Mask>
-    void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
-    {
-        calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist, stream);
-    }
 
-    void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
+    void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream)
     {
-        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist, stream);
+        findKnnMatch_caller<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), static_cast<DevMem2Df>(allDist), stream);
     }
+    
+    ///////////////////////////////////////////////////////////////////////////////
+    // knn match Dispatcher
 
-    template < typename Dist, typename T >
-    void knnMatchDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream)
+    template <typename Dist, typename T>
+    void knnMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, 
+        int cc, cudaStream_t stream)
     {
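+        // k == 2 has a dedicated kernel that tracks the two best matches in a
+        // single pass; any other k computes the full distance matrix (allDist)
+        // first and then extracts the k minima.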
         if (mask.data)
         {
-            if (knn == 2)
+            if (k == 2)
             {
-                knnMatch2Dispatcher<Dist>(queryDescs, trainDescs, SingleMask(mask), (DevMem2D_<int2>)trainIdx, (DevMem2D_<float2>)distance, cc_12, stream);
+                knnMatch2Dispatcher<Dist>(query, train, SingleMask(mask), trainIdx, distance, cc, stream);
                 return;
             }
 
-            calcDistanceDispatcher<Dist>(queryDescs, trainDescs, SingleMask(mask), allDist, stream);
+            calcDistanceDispatcher<Dist>(query, train, SingleMask(mask), allDist, stream);
         }
         else
         {
-            if (knn == 2)
+            if (k == 2)
             {
-                knnMatch2Dispatcher<Dist>(queryDescs, trainDescs, WithOutMask(), (DevMem2D_<int2>)trainIdx, (DevMem2D_<float2>)distance, cc_12, stream);
+                knnMatch2Dispatcher<Dist>(query, train, WithOutMask(), trainIdx, distance, cc, stream);
                 return;
             }
 
-            calcDistanceDispatcher<Dist>(queryDescs, trainDescs, WithOutMask(), allDist, stream);
+            calcDistanceDispatcher<Dist>(query, train, WithOutMask(), allDist, stream);
         }
 
-        findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
+        findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
     }
+    
+    ///////////////////////////////////////////////////////////////////////////////
+    // knn match caller
 
-    template <typename T>
-    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream)
+    template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, 
+        int cc, cudaStream_t stream)
     {
-        knnMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream);
+        knnMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
     }
 
-    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
 
-    template <typename T>
-    void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream)
+    template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
+        int cc, cudaStream_t stream)
     {
-        knnMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream);
+        knnMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
     }
 
-    template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+    template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
 
-    template <typename T>
-    void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream)
+    template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, 
+        int cc, cudaStream_t stream)
     {
-        knnMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream);
+        knnMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
     }
 
-    template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+    template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
 
 ///////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////// Radius Match //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////
-    
-    ///////////////////////////////////////////////////////////////////////////////
-    // Radius Match kernel
 
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    __global__ void radiusMatch(const PtrStep_<T> queryDescs_, const DevMem2D_<T> trainDescs_, 
-        float maxDistance, const Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
+    __global__ void radiusMatch(const PtrStep_<T> query, const DevMem2D_<T> train, float maxDistance, const Mask mask, 
+        DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
     {
-        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
+        #if __CUDA_ARCH__ >= 110
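+        // The kernel body is compiled only for compute capability >= 1.1;
+        // on older devices it is empty.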
 
-        __shared__ typename Dist::ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y];
+        __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
 
-        typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
+        typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
         
         const int queryIdx = blockIdx.x;
-        const T* queryDescs = queryDescs_.ptr(queryIdx);
+        const T* queryDescs = query.ptr(queryIdx);
 
         const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
 
-        if (trainIdx < trainDescs_.rows)
+        if (trainIdx < train.rows)
         {
-            const T* trainDescs = trainDescs_.ptr(trainIdx);
+            const T* trainDescs = train.ptr(trainIdx);
 
             if (mask(queryIdx, trainIdx))
             {
                 Dist dist;
 
-                reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, trainDescs_.cols, dist, sdiff_row);
+                calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);
 
                 if (threadIdx.x == 0)
                 {
@@ -1314,15 +874,14 @@ namespace cv { namespace gpu { namespace bfmatcher
     // Radius Match kernel caller
 
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    void radiusMatch_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, 
-        float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, 
-        const DevMem2Df& distance, cudaStream_t stream)
+    void radiusMatch_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, 
+        const DevMem2Di& trainIdx, const DevMem2D_<unsigned int>& nMatches, const DevMem2Df& distance, 
+        cudaStream_t stream)
     {
-        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
-        dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);
+        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+        const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);
 
-        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(
-            queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);
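+        // The kernel takes a raw device pointer for the match counters, hence
+        // nMatches.data rather than the DevMem2D_<unsigned int> wrapper itself.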
+        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
@@ -1330,82 +889,92 @@ namespace cv { namespace gpu { namespace bfmatcher
     }
     
     ///////////////////////////////////////////////////////////////////////////////
-    // Radius Match caller
+    // Radius Match Dispatcher
 
     template <typename Dist, typename T, typename Mask>
-    void radiusMatchDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, 
-        float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, 
-        const DevMem2Df& distance, cudaStream_t stream)
+    void radiusMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, 
+        cudaStream_t stream)
     {
-        radiusMatch_caller<16, 16, Dist>(queryDescs, trainDescs, maxDistance, mask, 
-            trainIdx, nMatches, distance, stream);
+        radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask, 
+            static_cast<DevMem2Di>(trainIdx), static_cast< const DevMem2D_<unsigned int> >(nMatches), static_cast<DevMem2Df>(distance), 
+            stream);
     }
+    
+    ///////////////////////////////////////////////////////////////////////////////
+    // Radius Match caller
 
-    template <typename T>
-    void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
+    template <typename T> void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, 
+        cudaStream_t stream)
     {
         if (mask.data)
         {
-            radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
-                maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
+            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), 
+                trainIdx, nMatches, distance, 
+                stream);
         }
         else
         {
-            radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
-                maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
+            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), 
+                trainIdx, nMatches, distance, 
+                stream);
         }
     }
 
-    template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
 
-    template <typename T>
-    void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
+    template <typename T> void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, 
+        cudaStream_t stream)
     {
         if (mask.data)
         {
-            radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
-                maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
+            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), 
+                trainIdx, nMatches, distance, 
+                stream);
         }
         else
         {
-            radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
-                maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
+            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), 
+                trainIdx, nMatches, distance, 
+                stream);
         }
     }
 
-    template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
 
-    template <typename T>
-    void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
+    template <typename T> void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, 
+        cudaStream_t stream)
     {
         if (mask.data)
         {
-            radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
-                maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
+            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), 
+                trainIdx, nMatches, distance, 
+                stream);
         }
         else
         {
-            radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
-                maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
+            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), 
+                trainIdx, nMatches, distance, 
+                stream);
         }
     }
 
-    template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
-    template void radiusMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
+    template void radiusMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
 }}}
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
index a22077e2f6..709d62b716 100644
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -566,8 +566,8 @@ namespace cv { namespace gpu { namespace surf
 
                 float* s_sum_row = s_sum + threadIdx.y * 32;
 
-                warpReduce32(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
-                warpReduce32(s_sum_row, sumy, threadIdx.x, plus<volatile float>());
+                reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
+                reduce<32>(s_sum_row, sumy, threadIdx.x, plus<volatile float>());
 
                 const float temp_mod = sumx * sumx + sumy * sumy;
                 if (temp_mod > best_mod)
diff --git a/modules/gpu/src/opencv2/gpu/device/color.hpp b/modules/gpu/src/opencv2/gpu/device/color.hpp
index d620ead481..f6bdde92f7 100644
--- a/modules/gpu/src/opencv2/gpu/device/color.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/color.hpp
@@ -43,7 +43,7 @@
 #ifndef __OPENCV_GPU_COLOR_HPP__
 #define __OPENCV_GPU_COLOR_HPP__
 
-#include "detail/color.hpp"
+#include "detail/color_detail.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
diff --git a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
index 407aea2f93..c8937c1d90 100644
--- a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
@@ -44,7 +44,14 @@
 #define __OPENCV_GPU_DATAMOV_UTILS_HPP__
 
 #include "internal_shared.hpp"
-#include "utility.hpp"
+
+#if defined(_WIN64) || defined(__LP64__)
+    // 64-bit register modifier for inlined asm
+    #define OPENCV_GPU_ASM_PTR "l"
+#else
+    // 32-bit register modifier for inlined asm
+    #define OPENCV_GPU_ASM_PTR "r"
+#endif
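+
+// Illustrative use (an assumption about the intended call sites, not part of this
+// patch): the macro supplies the PTX register constraint for a pointer operand in
+// inline assembly, e.g.
+//
+//   asm("ld.global.ca.u32 %0, [%1];" : "=r"(val) : OPENCV_GPU_ASM_PTR(ptr));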
 
 namespace cv { namespace gpu { namespace device
 {
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/color.hpp b/modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp
similarity index 100%
rename from modules/gpu/src/opencv2/gpu/device/detail/color.hpp
rename to modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp b/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp
similarity index 100%
rename from modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
rename to modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp
new file mode 100644
index 0000000000..f6acce1887
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp
@@ -0,0 +1,186 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
+#define __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
+
+#include "../vec_traits.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace detail
+    {
+        template <bool, typename T1, typename T2> struct Select { typedef T1 type; };
+        template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };
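+
+        // Compile-time if: Select<true, T1, T2>::type is T1,
+        // Select<false, T1, T2>::type is T2.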
+
+        template <typename T> struct IsSignedIntegral { enum {value = 0}; };
+        template <> struct IsSignedIntegral<schar> { enum {value = 1}; };
+        template <> struct IsSignedIntegral<char1> { enum {value = 1}; };
+        template <> struct IsSignedIntegral<short> { enum {value = 1}; };
+        template <> struct IsSignedIntegral<short1> { enum {value = 1}; };
+        template <> struct IsSignedIntegral<int> { enum {value = 1}; };
+        template <> struct IsSignedIntegral<int1> { enum {value = 1}; };
+
+        template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };
+        template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };
+
+        template <typename T> struct IsIntegral { enum {value = IsSignedIntegral<T>::value || IsUnsignedIntegral<T>::value}; };
+        template <> struct IsIntegral<char> { enum {value = 1}; };
+        template <> struct IsIntegral<bool> { enum {value = 1}; };
+
+        template <typename T> struct IsFloat { enum {value = 0}; };
+        template <> struct IsFloat<float> { enum {value = 1}; };
+        template <> struct IsFloat<double> { enum {value = 1}; };
+
+        template <typename T> struct IsVec { enum {value = 0}; };
+        template <> struct IsVec<uchar1> { enum {value = 1}; };
+        template <> struct IsVec<uchar2> { enum {value = 1}; };
+        template <> struct IsVec<uchar3> { enum {value = 1}; };
+        template <> struct IsVec<uchar4> { enum {value = 1}; };
+        template <> struct IsVec<uchar8> { enum {value = 1}; };
+        template <> struct IsVec<char1> { enum {value = 1}; };
+        template <> struct IsVec<char2> { enum {value = 1}; };
+        template <> struct IsVec<char3> { enum {value = 1}; };
+        template <> struct IsVec<char4> { enum {value = 1}; };
+        template <> struct IsVec<char8> { enum {value = 1}; };
+        template <> struct IsVec<ushort1> { enum {value = 1}; };
+        template <> struct IsVec<ushort2> { enum {value = 1}; };
+        template <> struct IsVec<ushort3> { enum {value = 1}; };
+        template <> struct IsVec<ushort4> { enum {value = 1}; };
+        template <> struct IsVec<ushort8> { enum {value = 1}; };
+        template <> struct IsVec<short1> { enum {value = 1}; };
+        template <> struct IsVec<short2> { enum {value = 1}; };
+        template <> struct IsVec<short3> { enum {value = 1}; };
+        template <> struct IsVec<short4> { enum {value = 1}; };
+        template <> struct IsVec<short8> { enum {value = 1}; };
+        template <> struct IsVec<uint1> { enum {value = 1}; };
+        template <> struct IsVec<uint2> { enum {value = 1}; };
+        template <> struct IsVec<uint3> { enum {value = 1}; };
+        template <> struct IsVec<uint4> { enum {value = 1}; };
+        template <> struct IsVec<uint8> { enum {value = 1}; };
+        template <> struct IsVec<int1> { enum {value = 1}; };
+        template <> struct IsVec<int2> { enum {value = 1}; };
+        template <> struct IsVec<int3> { enum {value = 1}; };
+        template <> struct IsVec<int4> { enum {value = 1}; };
+        template <> struct IsVec<int8> { enum {value = 1}; };
+        template <> struct IsVec<float1> { enum {value = 1}; };
+        template <> struct IsVec<float2> { enum {value = 1}; };
+        template <> struct IsVec<float3> { enum {value = 1}; };
+        template <> struct IsVec<float4> { enum {value = 1}; };
+        template <> struct IsVec<float8> { enum {value = 1}; };
+        template <> struct IsVec<double1> { enum {value = 1}; };
+        template <> struct IsVec<double2> { enum {value = 1}; };
+        template <> struct IsVec<double3> { enum {value = 1}; };
+        template <> struct IsVec<double4> { enum {value = 1}; };
+        template <> struct IsVec<double8> { enum {value = 1}; };
+
+        template <class U> struct AddParameterType { typedef const U& type; };
+        template <class U> struct AddParameterType<U&> { typedef U& type; };
+        template <> struct AddParameterType<void> { typedef void type; };
+
+        template <class U> struct ReferenceTraits 
+        {
+            enum { value = false };
+            typedef U type;
+        };        
+        template <class U> struct ReferenceTraits<U&>
+        {
+            enum { value = true };
+            typedef U type;
+        };
+               
+        template <class U> struct PointerTraits
+        {
+            enum { value = false };
+            typedef void type;
+        };        
+        template <class U> struct PointerTraits<U*>
+        {
+            enum { value = true };
+            typedef U type;
+        };        
+        template <class U> struct PointerTraits<U*&>
+        {
+            enum { value = true };
+            typedef U type;
+        };
+         
+        template <class U> struct UnConst
+        {
+            typedef U type;
+            enum { value = 0 };
+        };        
+        template <class U> struct UnConst<const U>
+        {
+            typedef U type;
+            enum { value = 1 };
+        };
+        template <class U> struct UnConst<const U&>
+        {
+            typedef U& type;
+            enum { value = 1 };
+        };
+
+        template <class U> struct UnVolatile
+        {
+            typedef U type;
+            enum { value = 0 };
+        };       
+        template <class U> struct UnVolatile<volatile U>
+        {
+            typedef U type;
+            enum { value = 1 };
+        };
+        template <class U> struct UnVolatile<volatile U&>
+        {
+            typedef U& type;
+            enum { value = 1 };
+        };
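+
+        // These building blocks (Select, Is*, AddParameterType, ReferenceTraits,
+        // PointerTraits, UnConst, UnVolatile) are presumably composed by the public
+        // type_traits.hpp header (included by functional.hpp below) to form
+        // TypeTraits<T>, whose ParameterType member picks an efficient by-value or
+        // by-const-reference parameter type.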
+    }
+}}}
+
+#endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp
new file mode 100644
index 0000000000..de3f3f7287
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp
@@ -0,0 +1,576 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_UTILITY_DETAIL_HPP__
+#define __OPENCV_GPU_UTILITY_DETAIL_HPP__
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace detail
+    {
+        ///////////////////////////////////////////////////////////////////////////////
+        // Reduction
+
+        template <int n> struct WarpReductor
+        {
+            template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+            {
+                if (tid < n)
+                    data[tid] = partial_reduction;                
+                if (n > 32) __syncthreads();
+
+                if (n > 32)
+                {
+                    if (tid < n - 32) 
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
+                    if (tid < 16)
+                    {
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
+                    }
+                }
+                else if (n > 16)
+                {
+                    if (tid < n - 16) 
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
+                    if (tid < 8)
+                    {
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
+                    }
+                }
+                else if (n > 8)
+                {
+                    if (tid < n - 8) 
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
+                    if (tid < 4)
+                    {
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
+                    }
+                }
+                else if (n > 4)
+                {
+                    if (tid < n - 4) 
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
+                    if (tid < 2)
+                    {
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
+                    }
+                }   
+                else if (n > 2)
+                {
+                    if (tid < n - 2) 
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
+                    if (tid < 2)
+                    {
+                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
+                    }
+                }      
+            }
+        };
+        template <> struct WarpReductor<64>
+        {
+            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+            {
+                data[tid] = partial_reduction;
+                __syncthreads();
+                
+                if (tid < 32) 
+                {
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); 
+                }
+            }
+        };
+        template <> struct WarpReductor<32>
+        {
+            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+            {
+                data[tid] = partial_reduction;
+                
+                if (tid < 16) 
+                {
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); 
+                }
+            }
+        };
+        template <> struct WarpReductor<16>
+        {
+            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+            {
+                data[tid] = partial_reduction;
+                
+                if (tid < 8) 
+                {
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); 
+                }
+            }
+        };
+        template <> struct WarpReductor<8>
+        {
+            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+            {
+                data[tid] = partial_reduction;
+                
+                if (tid < 4) 
+                {
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); 
+                }
+            }
+        };
+
+        template <bool warp> struct ReductionDispatcher;
+        template <> struct ReductionDispatcher<true>
+        {
+            template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+            {
+                WarpReductor<n>::reduce(data, partial_reduction, tid, op);
+            }
+        };
+        template <> struct ReductionDispatcher<false>
+        {
+            template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+            {
+                if (tid < n)
+                    data[tid] = partial_reduction;
+                __syncthreads();
+
+                if (n >= 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); }
+                if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); }
+                if (n >= 128) { if (tid <  64) { data[tid] = partial_reduction = op(partial_reduction, data[tid +  64]); } __syncthreads(); }
+
+                if (tid < 32)
+                {
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
+                }
+            }
+        };
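+
+        // A public reduce<n>() helper (presumably declared in utility.hpp) is expected
+        // to route here: warp-sized n takes the synchronization-free WarpReductor path,
+        // larger n the block-wide path above. Usage elsewhere in this patch, e.g. in
+        // surf.cu:
+        //   reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>());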
+
+        template <int n> struct PredValWarpReductor;
+        template <> struct PredValWarpReductor<64>
+        {
+            template <typename T, typename V, typename Pred> 
+            static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
+            {
+                if (tid < 32)
+                {
+                    myData = sdata[tid];
+                    myVal = sval[tid];
+
+                    T reg = sdata[tid + 32];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 32];
+                    }
+
+                    reg = sdata[tid + 16];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 16];
+                    }
+
+                    reg = sdata[tid + 8];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 8];
+                    }
+
+                    reg = sdata[tid + 4];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 4];
+                    }
+                
+                    reg = sdata[tid + 2];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 2];
+                    }
+                
+                    reg = sdata[tid + 1];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 1];
+                    }
+                }
+            }
+        };
+        template <> struct PredValWarpReductor<32>
+        {
+            template <typename T, typename V, typename Pred> 
+            static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
+            {
+                if (tid < 16)
+                {
+                    myData = sdata[tid];
+                    myVal = sval[tid];
+
+                    T reg = sdata[tid + 16];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 16];
+                    }
+
+                    reg = sdata[tid + 8];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 8];
+                    }
+
+                    reg = sdata[tid + 4];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 4];
+                    }
+                
+                    reg = sdata[tid + 2];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 2];
+                    }
+                
+                    reg = sdata[tid + 1];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 1];
+                    }
+                }
+            }
+        };
+
+        template <> struct PredValWarpReductor<16>
+        {
+            template <typename T, typename V, typename Pred> 
+            static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
+            {
+                if (tid < 8)
+                {
+                    myData = sdata[tid];
+                    myVal = sval[tid];
+
+                    T reg = sdata[tid + 8];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 8];
+                    }
+
+                    reg = sdata[tid + 4];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 4];
+                    }
+                
+                    reg = sdata[tid + 2];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 2];
+                    }
+                
+                    reg = sdata[tid + 1];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 1];
+                    }
+                }
+            }
+        };
+        template <> struct PredValWarpReductor<8>
+        {
+            template <typename T, typename V, typename Pred> 
+            static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
+            {
+                if (tid < 4)
+                {
+                    myData = sdata[tid];
+                    myVal = sval[tid];
+
+                    T reg = sdata[tid + 4];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 4];
+                    }
+                
+                    reg = sdata[tid + 2];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 2];
+                    }
+                
+                    reg = sdata[tid + 1];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 1];
+                    }
+                }
+            }
+        };
+
+        template <bool warp> struct PredValReductionDispatcher;
+        template <> struct PredValReductionDispatcher<true>
+        {
+            template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
+            {
+                PredValWarpReductor<n>::reduce(myData, myVal, sdata, sval, tid, pred);
+            }
+        };
+        template <> struct PredValReductionDispatcher<false>
+        {
+            template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
+            {
+                myData = sdata[tid];
+                myVal = sval[tid];
+
+                if (n >= 512 && tid < 256) 
+                {
+                    T reg = sdata[tid + 256];
+
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 256];
+                    }
+                    __syncthreads(); 
+                }
+                if (n >= 256 && tid < 128) 
+                {
+                    T reg = sdata[tid + 128];
+
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 128];
+                    }
+                    __syncthreads(); 
+                }
+                if (n >= 128 && tid < 64) 
+                {
+                    T reg = sdata[tid + 64];
+
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval[tid] = myVal = sval[tid + 64];
+                    }
+                    __syncthreads(); 
+                }        
+
+                if (tid < 32)
+                {
+                    if (n >= 64) 
+                    { 
+                        T reg = sdata[tid + 32];
+
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval[tid] = myVal = sval[tid + 32];
+                        }
+                    }
+                    if (n >= 32) 
+                    { 
+                        T reg = sdata[tid + 16];
+
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval[tid] = myVal = sval[tid + 16];
+                        }
+                    }
+                    if (n >= 16) 
+                    { 
+                        T reg = sdata[tid + 8];
+
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval[tid] = myVal = sval[tid + 8];
+                        }
+                    }
+                    if (n >= 8) 
+                    { 
+                        T reg = sdata[tid + 4];
+
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval[tid] = myVal = sval[tid + 4];
+                        }
+                    }
+                    if (n >= 4) 
+                    { 
+                        T reg = sdata[tid + 2];
+
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval[tid] = myVal = sval[tid + 2];
+                        } 
+                    }
+                    if (n >= 2) 
+                    { 
+                        T reg = sdata[tid + 1];
+
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval[tid] = myVal = sval[tid + 1];
+                        }
+                    }
+                }
+            }
+        };
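+
+        // The PredVal variants perform an argmin/argmax-style reduction: sdata holds
+        // the keys compared with pred, while sval carries a payload (e.g. an index)
+        // that moves together with the winning key.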
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // Vector Distance
+
+        template <int THREAD_DIM, int N> struct UnrollVecDiffCached
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
+            {
+                if (ind < len)
+                {
+                    T1 val1 = *vecCached++;
+
+                    T2 val2;
+                    ForceGlob<T2>::Load(vecGlob, ind, val2);
+
+                    dist.reduceIter(val1, val2);
+
+                    UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
+                }
+            }
+
+            template <typename Dist, typename T1, typename T2>
+            static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
+            {
+                T1 val1 = *vecCached++;
+
+                T2 val2;
+                ForceGlob<T2>::Load(vecGlob, 0, val2);
+                vecGlob += THREAD_DIM;
+
+                dist.reduceIter(val1, val2);
+
+                UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
+            }
+        };
+        template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
+            {
+            }
+
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
+            {
+            }
+        };
+
+        template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
+        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
+            {
+                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
+            }
+        };
+        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
+            {
+                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
+            }
+        };
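+
+        // UnrollVecDiffCached walks the descriptor with stride THREAD_DIM, fully
+        // unrolled at compile time (MAX_LEN / THREAD_DIM steps). When the caller can
+        // guarantee len == MAX_LEN, the LEN_EQ_MAX_LEN == true specialization of
+        // VecDiffCachedCalculator skips the per-element bounds check.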
+    }
+}}}
+
+#endif // __OPENCV_GPU_UTILITY_DETAIL_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/emulation.hpp b/modules/gpu/src/opencv2/gpu/device/emulation.hpp
index 151c03a53b..f9c8d81cf5 100644
--- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp
@@ -55,7 +55,7 @@ namespace cv
 			{
 #if __CUDA_ARCH__ >= 200
 				(void)cta_buffer;
-				return __ballot(predicat);
+				return __ballot(predicate);
 #else
 				int tid = threadIdx.x;				
 				cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
diff --git a/modules/gpu/src/opencv2/gpu/device/functional.hpp b/modules/gpu/src/opencv2/gpu/device/functional.hpp
index be3ea7d278..58af91d9dc 100644
--- a/modules/gpu/src/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp
@@ -47,6 +47,7 @@
 #include "internal_shared.hpp"
 #include "saturate_cast.hpp"
 #include "vec_traits.hpp"
+#include "type_traits.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
@@ -57,55 +58,188 @@ namespace cv { namespace gpu { namespace device
 
     // Arithmetic Operations
 
-    using thrust::plus;
-    using thrust::minus;
-    using thrust::multiplies;
-    using thrust::divides;
-    using thrust::modulus;
-    using thrust::negate;
+    template <typename T> struct plus : binary_function<T, T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a + b;
+        }
+    };
+    template <typename T> struct minus : binary_function<T, T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a - b;
+        }
+    };
+    template <typename T> struct multiplies : binary_function<T, T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a * b;
+        }
+    };
+    template <typename T> struct divides : binary_function<T, T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a / b;
+        }
+    };
+    template <typename T> struct modulus : binary_function<T, T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a % b;
+        }
+    };
+    template <typename T> struct negate : unary_function<T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const
+        {
+            return -a;
+        }
+    };
 
     // Comparison Operations
     
-    using thrust::equal_to;
-    using thrust::not_equal_to;
-    using thrust::greater;
-    using thrust::less;
-    using thrust::greater_equal;
-    using thrust::less_equal;
+    template <typename T> struct equal_to : binary_function<T, T, bool>
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a == b;
+        }
+    };
+    template <typename T> struct not_equal_to : binary_function<T, T, bool>
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a != b;
+        }
+    };
+    template <typename T> struct greater : binary_function<T, T, bool>
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a > b;
+        }
+    };
+    template <typename T> struct less : binary_function<T, T, bool>
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a < b;
+        }
+    };
+    template <typename T> struct greater_equal : binary_function<T, T, bool>
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a >= b;
+        }
+    };
+    template <typename T> struct less_equal : binary_function<T, T, bool>
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a <= b;
+        }
+    };
 
     // Logical Operations
     
-    using thrust::logical_and;
-    using thrust::logical_or;
-    using thrust::logical_not;
+    template <typename T> struct logical_and : binary_function<T, T, bool>
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a && b;
+        }
+    };
+    template <typename T> struct logical_or : binary_function<T, T, bool>
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a || b;
+        }
+    };
+    template <typename T> struct logical_not : unary_function<T, bool>
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const
+        {
+            return !a;
+        }
+    };
 
     // Bitwise Operations
 
-    using thrust::bit_and;
-    using thrust::bit_or;
-    using thrust::bit_xor;
+    template <typename T> struct bit_and : binary_function<T, T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a & b;
+        }
+    };
+    template <typename T> struct bit_or : binary_function<T, T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a | b;
+        }
+    };
+    template <typename T> struct bit_xor : binary_function<T, T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return a ^ b;
+        }
+    };
     template <typename T> struct bit_not : unary_function<T, T>
     {
-        __forceinline__ __device__ T operator ()(const T& v) const {return ~v;}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const 
+        {
+            return ~v;
+        }
     };
 
     // Generalized Identity Operations
 
-    using thrust::identity;    
-    using thrust::project1st;
-    using thrust::project2nd;
+    template <typename T> struct identity : unary_function<T, T>
+    {
+        __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const 
+        {
+            return x;
+        }
+    };
+
+    template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
+    {
+        __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const 
+        {
+            return lhs;
+        }
+    };
+    template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
+    {
+        __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const 
+        {
+            return rhs;
+        }
+    };
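+
+    // The functors above deliberately mirror the thrust ones they replace, but are
+    // __device__-qualified and take TypeTraits<T>::ParameterType, so that small types
+    // pass by value and larger ones by const reference (an assumption about TypeTraits,
+    // whose definition lives in type_traits.hpp rather than in this hunk).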
 
     // Min/Max Operations
 
 #define OPENCV_GPU_IMPLEMENT_MINMAX(name, type, op) \
     template <> struct name<type> : binary_function<type, type, type> \
     { \
-        __forceinline__ __device__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
+        __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
     };
 
     template <typename T> struct maximum : binary_function<T, T, T>
     {
-        __forceinline__ __device__ T operator()(const T& lhs, const T& rhs) const {return lhs < rhs ? rhs : lhs;}
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const 
+        {
+            return lhs < rhs ? rhs : lhs;
+        }
     };
     OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, max)
     OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, max)
@@ -119,7 +253,10 @@ namespace cv { namespace gpu { namespace device
 
     template <typename T> struct minimum : binary_function<T, T, T>
     {
-        __forceinline__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? lhs : rhs;}
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const 
+        {
+            return lhs < rhs ? lhs : rhs;
+        }
     };
     OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, min)
     OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, min)
@@ -138,14 +275,14 @@ namespace cv { namespace gpu { namespace device
 #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(func) \
     template <typename T> struct func ## _func : unary_function<T, float> \
     { \
-        __forceinline__ __device__ float operator ()(const T& v) const \
+        __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v) const \
         { \
             return func ## f(v); \
         } \
     }; \
     template <> struct func ## _func<double> : unary_function<double, double> \
     { \
-        __forceinline__ __device__ double operator ()(double v) const \
+        __device__ __forceinline__ double operator ()(double v) const \
         { \
             return func(v); \
         } \
@@ -153,14 +290,14 @@ namespace cv { namespace gpu { namespace device
 #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(func) \
     template <typename T> struct func ## _func : binary_function<T, T, float> \
     { \
-        __forceinline__ __device__ float operator ()(const T& v1, const T& v2) const \
+        __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v1, typename TypeTraits<T>::ParameterType v2) const \
         { \
             return func ## f(v1, v2); \
         } \
     }; \
     template <> struct func ## _func<double> : binary_function<double, double, double> \
     { \
-        __forceinline__ __device__ double operator ()(double v1, double v2) const \
+        __device__ __forceinline__ double operator ()(double v1, double v2) const \
         { \
             return func(v1, v2); \
         } \
@@ -196,7 +333,7 @@ namespace cv { namespace gpu { namespace device
 
     template<typename T> struct hypot_sqr_func : binary_function<T, T, float> 
     {
-        __forceinline__ __device__ T operator ()(T src1, T src2) const
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
         {
             return src1 * src1 + src2 * src2;
         }
@@ -206,7 +343,7 @@ namespace cv { namespace gpu { namespace device
 
     template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
     {
-        __forceinline__ __device__ D operator ()(const T& v) const
+        __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const
         {
             return saturate_cast<D>(v);
         }
@@ -216,11 +353,11 @@ namespace cv { namespace gpu { namespace device
 
     template <typename T> struct thresh_binary_func : unary_function<T, T>
     {
-        __forceinline__ __host__ __device__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
+        __host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
 
-        __forceinline__ __device__ T operator()(const T& src) const
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
         {
-            return src > thresh ? maxVal : 0;
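+            // branchless form: the comparison contributes 0 or 1, so the multiply
+            // reproduces the old ternary (the other threshold functors below follow suit)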
+            return (src > thresh) * maxVal;
         }
 
         const T thresh;
@@ -228,11 +365,11 @@ namespace cv { namespace gpu { namespace device
     };
     template <typename T> struct thresh_binary_inv_func : unary_function<T, T>
     {
-        __forceinline__ __host__ __device__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
+        __host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
 
-        __forceinline__ __device__ T operator()(const T& src) const
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
         {
-            return src > thresh ? 0 : maxVal;
+            return (src <= thresh) * maxVal;
         }
 
         const T thresh;
@@ -240,9 +377,9 @@ namespace cv { namespace gpu { namespace device
     };
     template <typename T> struct thresh_trunc_func : unary_function<T, T>
     {
-        explicit __forceinline__ __host__ __device__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
+        explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
 
-        __forceinline__ __device__ T operator()(const T& src) const
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
         {
             return minimum<T>()(src, thresh);
         }
@@ -251,22 +388,22 @@ namespace cv { namespace gpu { namespace device
     };
     template <typename T> struct thresh_to_zero_func : unary_function<T, T>
     {
-        explicit __forceinline__ __host__ __device__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
+        explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
 
-        __forceinline__ __device__ T operator()(const T& src) const
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
         {
-            return src > thresh ? src : 0;
+            return (src > thresh) * src;
         }
 
         const T thresh;
     };
     template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>
     {
-        explicit __forceinline__ __host__ __device__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
+        explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
 
-        __forceinline__ __device__ T operator()(const T& src) const
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
         {
-            return src > thresh ? 0 : src;
+            return (src <= thresh) * src;
         }
 
         const T thresh;
@@ -274,17 +411,43 @@ namespace cv { namespace gpu { namespace device
 
     // Function Object Adaptors
 
-    using thrust::unary_negate;
-    using thrust::not1;
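+    // device-side replacements for the thrust negators: same semantics as
+    // thrust::unary_negate/not1 and binary_negate/not2, but with arguments passed
+    // via TypeTraits<...>::ParameterType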
+    template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
+    {
+      explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}
+
+      __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
+      { 
+          return !pred(x); 
+      }
+
+      const Predicate pred;
+    };
+    template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
+    {
+        return unary_negate<Predicate>(pred);
+    }
 
-    using thrust::binary_negate;
-    using thrust::not2;
+    template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>
+    {
+        explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}
+
+        __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
+        { 
+            return !pred(x,y); 
+        }
+
+        const Predicate pred;
+    };
+    template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
+    {
+        return binary_negate<BinaryPredicate>(pred);
+    }
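+
+    // usage sketch: for any device predicate pred, not1(pred)(x) == !pred(x)
+    // and not2(pred)(x, y) == !pred(x, y)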
 
     template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> 
     {
-        __forceinline__ __host__ __device__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
+        __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
 
-        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::second_argument_type& a) const
+        __device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const
         {
             return op(arg1, a);
         }
@@ -292,15 +455,16 @@ namespace cv { namespace gpu { namespace device
         const Op op;
         const typename Op::first_argument_type arg1;
     };
-    template <typename Op, typename T> static __forceinline__ __host__ __device__ binder1st<Op> bind1st(const Op& op, const T& x)
+    template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
     {
         return binder1st<Op>(op, typename Op::first_argument_type(x));
     }
+
     template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type> 
     {
-        __forceinline__ __host__ __device__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
+        __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
 
-        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::first_argument_type& a) const
+        __device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const
         {
             return op(a, arg2);
         }
@@ -308,7 +472,7 @@ namespace cv { namespace gpu { namespace device
         const Op op;
         const typename Op::second_argument_type arg2;
     };
-    template <typename Op, typename T> static __forceinline__ __host__ __device__ binder2nd<Op> bind2nd(const Op& op, const T& x)
+    template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
     {
         return binder2nd<Op>(op, typename Op::second_argument_type(x));
     }
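+
+    // usage sketch: bind2nd(minimum<float>(), t) is a unary functor computing
+    // min(x, t); bind1st fixes the first argument instead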
@@ -317,24 +481,28 @@ namespace cv { namespace gpu { namespace device
 
     template <typename F> struct IsUnaryFunction
     {
-        struct Yes {};
+        typedef char Yes;
         struct No {Yes a[2];};
 
-        template <typename T, typename D> static Yes check(unary_function<T, D>*);
+        template <typename T, typename D> static Yes check(unary_function<T, D>);
         static No check(...);
 
-        enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
+        static F makeF();
+
+        enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
     };
 
     template <typename F> struct IsBinaryFunction
     {
-        struct Yes {};
+        typedef char Yes;
         struct No {Yes a[2];};
 
-        template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>*);
+        template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>);
         static No check(...);
 
-        enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
+        static F makeF();
+
+        enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
     };
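+
+    // makeF() is only named inside sizeof(), so it never needs a definition: each
+    // trait now asks whether check() accepts an F by value, i.e. whether F is
+    // convertible to the corresponding unary_function/binary_function base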
 
     namespace detail
diff --git a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
index 55c9cb99b8..ca6159f3fd 100644
--- a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
@@ -47,29 +47,29 @@
 
 namespace cv { namespace gpu { namespace device
 {
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
 
-    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
     { return (uchar)max((int)v, 0); }
-    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
     { return (uchar)min((uint)v, (uint)UCHAR_MAX); }
-    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
     { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
-    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
     { return (uchar)min(v, (uint)UCHAR_MAX); }
-    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
     { return saturate_cast<uchar>((uint)v); }
 
-    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
     { int iv = __float2int_rn(v); return saturate_cast<uchar>(iv); }
-    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
     {
     #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
         int iv = __double2int_rn(v); return saturate_cast<uchar>(iv);
@@ -78,23 +78,23 @@ namespace cv { namespace gpu { namespace device
     #endif
     }
 
-    template<> static __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
     { return (schar)min((int)v, SCHAR_MAX); }
-    template<> static __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
     { return (schar)min((uint)v, (uint)SCHAR_MAX); }
-    template<> static __device__ __forceinline__ schar saturate_cast<schar>(int v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
     {
         return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ?
                     v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
     }
-    template<> static __device__ __forceinline__ schar saturate_cast<schar>(short v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
     { return saturate_cast<schar>((int)v); }
-    template<> static __device__ __forceinline__ schar saturate_cast<schar>(uint v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
     { return (schar)min(v, (uint)SCHAR_MAX); }
 
-    template<> static __device__ __forceinline__ schar saturate_cast<schar>(float v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
     { int iv = __float2int_rn(v); return saturate_cast<schar>(iv); }
-    template<> static __device__ __forceinline__ schar saturate_cast<schar>(double v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
     {             
     #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
         int iv = __double2int_rn(v); return saturate_cast<schar>(iv);
@@ -103,17 +103,17 @@ namespace cv { namespace gpu { namespace device
     #endif
     }
 
-    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
     { return (ushort)max((int)v, 0); }
-    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
     { return (ushort)max((int)v, 0); }
-    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
     { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
-    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
     { return (ushort)min(v, (uint)USHRT_MAX); }
-    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
     { int iv = __float2int_rn(v); return saturate_cast<ushort>(iv); }
-    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
     {             
     #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
         int iv = __double2int_rn(v); return saturate_cast<ushort>(iv);
@@ -122,18 +122,18 @@ namespace cv { namespace gpu { namespace device
     #endif
     }
 
-    template<> static __device__ __forceinline__ short saturate_cast<short>(ushort v)
+    template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
     { return (short)min((int)v, SHRT_MAX); }
-    template<> static __device__ __forceinline__ short saturate_cast<short>(int v)
+    template<> __device__ __forceinline__ short saturate_cast<short>(int v)
     {
         return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ?
                 v : v > 0 ? SHRT_MAX : SHRT_MIN);
     }
-    template<> static __device__ __forceinline__ short saturate_cast<short>(uint v)
+    template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
     { return (short)min(v, (uint)SHRT_MAX); }
-    template<> static __device__ __forceinline__ short saturate_cast<short>(float v)
+    template<> __device__ __forceinline__ short saturate_cast<short>(float v)
     { int iv = __float2int_rn(v); return saturate_cast<short>(iv); }
-    template<> static __device__ __forceinline__ short saturate_cast<short>(double v)
+    template<> __device__ __forceinline__ short saturate_cast<short>(double v)
     {            
     #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
         int iv = __double2int_rn(v); return saturate_cast<short>(iv);
@@ -142,8 +142,8 @@ namespace cv { namespace gpu { namespace device
     #endif
     }
 
-    template<> static __device__ __forceinline__ int saturate_cast<int>(float v) { return __float2int_rn(v); }
-    template<> static __device__ __forceinline__ int saturate_cast<int>(double v) 
+    template<> __device__ __forceinline__ int saturate_cast<int>(float v) { return __float2int_rn(v); }
+    template<> __device__ __forceinline__ int saturate_cast<int>(double v) 
     {
     #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 
         return __double2int_rn(v);
@@ -152,8 +152,8 @@ namespace cv { namespace gpu { namespace device
     #endif
     }
 
-    template<> static __device__ __forceinline__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); }
-    template<> static __device__ __forceinline__ uint saturate_cast<uint>(double v) 
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(double v) 
     {            
     #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
         return __double2uint_rn(v);
diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp
index 4f756e36da..92d5065066 100644
--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
@@ -43,33 +43,31 @@
 #ifndef __OPENCV_GPU_TRANSFORM_HPP__
 #define __OPENCV_GPU_TRANSFORM_HPP__
 
-#include "detail/transform.hpp"
+#include "detail/transform_detail.hpp"
+#include "utility.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
     template <typename T, typename D, typename UnOp>
-    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)
+    void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)
     {
-        detail::transform_caller(src, dst, op, detail::NoMask(), stream);
+        detail::transform_caller(src, dst, op, WithOutMask(), stream);
     }
     template <typename T, typename D, typename UnOp>
-    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStep& mask, const UnOp& op, 
-        cudaStream_t stream = 0)
+    void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStep& mask, const UnOp& op, cudaStream_t stream = 0)
     {
-        detail::transform_caller(src, dst, op, detail::MaskReader(mask), stream);
+        detail::transform_caller(src, dst, op, SingleMask(mask), stream);
     }
 
     template <typename T1, typename T2, typename D, typename BinOp>
-    static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
-        const BinOp& op, cudaStream_t stream = 0)
+    void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, cudaStream_t stream = 0)
     {
-        detail::transform_caller(src1, src2, dst, op, detail::NoMask(), stream);
+        detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream);
     }
     template <typename T1, typename T2, typename D, typename BinOp>
-    static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
-        const PtrStep& mask, const BinOp& op, cudaStream_t stream = 0)
+    void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const PtrStep& mask, const BinOp& op, cudaStream_t stream = 0)
     {
-        detail::transform_caller(src1, src2, dst, op, detail::MaskReader(mask), stream);
+        detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream);
     }
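+
+    // usage sketch (functors from functional.hpp, pulled in via utility.hpp):
+    //   transform(src, dst, bind2nd(minimum<float>(), 1.0f), stream);
+    // clamps every float element of src to 1.0f and writes the result into dst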
 }}}
 
diff --git a/modules/gpu/src/opencv2/gpu/device/type_traits.hpp b/modules/gpu/src/opencv2/gpu/device/type_traits.hpp
new file mode 100644
index 0000000000..24f02ef019
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/type_traits.hpp
@@ -0,0 +1,80 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_TYPE_TRAITS_HPP__
+#define __OPENCV_GPU_TYPE_TRAITS_HPP__
+
+#include "detail/type_traits_detail.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    template <typename T> struct IsSimpleParameter
+    {
+        enum {value = detail::IsIntegral<T>::value || detail::IsFloat<T>::value || detail::PointerTraits<typename detail::ReferenceTraits<T>::type>::value};
+    };
+
+    template <typename T> struct TypeTraits
+    {
+        typedef typename detail::UnConst<T>::type                                       NonConstType;
+        typedef typename detail::UnVolatile<T>::type                                    NonVolatileType;
+        typedef typename detail::UnVolatile<typename detail::UnConst<T>::type>::type    UnqualifiedType;
+        typedef typename detail::PointerTraits<UnqualifiedType>::type                   PointeeType;
+        typedef typename detail::ReferenceTraits<T>::type                               ReferredType;
+
+        enum { isConst          = detail::UnConst<T>::value };
+        enum { isVolatile       = detail::UnVolatile<T>::value };
+
+        enum { isReference      = detail::ReferenceTraits<UnqualifiedType>::value };
+        enum { isPointer        = detail::PointerTraits<typename detail::ReferenceTraits<UnqualifiedType>::type>::value };        
+
+        enum { isUnsignedInt = detail::IsUnsignedIntegral<UnqualifiedType>::value };
+        enum { isSignedInt   = detail::IsSignedIntergral<UnqualifiedType>::value };
+        enum { isIntegral    = detail::IsIntegral<UnqualifiedType>::value };
+        enum { isFloat       = detail::IsFloat<UnqualifiedType>::value  };
+        enum { isArith       = isIntegral || isFloat };
+        enum { isVec         = detail::IsVec<UnqualifiedType>::value  };
+        
+        typedef typename detail::Select<IsSimpleParameter<UnqualifiedType>::value, T, typename detail::AddParameterType<T>::type>::type ParameterType;
+    };
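+
+    // ParameterType picks the cheapest way to pass a T into a __device__ function:
+    // integral, floating-point and pointer types travel by value, everything else
+    // goes through detail::AddParameterType (presumably a const reference).
+    // Sketch of the intended use, with a hypothetical functor:
+    //
+    //   template <typename T> struct twice_func
+    //   {
+    //       __device__ T operator()(typename TypeTraits<T>::ParameterType v) const { return v + v; }
+    //   };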
+}}}
+
+#endif // __OPENCV_GPU_TYPE_TRAITS_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp
index b0dca8afeb..3fd84a0a02 100644
--- a/modules/gpu/src/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp
@@ -45,112 +45,275 @@
 
 #include "internal_shared.hpp"
 #include "saturate_cast.hpp"
-
-#ifndef __CUDA_ARCH__
-	#define __CUDA_ARCH__ 0
-#endif
+#include "datamov_utils.hpp"
+#include "functional.hpp"
+#include "detail/utility_detail.hpp"
 
 #define OPENCV_GPU_LOG_WARP_SIZE	    (5)
 #define OPENCV_GPU_WARP_SIZE	        (1 << OPENCV_GPU_LOG_WARP_SIZE)
 #define OPENCV_GPU_LOG_MEM_BANKS        ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on Fermi, 16 on Tesla
 #define OPENCV_GPU_MEM_BANKS            (1 << OPENCV_GPU_LOG_MEM_BANKS)
 
-#if defined(_WIN64) || defined(__LP64__)		
-    // 64-bit register modifier for inlined asm
-    #define OPENCV_GPU_ASM_PTR "l"
-#else	
-    // 32-bit register modifier for inlined asm
-    #define OPENCV_GPU_ASM_PTR "r"
-#endif
-
 namespace cv {  namespace gpu { namespace device
 {
-    template <typename T> void __host__ __device__ __forceinline__ swap(T& a, T& b) 
+    ///////////////////////////////////////////////////////////////////////////////
+    // swap
+
+    template <typename T> __device__ __forceinline__ void swap(T& a, T& b) 
     {
         const T temp = a;
         a = b;
         b = temp;
     }
 
-    // warp-synchronous 32 elements reduction
-    template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, const Op& op)
+    ///////////////////////////////////////////////////////////////////////////////
+    // Mask Reader
+
+    struct SingleMask
     {
-        data[tid] = partial_reduction;
+        explicit __host__ __device__ __forceinline__ SingleMask(const PtrStep& mask_) : mask(mask_) {}
+        
+        __device__ __forceinline__ bool operator()(int y, int x) const
+        {            
+            return mask.ptr(y)[x] != 0;
+        }
+
+        const PtrStep mask;
+    };
 
-        if (tid < 16)
+    struct MaskCollection
+    {
+        explicit __host__ __device__ __forceinline__ MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {}
+
+        __device__ __forceinline__ void next()
         {
-            data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
-            data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
-            data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
-            data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
-            data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
+            curMask = *maskCollection++;
+        }
+        __device__ __forceinline__ void setMask(int z)
+        {
+            curMask = maskCollection[z];
+        }
+        
+        __device__ __forceinline__ bool operator()(int y, int x) const
+        {
+            uchar val;
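+            // an empty mask (data == 0) accepts every element; otherwise the byte is
+            // loaded through ForceGlob, the comma operator keeping the load and the
+            // test inside a single expression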
+            return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));
         }
-    }
 
-    // warp-synchronous 16 elements reduction
-    template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, const Op& op)
+        const PtrStep* maskCollection;
+        PtrStep curMask;
+    };
+
+    struct WithOutMask
     {
-        data[tid] = partial_reduction;
+        __device__ __forceinline__ void next() const
+        {
+        }
+        __device__ __forceinline__ void setMask(int) const
+        {
+        }
 
-        if (tid < 8)
+        __device__ __forceinline__ bool operator()(int, int) const
         {
-            data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 8 ]);
-            data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 4 ]);
-            data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 2 ]);
-            data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 1 ]);
+            return true;
         }
+    };
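+
+    // WithOutMask mirrors MaskCollection's next()/setMask() hooks as no-ops and always
+    // answers true, so kernels templated on the mask policy pay nothing in the
+    // unmasked case; all three policies are queried through operator()(y, x)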
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Reduction
+
+    // block-wide reduction of n per-thread partial values staged in shared memory
+    template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+    {
+        StaticAssert<n >= 8 && n <= 512>::check();
+        detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
+    }
+
+    template <int n, typename T, typename V, typename Pred> 
+    __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
+    {
+        StaticAssert<n >= 8 && n <= 512>::check();
+        detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
     }
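+
+    // typical pattern: each thread accumulates a private partial value, then
+    //   reduce<BLOCK_SIZE>(sdata, myVal, tid, plus<volatile float>());
+    // merges the partials through the shared buffer, leaving the block-wide result
+    // in sdata[0], as the warp-synchronous reductions this replaces did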
 
-    // warp-synchronous reduction
-    template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+    ///////////////////////////////////////////////////////////////////////////////
+    // Vector Distance
+
+    template <typename T> struct L1Dist
+    {
+        typedef int value_type;
+        typedef int result_type;
+
+        __device__ __forceinline__ L1Dist() : mySum(0) {}
+
+        __device__ __forceinline__ void reduceIter(int val1, int val2)
+        {
+            mySum = __sad(val1, val2, mySum);
+        }
+
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+        }
+
+        __device__ __forceinline__ operator int() const
+        {
+            return mySum;
+        }
+
+        int mySum;
+    };
+    template <> struct L1Dist<float>
+    {
+        typedef float value_type;
+        typedef float result_type;
+
+        __device__ __forceinline__ L1Dist() : mySum(0.0f) {}
+
+        __device__ __forceinline__ void reduceIter(float val1, float val2)
+        {
+            mySum += ::fabs(val1 - val2);
+        }
+
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+        }
+
+        __device__ __forceinline__ operator float() const
+        {
+            return mySum;
+        }
+
+        float mySum;
+    };
+
+    struct L2Dist
+    {
+        typedef float value_type;
+        typedef float result_type;
+
+        __device__ __forceinline__ L2Dist() : mySum(0.0f) {}
+
+        __device__ __forceinline__ void reduceIter(float val1, float val2)
+        {
+            float reg = val1 - val2;
+            mySum += reg * reg;
+        }
+
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+        }
+
+        __device__ __forceinline__ operator float() const
+        {
+            return sqrtf(mySum);
+        }
+
+        float mySum;
+    };
+
+    struct HammingDist
+    {
+        typedef int value_type;
+        typedef int result_type;
+
+        __device__ __forceinline__ HammingDist() : mySum(0) {}
+
+        __device__ __forceinline__ void reduceIter(int val1, int val2)
+        {
+            mySum += __popc(val1 ^ val2);
+        }
+
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+        }
+
+        __device__ __forceinline__ operator int() const
+        {
+            return mySum;
+        }
+
+        int mySum;
+    };
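+
+    // all distance functors share one protocol: reduceIter() folds a pair of elements
+    // into the per-thread sum, reduceAll<THREAD_DIM>() merges the sums across the
+    // block, and the conversion operator yields the result (only L2Dist does extra
+    // work there, applying the final sqrtf)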
+
+    // calc distance between two vectors in global memory
+    template <int THREAD_DIM, typename Dist, typename T1, typename T2> 
+    __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
     {
-        if (tid < n)
-            data[tid] = partial_reduction;
-
-        if (n > 16)
-        {
-            if (tid < n - 16) 
-                data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
-            if (tid < 8)
-            {
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
-            }
-        }
-        else if (n > 8)
-        {
-            if (tid < n - 8) 
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
-            if (tid < 4)
-            {
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
-            }
-        }
-        else if (n > 4)
-        {
-            if (tid < n - 4) 
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
-            if (tid < 2)
-            {
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
-            }
-        }   
-        else if (n > 2)
-        {
-            if (tid < n - 2) 
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
-            if (tid < 2)
-            {
-                data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
-            }
-        }      
+        for (int i = tid; i < len; i += THREAD_DIM)
+        {
+            T1 val1;
+            ForceGlob<T1>::Load(vec1, i, val1);
+
+            T2 val2;
+            ForceGlob<T2>::Load(vec2, i, val2);
+
+            dist.reduceIter(val1, val2);
+        }
+
+        dist.reduceAll<THREAD_DIM>(smem, tid);
     }
 
+    // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
+    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
+    __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
+    {        
+        detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
+        
+        dist.reduceAll<THREAD_DIM>(smem, tid);
+    }
+
+    // calc distance between two vectors in global memory
+    template <int THREAD_DIM, typename T1> struct VecDiffGlobal
+    {
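+        // the unused, defaulted parameters give this constructor the same shape as
+        // VecDiffCachedRegister's, so the two caching policies stay interchangeable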
+        explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
+        {
+            vec1 = vec1_;
+        }
+
+        template <typename T2, typename Dist>
+        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
+        {
+            calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
+        }
+
+        const T1* vec1;
+    };
+
+    // calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
+    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
+    {
+        template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
+        {
+            if (glob_tid < len)
+                smem[glob_tid] = vec1[glob_tid];
+            __syncthreads();
+
+            U* vec1ValsPtr = vec1Vals;
+
+            #pragma unroll
+            for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
+                *vec1ValsPtr++ = smem[i];
+
+            __syncthreads();
+        }
+
+        template <typename T2, typename Dist>
+        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
+        {
+            calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
+        }
+
+        U vec1Vals[MAX_LEN / THREAD_DIM];
+    };
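+
+    // staging runs in two synchronized phases: the block first copies vec1 into
+    // shared memory, then each thread pulls its strided slice (MAX_LEN / THREAD_DIM
+    // values) into the vec1Vals registers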
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Solve linear system
+
     // solve 2x2 linear system Ax=b
     template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])
     {
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
index 5c0051c29a..48aa62f3f7 100644
--- a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
@@ -55,7 +55,7 @@ namespace cv {  namespace gpu { namespace device
         template <int cn, typename VecD> struct SatCastHelper;
         template <typename VecD> struct SatCastHelper<1, VecD>
         {
-            template <typename VecS> static __device__ VecD cast(const VecS& v)
+            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
             {
                 typedef typename VecTraits<VecD>::elem_type D;
                 return VecTraits<VecD>::make(saturate_cast<D>(v.x));
@@ -63,7 +63,7 @@ namespace cv {  namespace gpu { namespace device
         };
         template <typename VecD> struct SatCastHelper<2, VecD>
         {
-            template <typename VecS> static __device__ VecD cast(const VecS& v)
+            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
             {
                 typedef typename VecTraits<VecD>::elem_type D;
                 return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
@@ -71,7 +71,7 @@ namespace cv {  namespace gpu { namespace device
         };
         template <typename VecD> struct SatCastHelper<3, VecD>
         {
-            template <typename VecS> static __device__ VecD cast(const VecS& v)
+            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
             {
                 typedef typename VecTraits<VecD>::elem_type D;
                 return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
@@ -79,72 +79,72 @@ namespace cv {  namespace gpu { namespace device
         };
         template <typename VecD> struct SatCastHelper<4, VecD>
         {
-            template <typename VecS> static __device__ VecD cast(const VecS& v)
+            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
             {
                 typedef typename VecTraits<VecD>::elem_type D;
                 return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
             }
         };
 
-        template <typename VecD, typename VecS> static __device__ VecD saturate_cast_caller(const VecS& v)
+        template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)
         {
             return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
         }
     }
 
-    template<typename _Tp> static __device__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);}
 
-    template<typename _Tp> static __device__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);}
 
-    template<typename _Tp> static __device__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);}
 
-    template<typename _Tp> static __device__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);}
 
 #define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \
-    static __device__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \
+    __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \
     { \
         func<type> f; \
         return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x)); \
     } \
-    static __device__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a) \
+    __device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a) \
     { \
         func<type> f; \
         return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x), f(a.y)); \
     } \
-    static __device__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a) \
+    __device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a) \
     { \
         func<type> f; \
         return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x), f(a.y), f(a.z)); \
     } \
-    static __device__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a) \
+    __device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a) \
     { \
         func<type> f; \
         return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
@@ -195,70 +195,70 @@ namespace cv {  namespace gpu { namespace device
     }
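+
+// for each vector type this macro stamps out vector-vector and vector-scalar
+// overloads of `op`; detail::BinOpTraits picks the promoted argument type when the
+// scalar's type differs from the vector's element type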
 
 #define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
-    static __device__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
+    __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
     { \
         func<type> f; \
         return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
+    __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
     { \
         func<typename detail::BinOpTraits<type, T>::argument_type> f; \
         return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
+    __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
     { \
         func<typename detail::BinOpTraits<type, T>::argument_type> f; \
         return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
     } \
-    static __device__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
+    __device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
     { \
         func<type> f; \
         return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
+    __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
     { \
         func<typename detail::BinOpTraits<type, T>::argument_type> f; \
         return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
+    __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
     { \
         func<typename detail::BinOpTraits<type, T>::argument_type> f; \
         return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
     } \
-    static __device__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
+    __device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
     { \
         func<type> f; \
         return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
+    __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
     { \
         func<typename detail::BinOpTraits<type, T>::argument_type> f; \
         return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
+    __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
     { \
         func<typename detail::BinOpTraits<type, T>::argument_type> f; \
         return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
     } \
-    static __device__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
+    __device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
     { \
         func<type> f; \
         return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
+    __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
     { \
         func<typename detail::BinOpTraits<type, T>::argument_type> f; \
         return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
+    __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
     { \
         func<typename detail::BinOpTraits<type, T>::argument_type> f; \
         return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
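
For readability, it may help to see roughly what one instantiation of the macro above expands to. The sketch below assumes an invocation along the lines of OPENCV_GPU_IMPLEMENT_VEC_BINOP(uchar, operator +, plus) — the invocation itself is outside this patch — and shows only the 2-component vector/vector overload; after the change above it becomes an inline device function instead of a static one:

    // Approximate expansion for type = uchar, op = operator +, func = plus
    // (illustrative only; TypeVec, VecTraits and plus come from the device
    // utility headers this file already includes).
    __device__ __forceinline__ TypeVec<plus<uchar>::result_type, 2>::vec_type
    operator +(const uchar2& a, const uchar2& b)
    {
        plus<uchar> f; // element-wise functor applied per component
        return VecTraits<TypeVec<plus<uchar>::result_type, 2>::vec_type>::make(
            f(a.x, b.x), f(a.y, b.y));
    }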
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index 169a10b2d2..b3db078f17 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -3642,19 +3642,24 @@ INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplateBlackSource, testing::Combine(
                         testing::Values((int)CV_TM_CCOEFF_NORMED, (int)CV_TM_CCORR_NORMED)));
 
 
-struct MatchTemplate_CCOEF_NORMED : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, std::tr1::tuple<const char*, const char*> > >
+struct MatchTemplate_CCOEF_NORMED : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, std::pair<std::string, std::string> > >
 {
     cv::gpu::DeviceInfo devInfo;
+    std::string imageName;
+    std::string patternName;
+
     cv::Mat image, pattern;
 
     virtual void SetUp()
     {
         devInfo = std::tr1::get<0>(GetParam());
+        imageName = std::tr1::get<1>(GetParam()).first;
+        patternName = std::tr1::get<1>(GetParam()).second;
 
-        image = readImage(std::tr1::get<0>(std::tr1::get<1>(GetParam())));
+        image = readImage(imageName);
         ASSERT_FALSE(image.empty());
 
-        pattern = readImage(std::tr1::get<1>(std::tr1::get<1>(GetParam())));
+        pattern = readImage(patternName);
         ASSERT_FALSE(pattern.empty());
     }
 };
@@ -3662,6 +3667,8 @@ struct MatchTemplate_CCOEF_NORMED : testing::TestWithParam< std::tr1::tuple<cv::
 TEST_P(MatchTemplate_CCOEF_NORMED, Accuracy)
 {
     PRINT_PARAM(devInfo);
+    PRINT_PARAM(imageName);
+    PRINT_PARAM(patternName);
 
     cv::Mat dstGold;
     cv::matchTemplate(image, pattern, dstGold, CV_TM_CCOEFF_NORMED);
@@ -3688,8 +3695,8 @@ TEST_P(MatchTemplate_CCOEF_NORMED, Accuracy)
 
 INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate_CCOEF_NORMED, testing::Combine(
                         testing::ValuesIn(devices()),
-                        testing::Values(std::tr1::make_tuple("matchtemplate/source-0.png", "matchtemplate/target-0.png"),
-                                        std::tr1::make_tuple("matchtemplate/source-1.png", "matchtemplate/target-1.png"))));
+                        testing::Values(std::make_pair(std::string("matchtemplate/source-0.png"), std::string("matchtemplate/target-0.png")),
+                                        std::make_pair(std::string("matchtemplate/source-1.png"), std::string("matchtemplate/target-1.png")))));
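
The pair-of-std::string parameterization above keeps the test parameters printable and lets SetUp bind them to named fields. The same shape can be reused for any two-file test; a minimal self-contained sketch with hypothetical names (FilePair, FilePairDemo):

    #include <gtest/gtest.h>
    #include <string>
    #include <utility>

    typedef std::pair<std::string, std::string> FilePair;

    struct FilePairDemo : testing::TestWithParam<FilePair> {};

    TEST_P(FilePairDemo, NamesAreNonEmpty)
    {
        // .first / .second replace the nested tr1::get<> calls.
        ASSERT_FALSE(GetParam().first.empty());
        ASSERT_FALSE(GetParam().second.empty());
    }

    INSTANTIATE_TEST_CASE_P(Demo, FilePairDemo, testing::Values(
        std::make_pair(std::string("a.png"), std::string("b.png"))));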
 
 
 ////////////////////////////////////////////////////////////////////////////
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index b846efd730..fd96119211 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -286,7 +286,7 @@ TEST(BruteForceMatcher)
 {
     // Init CPU matcher
 
-    int desc_len = 128;
+    int desc_len = 64;
 
     BruteForceMatcher< L2<float> > matcher;
 
@@ -329,7 +329,7 @@ TEST(BruteForceMatcher)
     GPU_OFF;
 
     SUBTEST << "radiusMatch";
-    float max_distance = 3.8f;
+    float max_distance = 2.0f;
 
     CPU_ON;
     matcher.radiusMatch(query, train, matches, max_distance);