From 8bb9e4302e097dd3a0ff432da4d8e4037dae3040 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Wed, 10 Aug 2011 11:32:48 +0000
Subject: [PATCH] added registerPageLocked/unregisterPageLocked functions

added convert functions to BruteForceMatcher_GPU
other minor fixes

---
 modules/gpu/include/opencv2/gpu/gpu.hpp        | 29 ++++--
 modules/gpu/src/brute_force_matcher.cpp        | 95 +++++++++++++------
 modules/gpu/src/cuda/brute_force_matcher.cu    | 62 ++++--------
 modules/gpu/src/cuda/calib3d.cu                |  9 +-
 modules/gpu/src/cuda/canny.cu                  |  7 +-
 modules/gpu/src/cuda/element_operations.cu     | 18 ++--
 modules/gpu/src/cuda/hist.cu                   | 21 ++--
 modules/gpu/src/cuda/imgproc.cu                | 10 +-
 modules/gpu/src/cuda/internal_shared.hpp       | 44 ++++++++-
 modules/gpu/src/cuda/matrix_operations.cu      | 16 ++--
 modules/gpu/src/cuda/surf.cu                   |  4 +-
 modules/gpu/src/matrix_operations.cpp          | 19 ++--
 .../opencv2/gpu/device/detail/transform.hpp    | 44 ++++-----
 .../gpu/src/opencv2/gpu/device/functional.hpp  | 72 +++++++-------
 .../gpu/src/opencv2/gpu/device/transform.hpp   |  8 +-
 .../gpu/src/opencv2/gpu/device/utility.hpp     | 10 +-
 modules/gpu/src/surf.cpp                       | 34 +++----
 samples/gpu/performance/tests.cpp              |  2 +-
 18 files changed, 276 insertions(+), 228 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 3d73633cba..9ef46a477d 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -149,6 +149,11 @@ namespace cv
         // It is convertible to cv::Mat header without reference counting
         // so you can use it with other opencv functions.
 
+        // Page-locks the matrix m memory and maps it for the device(s)
+        CV_EXPORTS void registerPageLocked(Mat& m);
+        // Unmaps the memory of matrix m, and makes it pageable again.
+        CV_EXPORTS void unregisterPageLocked(Mat& m);
+
         class CV_EXPORTS CudaMem
         {
         public:
@@ -1254,8 +1259,10 @@ namespace cv
             GpuMat& trainIdx, GpuMat& distance,
             const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
 
-        // Download trainIdx and distance to CPU vector with DMatch
+        // Download trainIdx and distance and convert them to CPU vector with DMatch
         static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);
+        // Convert trainIdx and distance to vector with DMatch
+        static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);
 
         // Find one best match for each query descriptor.
         void match(const GpuMat& queryDescs, const GpuMat& trainDescs, std::vector<DMatch>& matches,
@@ -1273,13 +1280,13 @@ namespace cv
             GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
             const GpuMat& maskCollection, Stream& stream = Stream::Null());
 
-        // Download trainIdx, imgIdx and distance to CPU vector with DMatch
-        static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
-            std::vector<DMatch>& matches);
+        // Download trainIdx, imgIdx and distance and convert them to vector with DMatch
+        static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches);
+        // Convert trainIdx, imgIdx and distance to vector with DMatch
+        static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);
 
         // Find one best match from train collection for each query descriptor.
-        void match(const GpuMat& queryDescs, std::vector<DMatch>& matches,
-            const std::vector<GpuMat>& masks = std::vector<GpuMat>());
+        void match(const GpuMat& queryDescs, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>());
 
         // Find k best matches for each query descriptor (in increasing order of distances).
         // trainIdx.at<int>(queryIdx, i) will contain index of i'th best trains (i < k).
@@ -1291,12 +1298,15 @@ namespace cv
         void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
             GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
             const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
 
-        // Download trainIdx and distance to CPU vector with DMatch
+        // Download trainIdx and distance and convert them to vector with DMatch
         // compactResult is used when mask is not empty. If compactResult is false matches
         // vector will have the same size as queryDescriptors rows. If compactResult is true
         // matches vector will not contain matches for fully masked out query descriptors.
         static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
             std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+        // Convert trainIdx and distance to vector with DMatch
+        static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,
+            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
 
         // Find k best matches for each query descriptor (in increasing order of distances).
         // compactResult is used when mask is not empty. If compactResult is false matches
         // vector will have the same size as queryDescriptors rows.
@@ -1326,13 +1336,16 @@ namespace cv
             GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance,
             const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
 
-        // Download trainIdx, nMatches and distance to CPU vector with DMatch.
+        // Download trainIdx, nMatches and distance and convert them to vector with DMatch.
         // matches will be sorted in increasing order of distances.
         // compactResult is used when mask is not empty. If compactResult is false matches
         // vector will have the same size as queryDescriptors rows. If compactResult is true
         // matches vector will not contain matches for fully masked out query descriptors.
         static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, const GpuMat& distance,
             std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+        // Convert trainIdx, nMatches and distance to vector with DMatch.
+        static void radiusMatchConvert(const Mat& trainIdx, const Mat& nMatches, const Mat& distance,
+            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
 
         // Find best matches for each query descriptor which have distance less than maxDistance
         // (in increasing order of distances).
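The new *Convert entry points separate DMatch construction from the device-to-host copy. A minimal usage sketch, not part of the patch itself (it assumes the OpenCV 2.3 gpu API declared above, including the default BruteForceMatcher_GPU_base constructor):

    #include <opencv2/gpu/gpu.hpp>
    #include <vector>

    // Match on the GPU, then turn the raw result matrices into DMatch
    // objects on the CPU. matchDownload() bundles these two steps.
    void matchSketch(const cv::gpu::GpuMat& queryDescs, const cv::gpu::GpuMat& trainDescs)
    {
        cv::gpu::BruteForceMatcher_GPU_base matcher;

        cv::gpu::GpuMat trainIdx, distance;
        matcher.matchSingle(queryDescs, trainDescs, trainIdx, distance);

        cv::Mat trainIdxCPU(trainIdx);     // explicit download...
        cv::Mat distanceCPU(distance);

        std::vector<cv::DMatch> matches;   // ...then CPU-side conversion
        cv::gpu::BruteForceMatcher_GPU_base::matchConvert(trainIdxCPU, distanceCPU, matches);
    }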
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index f4ec3cc52f..19521a2a64 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -56,17 +56,21 @@ bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return
 bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; }
 void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, std::vector<DMatch>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector<DMatch>&, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector<GpuMat>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, const Mat&, std::vector<DMatch>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector<DMatch>&, const std::vector<GpuMat>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, int, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, float, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
 
@@ -216,8 +220,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
     func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance,
-    vector<DMatch>& matches)
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
+{
+    if (trainIdx.empty() || distance.empty())
+        return;
+
+    Mat trainIdxCPU = trainIdx;
+    Mat distanceCPU = distance;
+
+    matchConvert(trainIdxCPU, distanceCPU, matches);
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches)
 {
     if (trainIdx.empty() || distance.empty())
         return;
@@ -227,14 +241,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
 
     const int nQuery = trainIdx.cols;
 
-    Mat trainIdxCPU = trainIdx;
-    Mat distanceCPU = distance;
-
     matches.clear();
     matches.reserve(nQuery);
 
-    const int* trainIdx_ptr = trainIdxCPU.ptr<int>();
-    const float* distance_ptr = distanceCPU.ptr<float>();
+    const int* trainIdx_ptr = trainIdx.ptr<int>();
+    const float* distance_ptr = distance.ptr<float>();
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
     {
         int trainIdx = *trainIdx_ptr;
@@ -347,8 +358,19 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
     func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx,
-    const GpuMat& distance, vector<DMatch>& matches)
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
+{
+    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
+        return;
+
+    Mat trainIdxCPU = trainIdx;
+    Mat imgIdxCPU = imgIdx;
+    Mat distanceCPU = distance;
+
+    matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches);
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches)
 {
     if (trainIdx.empty() || imgIdx.empty() || distance.empty())
         return;
@@ -359,16 +381,12 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
 
     const int nQuery = trainIdx.cols;
 
-    Mat trainIdxCPU = trainIdx;
-    Mat imgIdxCPU = imgIdx;
-    Mat distanceCPU = distance;
-
     matches.clear();
     matches.reserve(nQuery);
 
-    const int* trainIdx_ptr = trainIdxCPU.ptr<int>();
-    const int* imgIdx_ptr = imgIdxCPU.ptr<int>();
-    const float* distance_ptr = distanceCPU.ptr<float>();
+    const int* trainIdx_ptr = trainIdx.ptr<int>();
+    const int* imgIdx_ptr = imgIdx.ptr<int>();
+    const float* distance_ptr = distance.ptr<float>();
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
     {
         int trainIdx = *trainIdx_ptr;
@@ -385,8 +403,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
     }
 }
 
-void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector<DMatch>& matches,
-    const vector<GpuMat>& masks)
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector<DMatch>& matches, const vector<GpuMat>& masks)
 {
     GpuMat trainCollection;
     GpuMat maskCollection;
@@ -462,15 +479,24 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId
     if (trainIdx.empty() || distance.empty())
         return;
 
+    Mat trainIdxCPU = trainIdx;
+    Mat distanceCPU = distance;
+
+    knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, const Mat& distance,
+    std::vector< std::vector<DMatch> >& matches, bool compactResult)
+{
+    if (trainIdx.empty() || distance.empty())
+        return;
+
     CV_Assert(trainIdx.type() == CV_32SC1);
     CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
 
     const int nQuery = distance.rows;
     const int k = trainIdx.cols;
 
-    Mat trainIdxCPU = trainIdx;
-    Mat distanceCPU = distance;
-
     matches.clear();
     matches.reserve(nQuery);
 
@@ -480,8 +506,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId
         vector<DMatch>& curMatches = matches.back();
         curMatches.reserve(k);
 
-        int* trainIdx_ptr = trainIdxCPU.ptr<int>(queryIdx);
-        float* distance_ptr = distanceCPU.ptr<float>(queryIdx);
+        const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
+        const float* distance_ptr = distance.ptr<float>(queryIdx);
         for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
         {
             int trainIdx = *trainIdx_ptr;
@@ -614,24 +640,33 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
     if (trainIdx.empty() || nMatches.empty() || distance.empty())
         return;
 
+    Mat trainIdxCPU = trainIdx;
+    Mat nMatchesCPU = nMatches;
+    Mat distanceCPU = distance;
+
+    radiusMatchConvert(trainIdxCPU, nMatchesCPU, distanceCPU, matches, compactResult);
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx, const Mat& nMatches, const Mat& distance,
+    std::vector< std::vector<DMatch> >& matches, bool compactResult)
+{
+    if (trainIdx.empty() || nMatches.empty() || distance.empty())
+        return;
+
     CV_Assert(trainIdx.type() == CV_32SC1);
     CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows);
     CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
 
     const int nQuery = trainIdx.rows;
 
-    Mat trainIdxCPU = trainIdx;
-    Mat nMatchesCPU = nMatches;
-    Mat distanceCPU = distance;
-
     matches.clear();
     matches.reserve(nQuery);
 
-    const unsigned int* nMatches_ptr = nMatchesCPU.ptr<unsigned int>();
+    const unsigned int* nMatches_ptr = nMatches.ptr<unsigned int>();
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
     {
-        const int* trainIdx_ptr = trainIdxCPU.ptr<int>(queryIdx);
-        const float* distance_ptr = distanceCPU.ptr<float>(queryIdx);
+        const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
+        const float* distance_ptr = distance.ptr<float>(queryIdx);
 
         const int nMatches = std::min(static_cast<int>(nMatches_ptr[queryIdx]), trainIdx.cols);
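Splitting download from conversion is what makes an asynchronous pipeline possible. A hedged sketch combining the convert functions with the new page-locking helpers (it assumes Stream::enqueueDownload accepts a page-locked cv::Mat, which this patch does not itself add):

    // Overlap the device-to-host copy with other work, then convert on the CPU.
    cv::gpu::Stream stream;
    cv::gpu::GpuMat trainIdx, distance;
    matcher.matchSingle(queryDescs, trainDescs, trainIdx, distance, cv::gpu::GpuMat(), stream);

    cv::Mat trainIdxCPU(trainIdx.size(), trainIdx.type());
    cv::Mat distanceCPU(distance.size(), distance.type());
    cv::gpu::registerPageLocked(trainIdxCPU);    // pin so the copies can be asynchronous
    cv::gpu::registerPageLocked(distanceCPU);

    stream.enqueueDownload(trainIdx, trainIdxCPU);
    stream.enqueueDownload(distance, distanceCPU);
    stream.waitForCompletion();

    std::vector<cv::DMatch> matches;
    matcher.matchConvert(trainIdxCPU, distanceCPU, matches);

    cv::gpu::unregisterPageLocked(distanceCPU);
    cv::gpu::unregisterPageLocked(trainIdxCPU);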
diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu
index 17c5c802f7..6b1361901c 100644
--- a/modules/gpu/src/cuda/brute_force_matcher.cu
+++ b/modules/gpu/src/cuda/brute_force_matcher.cu
@@ -56,9 +56,8 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // Mask strategy
 
-    class SingleMask
+    struct SingleMask
     {
-    public:
         explicit SingleMask(const PtrStep& mask_) : mask(mask_) {}
 
         __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
@@ -66,13 +65,11 @@ namespace cv { namespace gpu { namespace bfmatcher
             return mask.ptr(queryIdx)[trainIdx] != 0;
         }
 
-    private:
-        PtrStep mask;
+        const PtrStep mask;
     };
 
-    class MaskCollection
+    struct MaskCollection
     {
-    public:
         explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {}
 
         __device__ __forceinline__ void nextMask()
@@ -86,15 +83,14 @@ namespace cv { namespace gpu { namespace bfmatcher
             return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(queryIdx), trainIdx, val), (val != 0));
         }
 
-    private:
-        PtrStep* maskCollection;
+        const PtrStep* maskCollection;
         PtrStep curMask;
     };
 
     class WithOutMask
     {
     public:
-        __device__ __forceinline__ void nextMask()
+        __device__ __forceinline__ void nextMask() const
        {
        }
        __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
@@ -128,9 +124,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // Distance

-    template <typename T> class L1Dist
+    template <typename T> struct L1Dist
    {
-    public:
        typedef int ResultType;
        typedef int ValueType;
@@ -151,12 +146,10 @@ namespace cv { namespace gpu { namespace bfmatcher
            return mySum;
        }

-    private:
        int mySum;
    };
-    template <> class L1Dist<float>
+    template <> struct L1Dist<float>
    {
-    public:
        typedef float ResultType;
        typedef float ValueType;
@@ -177,13 +170,11 @@ namespace cv { namespace gpu { namespace bfmatcher
            return mySum;
        }

-    private:
        float mySum;
    };

-    class L2Dist
+    struct L2Dist
    {
-    public:
        typedef float ResultType;
        typedef float ValueType;
@@ -205,13 +196,11 @@ namespace cv { namespace gpu { namespace bfmatcher
            return sqrtf(mySum);
        }

-    private:
        float mySum;
    };

-    class HammingDist
+    struct HammingDist
    {
-    public:
        typedef int ResultType;
        typedef int ValueType;
@@ -232,7 +221,6 @@ namespace cv { namespace gpu { namespace bfmatcher
            return mySum;
        }

-    private:
        int mySum;
    };

@@ -425,10 +413,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // ReduceDescCalculator

-    template <typename T>
-    class ReduceDescCalculatorSimple
+    template <typename T> struct ReduceDescCalculatorSimple
    {
-    public:
        __device__ __forceinline__ void prepare(const T* queryDescs_, int, void*)
        {
            queryDescs = queryDescs_;
@@ -440,14 +426,12 @@ namespace cv { namespace gpu { namespace bfmatcher
            reduceDescDiff(queryDescs, trainDescs, desc_len, dist, sdiff_row);
        }

-    private:
        const T* queryDescs;
    };

    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, typename T, typename U>
-    class ReduceDescCalculatorCached
+    struct ReduceDescCalculatorCached
    {
-    public:
        __device__ __forceinline__ void prepare(const T* queryDescs, int desc_len, U* smem)
        {
            loadDescsVals(queryDescs, desc_len, queryVals, smem);
@@ -459,7 +443,6 @@ namespace cv { namespace gpu { namespace bfmatcher
            reduceDescDiffCached(queryVals, trainDescs, desc_len, dist, sdiff_row);
        }

-    private:
        U queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];
    };

@@ -497,10 +480,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // Train collection loop strategy

-    template <typename T>
-    class SingleTrain
+    template <typename T> struct SingleTrain
    {
-    public:
        explicit SingleTrain(const DevMem2D_<T>& trainDescs_) : trainDescs(trainDescs_)
        {
        }
@@ -517,14 +498,11 @@ namespace cv { namespace gpu { namespace bfmatcher
            return trainDescs.cols;
        }

-    private:
-        DevMem2D_<T> trainDescs;
+        const DevMem2D_<T> trainDescs;
    };

-    template <typename T>
-    class TrainCollection
+    template <typename T> struct TrainCollection
    {
-    public:
        TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) :
            trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)
        {
        }
@@ -536,7 +514,7 @@ namespace cv { namespace gpu { namespace bfmatcher
        {
            for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
            {
-                DevMem2D_<T> trainDescs = trainCollection[imgIdx];
+                const DevMem2D_<T> trainDescs = trainCollection[imgIdx];
                m.nextMask();
                matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
            }
@@ -547,7 +525,6 @@ namespace cv { namespace gpu { namespace bfmatcher
            return desclen;
        }

-    private:
        const DevMem2D_<T>* trainCollection;
        int nImg;
        int desclen;
@@ -806,7 +783,7 @@ namespace cv { namespace gpu { namespace bfmatcher
    // Calc distance kernel

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    __global__ void calcDistance(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, Mask mask, PtrStepf distance)
+    __global__ void calcDistance(const PtrStep_<T> queryDescs_, const DevMem2D_<T> trainDescs_, const Mask mask, PtrStepf distance)
    {
        __shared__ typename Dist::ResultType sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];

@@ -989,8 +966,7 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // find knn match kernel

-    template <int BLOCK_SIZE>
-    __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
+    template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
    {
        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
        __shared__ float sdist[SMEM_SIZE];
@@ -1130,8 +1106,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    // Radius Match kernel

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    __global__ void radiusMatch(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_,
-        float maxDistance, Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
+    __global__ void radiusMatch(const PtrStep_<T> queryDescs_, const DevMem2D_<T> trainDescs_,
+        float maxDistance, const Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
    {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
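SingleMask, MaskCollection and WithOutMask are compile-time policies: each matcher kernel is instantiated once per mask type, so the unmasked variant pays nothing at run time. A plain C++ analogue of the pattern, illustrative only and not part of the module:

    struct AllowAll
    {
        bool operator()(int, int) const { return true; }  // same call shape as WithOutMask
    };

    template <typename Mask>
    int countAllowed(int rows, int cols, const Mask& mask)
    {
        int n = 0;
        for (int queryIdx = 0; queryIdx < rows; ++queryIdx)
            for (int trainIdx = 0; trainIdx < cols; ++trainIdx)
                if (mask(queryIdx, trainIdx))   // inlined per instantiation
                    ++n;
        return n;
    }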
diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu
index 13136d9a48..2a30393817 100644
--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
@@ -42,6 +42,7 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/functional.hpp"
 
 #define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
 
@@ -56,9 +57,9 @@ namespace cv { namespace gpu
        __constant__ float3 crot2;
        __constant__ float3 ctransl;

-        struct TransformOp
+        struct TransformOp : unary_function<float3, float3>
        {
-            __device__ __forceinline__ float3 operator()(float3 p) const
+            __device__ __forceinline__ float3 operator()(const float3& p) const
            {
                return make_float3(
                    crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
@@ -89,9 +90,9 @@ namespace cv { namespace gpu
        __constant__ float3 cproj0;
        __constant__ float3 cproj1;

-        struct ProjectOp
+        struct ProjectOp : unary_function<float3, float2>
        {
-            __device__ __forceinline__ float2 operator()(float3 p) const
+            __device__ __forceinline__ float2 operator()(const float3& p) const
            {
                // Rotate and translate in 3D
                float3 t = make_float3(
diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu
index b2acae9711..19f9b384c4 100644
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -49,7 +49,7 @@ using namespace cv::gpu::device;
 
 namespace cv { namespace gpu { namespace canny
 {
-    __global__ void calcSobelRowPass(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+    __global__ void calcSobelRowPass(const PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
    {
        __shared__ int smem[16][18];

@@ -100,7 +100,8 @@ namespace cv { namespace gpu { namespace canny
        }
    };

-    template <typename Norm> __global__ void calcMagnitude(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+    template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
+        PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
    {
        __shared__ int sdx[18][16];
        __shared__ int sdy[18][16];
@@ -179,7 +180,7 @@ namespace cv { namespace gpu { namespace canny
 #define CANNY_SHIFT 15
 #define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
 
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ ... @@ namespace cv { namespace gpu { namespace mathfunc
-    template <typename T1, typename T2>
-    struct NotEqual
+    template <typename T1, typename T2> struct NotEqual : binary_function<T1, T2, uchar>
    {
-        __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2)
+        __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2) const
        {
            return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
        }
    };
@@ -467,8 +466,7 @@ namespace cv { namespace gpu { namespace mathfunc
    //////////////////////////////////////////////////////////////////////////
    // pow

-    template<typename T, bool Signed = device::numeric_limits_gpu<T>::is_signed>
-    struct PowOp
+    template<typename T, bool Signed = device::numeric_limits_gpu<T>::is_signed> struct PowOp : unary_function<T, T>
    {
        float power;
        PowOp(float power_) : power(power_) {}
@@ -479,13 +477,12 @@ namespace cv { namespace gpu { namespace mathfunc
        }
    };

-    template<typename T>
-    struct PowOp<T, true>
+    template<typename T> struct PowOp<T, true> : unary_function<T, float>
    {
        float power;
        PowOp(float power_) : power(power_) {}

-        __device__ __forceinline__ float operator()(const T& e)
+        __device__ __forceinline__ float operator()(const T& e) const
        {
            T res = saturate_cast<T>(__powf((float)e, power));

@@ -495,13 +492,12 @@ namespace cv { namespace gpu { namespace mathfunc
        }
    };

-    template<>
-    struct PowOp<float>
+    template<> struct PowOp<float> : unary_function<float, float>
    {
        float power;
        PowOp(float power_) : power(power_) {}

-        __device__ __forceinline__ float operator()(const float& e)
+        __device__ __forceinline__ float operator()(const float& e) const
        {
            return __powf(fabs(e), power);
        }
diff --git a/modules/gpu/src/cuda/hist.cu b/modules/gpu/src/cuda/hist.cu
index 23f8733fb1..cafcb0909a 100644
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -105,7 +105,7 @@ namespace cv { namespace gpu { namespace histograms
        if (x + 3 < cols)
            addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
    }

-    __global__ void histogram256(PtrStep_<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
+    __global__ void histogram256(const PtrStep_<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
    {
        //Per-warp subhistogram storage
        __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
@@ -189,21 +189,18 @@ namespace cv { namespace gpu { namespace histograms
        cudaSafeCall( cudaDeviceSynchronize() );
    }

-    __global__ void equalizeHist(DevMem2D src, PtrStep dst, const int* lut)
-    {
-        __shared__ int s_lut[256];
-
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        s_lut[tid] = lut[tid];
-        __syncthreads();
+    __constant__ int c_lut[256];

+    __global__ void equalizeHist(const DevMem2D src, PtrStep dst)
+    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < src.cols && y < src.rows)
        {
-            dst.ptr(y)[x] = __float2int_rn(255.0f * s_lut[src.ptr(y)[x]] / (src.cols * src.rows));
+            const uchar val = src.ptr(y)[x];
+            const int lut = c_lut[val];
+            dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
        }
    }

@@ -212,7 +209,9 @@ namespace cv { namespace gpu { namespace histograms
        dim3 block(16, 16);
        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

-        equalizeHist<<<grid, block, 0, stream>>>(src, dst, lut);
+        cudaSafeCall( cudaMemcpyToSymbol(cv::gpu::histograms::c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
+
+        equalizeHist<<<grid, block, 0, stream>>>(src, dst);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 1f760d7d33..ab6d9cd49c 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -49,7 +49,7 @@ using namespace cv::gpu::device;
 /////////////////////////////////// Remap ///////////////////////////////////////////////
 namespace cv { namespace gpu { namespace imgproc
 {
-    texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap;
+    texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap(0, cudaFilterModeLinear, cudaAddressModeWrap);
 
    __global__ void remap_1c(const float* mapx, const float* mapy, size_t map_step, uchar* out, size_t out_step, int width, int height)
    {
@@ -131,16 +131,12 @@ namespace cv { namespace gpu { namespace imgproc
        grid.x = divUp(dst.cols, threads.x);
        grid.y = divUp(dst.rows, threads.y);

-        tex_remap.filterMode = cudaFilterModeLinear;
-        tex_remap.addressMode[0] = tex_remap.addressMode[1] = cudaAddressModeWrap;
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-        cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );
+        TextureBinder texRemapBinder(&tex_remap, src);

        remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
-        cudaSafeCall( cudaUnbindTexture(tex_remap) );
    }

    void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
@@ -151,8 +147,8 @@ namespace cv { namespace gpu { namespace imgproc
        grid.y = divUp(dst.rows, threads.y);

        remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
-
        cudaSafeCall( cudaGetLastError() );
+
        cudaSafeCall( cudaDeviceSynchronize() );
    }
diff --git a/modules/gpu/src/cuda/internal_shared.hpp b/modules/gpu/src/cuda/internal_shared.hpp
index 860d627151..9a3086b67b 100644
--- a/modules/gpu/src/cuda/internal_shared.hpp
+++ b/modules/gpu/src/cuda/internal_shared.hpp
@@ -77,7 +77,6 @@ namespace cv
    // Returns true if the GPU analogue exists, false otherwise.
    bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);

-
    static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    template<class T> static inline void uploadConstant(const char* name, const T& value)
@@ -117,6 +116,49 @@ namespace cv
        cudaSafeCall( cudaUnbindTexture(tex) );
    }

+    class TextureBinder
+    {
+    public:
+        TextureBinder() : tex_(0) {}
+        template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)
+        {
+            bind(tex, img);
+        }
+        template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)
+        {
+            bind(tex_name, img);
+        }
+        ~TextureBinder() { unbind(); }
+
+        template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)
+        {
+            unbind();
+
+            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
+            cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
+
+            tex_ = tex;
+        }
+        template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)
+        {
+            const textureReference* tex;
+            cudaSafeCall( cudaGetTextureReference(&tex, tex_name) );
+            bind(tex, img);
+        }
+
+        void unbind()
+        {
+            if (tex_)
+            {
+                cudaUnbindTexture(tex_);
+                tex_ = 0;
+            }
+        }
+
+    private:
+        const textureReference* tex_;
+    };
+
    class NppStreamHandler
    {
    public:
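TextureBinder turns the bind/unbind pair into scope-based cleanup, as the remap change above shows. A minimal sketch of the intended use, with an illustrative texture name and assuming the cv::gpu types from this header:

    texture<uchar, 2> tex_src;

    void processSketch(const cv::gpu::DevMem2D& src)
    {
        cv::gpu::TextureBinder binder(&tex_src, src);  // binds in the constructor
        // ... launch kernels that sample tex_src ...
        cudaSafeCall( cudaDeviceSynchronize() );
    }   // unbinds when binder leaves scope, on success or error alike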
diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu
index 2602c05749..3636853b4d 100644
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -43,6 +43,7 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/functional.hpp"
 
 using namespace cv::gpu::device;
 
@@ -62,7 +63,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
    ///////////////////////////////////////////////////////////////////////////

    template<typename T>
-    __global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
+    __global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -162,7 +163,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
    }

    template<typename T>
-    __global__ void set_to_without_mask(T * mat, int cols, int rows, size_t step, int channels)
+    __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -175,7 +176,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
    }

    template<typename T>
-    __global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, size_t step, int channels, size_t step_mask)
+    __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -237,19 +238,16 @@ namespace cv { namespace gpu { namespace matrix_operations {
    //////////////////////////////// ConvertTo ////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////

-    template <typename T, typename D>
-    class Convertor
+    template <typename T, typename D> struct Convertor : unary_function<T, D>
    {
-    public:
        Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}

-        __device__ __forceinline__ D operator()(const T& src)
+        __device__ __forceinline__ D operator()(const T& src) const
        {
            return saturate_cast<D>(alpha * src + beta);
        }

-    private:
-        double alpha, beta;
+        const double alpha, beta;
    };

    template<typename T, typename D>
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
index 551ccd9a72..a22077e2f6 100644
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -225,7 +225,7 @@ namespace cv { namespace gpu { namespace surf
    };

    template <typename Mask>
-    __global__ void icvFindMaximaInLayer(PtrStepf det, PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
+    __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
    {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110

@@ -346,7 +346,7 @@ namespace cv { namespace gpu { namespace surf
    ////////////////////////////////////////////////////////////////////////
    // INTERPOLATION

-    __global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer,
+    __global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
        float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
        unsigned int* featureCounter)
    {
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index a49c8a5569..874e6e506d 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -45,13 +45,10 @@
 using namespace cv;
 using namespace cv::gpu;
 
-////////////////////////////////////////////////////////////////////////
-//////////////////////////////// GpuMat ////////////////////////////////
-////////////////////////////////////////////////////////////////////////
-
-
 #if !defined (HAVE_CUDA)
 
+void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); }
+void cv::gpu::unregisterPageLocked(Mat&) { throw_nogpu(); }
 void cv::gpu::CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/) { throw_nogpu(); }
 bool cv::gpu::CudaMem::canMapHostMemory() { throw_nogpu(); return false; }
 void cv::gpu::CudaMem::release() { throw_nogpu(); }
@@ -59,9 +56,15 @@ GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_nogpu(); return Gpu
 
 #else /* !defined (HAVE_CUDA) */
 
-///////////////////////////////////////////////////////////////////////
-//////////////////////////////// CudaMem //////////////////////////////
-///////////////////////////////////////////////////////////////////////
+void cv::gpu::registerPageLocked(Mat& m)
+{
+    cudaSafeCall( cudaHostRegister(m.ptr(), m.step * m.rows, cudaHostRegisterPortable) );
+}
+
+void cv::gpu::unregisterPageLocked(Mat& m)
+{
+    cudaSafeCall( cudaHostUnregister(m.ptr()) );
+}
 
 bool cv::gpu::CudaMem::canMapHostMemory()
 {
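The helpers wrap cudaHostRegister/cudaHostUnregister (a CUDA 4.0+ runtime feature), so an existing cv::Mat can be pinned in place rather than reallocated as CudaMem. A short usage sketch:

    cv::Mat frame(1080, 1920, CV_8UC1);
    cv::gpu::registerPageLocked(frame);   // pin the existing allocation

    cv::gpu::GpuMat d_frame;
    d_frame.upload(frame);                // copies from pinned memory run faster
                                          // and can overlap kernels via streams

    cv::gpu::unregisterPageLocked(frame); // unpin before the Mat is released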
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp b/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
index a0b931ca7f..22eb7ffada 100644
--- a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
@@ -52,15 +52,13 @@ namespace cv { namespace gpu { namespace device
 {
    //! Mask accessor

-    class MaskReader
+    struct MaskReader
    {
-    public:
        explicit MaskReader(const PtrStep& mask_): mask(mask_) {}

        __device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; }

-    private:
-        PtrStep mask;
+        const PtrStep mask;
    };

    struct NoMask
@@ -159,7 +157,7 @@ namespace cv { namespace gpu { namespace device
    template <> struct OpUnroller<3>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
@@ -170,7 +168,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
@@ -183,7 +181,7 @@ namespace cv { namespace gpu { namespace device
    template <> struct OpUnroller<4>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
@@ -196,7 +194,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
@@ -210,7 +208,7 @@ namespace cv { namespace gpu { namespace device
    };

    template <typename T, typename D, typename UnOp, typename Mask>
-    __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, UnOp op)
+    __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, const UnOp op)
    {
        typedef typename UnReadWriteTraits<T, D>::read_type read_type;
        typedef typename UnReadWriteTraits<T, D>::write_type write_type;
@@ -227,7 +225,7 @@ namespace cv { namespace gpu { namespace device

            if (x_shifted + shift - 1 < src_.cols)
            {
-                read_type src_n_el = ((const read_type*)src)[x];
+                const read_type src_n_el = ((const read_type*)src)[x];
                write_type dst_n_el;

                OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
@@ -246,7 +244,7 @@ namespace cv { namespace gpu { namespace device
    }

    template <typename T, typename D, typename UnOp, typename Mask>
-    static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep_<D> dst, const Mask mask, UnOp op)
+    static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep_<D> dst, const Mask mask, const UnOp op)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -259,7 +257,7 @@ namespace cv { namespace gpu { namespace device

    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_,
-        const Mask mask, BinOp op)
+        const Mask mask, const BinOp op)
    {
        typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
        typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
@@ -278,8 +276,8 @@ namespace cv { namespace gpu { namespace device

            if (x_shifted + shift - 1 < src1_.cols)
            {
-                read_type1 src1_n_el = ((const read_type1*)src1)[x];
-                read_type2 src2_n_el = ((const read_type2*)src2)[x];
+                const read_type1 src1_n_el = ((const read_type1*)src1)[x];
+                const read_type2 src2_n_el = ((const read_type2*)src2)[x];
                write_type dst_n_el;

                OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
@@ -299,15 +297,15 @@ namespace cv { namespace gpu { namespace device

    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep_<T2> src2, PtrStep_<D> dst,
-        const Mask mask, BinOp op)
+        const Mask mask, const BinOp op)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < src1.cols && y < src1.rows && mask(y, x))
        {
-            T1 src1_data = src1.ptr(y)[x];
-            T2 src2_data = src2.ptr(y)[x];
+            const T1 src1_data = src1.ptr(y)[x];
+            const T2 src2_data = src2.ptr(y)[x];
            dst.ptr(y)[x] = op(src1_data, src2_data);
        }
    }
@@ -316,7 +314,7 @@ namespace cv { namespace gpu { namespace device
    template<> struct TransformDispatcher<false>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
-        static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, cudaStream_t stream)
+        static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
        {
            dim3 threads(16, 16, 1);
            dim3 grid(1, 1, 1);
@@ -332,7 +330,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, const Mask& mask, cudaStream_t stream)
+        static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
        {
            dim3 threads(16, 16, 1);
            dim3 grid(1, 1, 1);
@@ -350,7 +348,7 @@ namespace cv { namespace gpu { namespace device
    };
    template<> struct TransformDispatcher<true>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
-        static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, cudaStream_t stream)
+        static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
        {
            const int shift = UnReadWriteTraits<T, D>::shift;
@@ -368,7 +366,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, const Mask& mask, cudaStream_t stream)
+        static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
        {
            const int shift = BinReadWriteTraits<T1, T2, D>::shift;
@@ -413,13 +411,13 @@ namespace cv { namespace gpu { namespace device
    };

    template <typename T, typename D, typename UnOp, typename Mask>
-    static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, cudaStream_t stream)
+    static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
    {
        TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
    }

    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-    static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, const Mask& mask, cudaStream_t stream)
+    static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
    {
        TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
    }
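The const-reference plumbing above is what lets user functors travel unchanged into the kernels. A sketch of a caller, assuming unary_function is visible inside cv::gpu::device as the functional header below suggests:

    using namespace cv::gpu;
    using namespace cv::gpu::device;

    struct Scale : unary_function<float, float>
    {
        float s;
        explicit Scale(float s_) : s(s_) {}
        __device__ __forceinline__ float operator()(float x) const { return s * x; }
    };

    void scaleSketch(const DevMem2Df& src, const DevMem2Df& dst, cudaStream_t stream)
    {
        transform(src, dst, Scale(2.0f), stream);  // op now passes by const reference
    }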
diff --git a/modules/gpu/src/opencv2/gpu/device/functional.hpp b/modules/gpu/src/opencv2/gpu/device/functional.hpp
index 28889945e6..b3abdf64a1 100644
--- a/modules/gpu/src/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp
@@ -73,7 +73,7 @@ namespace cv { namespace gpu { namespace device
    using thrust::bit_and;
    using thrust::bit_or;
    using thrust::bit_xor;
-    template <typename T> struct bit_not : public unary_function<T, T>
+    template <typename T> struct bit_not : unary_function<T, T>
    {
        __forceinline__ __device__ T operator ()(const T& v) const {return ~v;}
    };
@@ -81,12 +81,12 @@ namespace cv { namespace gpu { namespace device
    using thrust::identity;

 #define OPENCV_GPU_IMPLEMENT_MINMAX(name, type, op) \
-    template <> struct name<type> : public binary_function<type, type, type> \
+    template <> struct name<type> : binary_function<type, type, type> \
    { \
        __forceinline__ __device__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
    };

-    template <typename T> struct maximum : public binary_function<T, T, T>
+    template <typename T> struct maximum : binary_function<T, T, T>
    {
        __forceinline__ __device__ T operator()(const T& lhs, const T& rhs) const {return lhs < rhs ? rhs : lhs;}
    };
@@ -100,7 +100,7 @@ namespace cv { namespace gpu { namespace device
    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, fmax)
    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, fmax)

-    template <typename T> struct minimum : public binary_function<T, T, T>
+    template <typename T> struct minimum : binary_function<T, T, T>
    {
        __forceinline__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? lhs : rhs;}
    };
@@ -126,31 +126,31 @@ namespace cv { namespace gpu { namespace device
    using thrust::not2;

 #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(func) \
-    template <typename T> struct func ## _func : public unary_function<T, float> \
+    template <typename T> struct func ## _func : unary_function<T, float> \
    { \
-        __forceinline__ __device__ float operator ()(const T& v) \
+        __forceinline__ __device__ float operator ()(const T& v) const \
        { \
            return func ## f(v); \
        } \
    }; \
-    template <> struct func ## _func<double> : public unary_function<double, double> \
+    template <> struct func ## _func<double> : unary_function<double, double> \
    { \
-        __forceinline__ __device__ double operator ()(double v) \
+        __forceinline__ __device__ double operator ()(double v) const \
        { \
            return func(v); \
        } \
    };

 #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(func) \
-    template <typename T> struct func ## _func : public binary_function<T, T, float> \
+    template <typename T> struct func ## _func : binary_function<T, T, float> \
    { \
-        __forceinline__ __device__ float operator ()(const T& v1, const T& v2) \
+        __forceinline__ __device__ float operator ()(const T& v1, const T& v2) const \
        { \
            return func ## f(v1, v2); \
        } \
    }; \
-    template <> struct func ## _func<double> : public binary_function<double, double, double> \
+    template <> struct func ## _func<double> : binary_function<double, double, double> \
    { \
-        __forceinline__ __device__ double operator ()(double v1, double v2) \
+        __forceinline__ __device__ double operator ()(double v1, double v2) const \
        { \
            return func(v1, v2); \
        } \
@@ -184,7 +184,7 @@ namespace cv { namespace gpu { namespace device
 #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
 #undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR

-    template <typename T> struct hypot_sqr_func : public binary_function<T, T, T>
+    template <typename T> struct hypot_sqr_func : binary_function<T, T, T>
    {
        __forceinline__ __device__ T operator ()(T src1, T src2) const
        {
@@ -192,15 +192,15 @@ namespace cv { namespace gpu { namespace device
        }
    };

-    template <typename T, typename D> struct saturate_cast_func : public unary_function<T, D>
+    template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
    {
-        __forceinline__ __device__ D operator ()(const T& v)
+        __forceinline__ __device__ D operator ()(const T& v) const
        {
            return saturate_cast<D>(v);
        }
    };

-    template <typename T> struct thresh_binary_func : public unary_function<T, T>
+    template <typename T> struct thresh_binary_func : unary_function<T, T>
    {
        __forceinline__ __host__ __device__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
@@ -209,10 +209,10 @@ namespace cv { namespace gpu { namespace device
            return src > thresh ? maxVal : 0;
        }

-        T thresh;
-        T maxVal;
+        const T thresh;
+        const T maxVal;
    };
-    template <typename T> struct thresh_binary_inv_func : public unary_function<T, T>
+    template <typename T> struct thresh_binary_inv_func : unary_function<T, T>
    {
        __forceinline__ __host__ __device__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
@@ -221,10 +221,10 @@ namespace cv { namespace gpu { namespace device
            return src > thresh ? 0 : maxVal;
        }

-        T thresh;
-        T maxVal;
+        const T thresh;
+        const T maxVal;
    };
-    template <typename T> struct thresh_trunc_func : public unary_function<T, T>
+    template <typename T> struct thresh_trunc_func : unary_function<T, T>
    {
        explicit __forceinline__ __host__ __device__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
@@ -233,11 +233,10 @@ namespace cv { namespace gpu { namespace device
            return minimum<T>()(src, thresh);
        }

-        T thresh;
+        const T thresh;
    };
-    template <typename T> struct thresh_to_zero_func : public unary_function<T, T>
+    template <typename T> struct thresh_to_zero_func : unary_function<T, T>
    {
-    public:
        explicit __forceinline__ __host__ __device__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}

        __forceinline__ __device__ T operator()(const T& src) const
@@ -245,11 +244,10 @@ namespace cv { namespace gpu { namespace device
            return src > thresh ? src : 0;
        }

-        T thresh;
+        const T thresh;
    };
-    template <typename T> struct thresh_to_zero_inv_func : public unary_function<T, T>
+    template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>
    {
-    public:
        explicit __forceinline__ __host__ __device__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}

        __forceinline__ __device__ T operator()(const T& src) const
@@ -257,36 +255,36 @@ namespace cv { namespace gpu { namespace device
            return src > thresh ? 0 : src;
        }

-        T thresh;
+        const T thresh;
    };

-    template <typename Op> struct binder1st : public unary_function<typename Op::second_argument_type, typename Op::result_type>
+    template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
    {
        __forceinline__ __host__ __device__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}

-        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::second_argument_type& a)
+        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::second_argument_type& a) const
        {
            return op(arg1, a);
        }

-        Op op;
-        typename Op::first_argument_type arg1;
+        const Op op;
+        const typename Op::first_argument_type arg1;
    };
    template <typename Op, typename T> static __forceinline__ __host__ __device__ binder1st<Op> bind1st(const Op& op, const T& x)
    {
        return binder1st<Op>(op, typename Op::first_argument_type(x));
    }

-    template <typename Op> struct binder2nd : public unary_function<typename Op::first_argument_type, typename Op::result_type>
+    template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
    {
        __forceinline__ __host__ __device__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}

-        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::first_argument_type& a)
+        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::first_argument_type& a) const
        {
            return op(a, arg2);
        }

-        Op op;
-        typename Op::second_argument_type arg2;
+        const Op op;
+        const typename Op::second_argument_type arg2;
    };
    template <typename Op, typename T> static __forceinline__ __host__ __device__ binder2nd<Op> bind2nd(const Op& op, const T& x)
    {
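binder1st/binder2nd mirror their std:: namesakes but are __device__-callable, so a scalar operand can be baked into an existing binary functor. A small sketch, assuming thrust::plus is available through the thrust functional header this file builds on:

    using namespace cv::gpu;
    using namespace cv::gpu::device;

    void addScalarSketch(const DevMem2D_<int>& src, const DevMem2D_<int>& dst, cudaStream_t stream)
    {
        // dst = src + 5, without writing a dedicated functor
        transform(src, dst, bind2nd(thrust::plus<int>(), 5), stream);
    }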
diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp
index f2e447269e..4f756e36da 100644
--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
@@ -48,12 +48,12 @@ namespace cv { namespace gpu { namespace device
 {
    template <typename T, typename D, typename UnOp>
-    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream = 0)
+    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)
    {
        detail::transform_caller(src, dst, op, detail::NoMask(), stream);
    }
    template <typename T, typename D, typename UnOp>
-    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStep& mask, UnOp op,
+    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStep& mask, const UnOp& op,
        cudaStream_t stream = 0)
    {
        detail::transform_caller(src, dst, op, detail::MaskReader(mask), stream);
@@ -61,13 +61,13 @@ namespace cv { namespace gpu { namespace device

    template <typename T1, typename T2, typename D, typename BinOp>
    static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst,
-        BinOp op, cudaStream_t stream = 0)
+        const BinOp& op, cudaStream_t stream = 0)
    {
        detail::transform_caller(src1, src2, dst, op, detail::NoMask(), stream);
    }
    template <typename T1, typename T2, typename D, typename BinOp>
    static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst,
-        const PtrStep& mask, BinOp op, cudaStream_t stream = 0)
+        const PtrStep& mask, const BinOp& op, cudaStream_t stream = 0)
    {
        detail::transform_caller(src1, src2, dst, op, detail::MaskReader(mask), stream);
    }
diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp
index a3b4dafa94..b0dca8afeb 100644
--- a/modules/gpu/src/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp
@@ -65,15 +65,15 @@ namespace cv { namespace gpu { namespace device
 {
-    template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b)
+    template <typename T> void __host__ __device__ __forceinline__ swap(T& a, T& b)
    {
-        T temp = a;
+        const T temp = a;
        a = b;
        b = temp;
    }

    // warp-synchronous 32 elements reduction
-    template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, Op op)
+    template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        data[tid] = partial_reduction;

@@ -88,7 +88,7 @@ namespace cv { namespace gpu { namespace device
    }

    // warp-synchronous 16 elements reduction
-    template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, Op op)
+    template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        data[tid] = partial_reduction;

@@ -102,7 +102,7 @@ namespace cv { namespace gpu { namespace device
    }

    // warp-synchronous reduction
-    template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, T& partial_reduction, int tid, Op op)
+    template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        if (tid < n)
            data[tid] = partial_reduction;
diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp
index e4c06ac92c..9a4e6ec91c 100644
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@@ -107,9 +107,9 @@ namespace
        maxCandidates = min(static_cast<int>(1.5 * maxFeatures), 65535);

        CV_Assert(maxFeatures > 0);
-
-        cudaSafeCall( cudaMalloc((void**)&d_counters, (nOctaves + 1) * sizeof(unsigned int)) );
-        cudaSafeCall( cudaMemset(d_counters, 0, (nOctaves + 1) * sizeof(unsigned int)) );
+
+        counters.create(1, nOctaves + 1, CV_32SC1);
+        counters.setTo(Scalar::all(0));

        uploadConstant("cv::gpu::surf::c_max_candidates", maxCandidates);
        uploadConstant("cv::gpu::surf::c_max_features", maxFeatures);
@@ -118,30 +118,20 @@ namespace
        uploadConstant("cv::gpu::surf::c_nOctaveLayers", nOctaveLayers);
        uploadConstant("cv::gpu::surf::c_hessianThreshold", static_cast<float>(hessianThreshold));

-        bindTexture("cv::gpu::surf::imgTex", (DevMem2D)img);
+        imgTex.bind("cv::gpu::surf::imgTex", (DevMem2D)img);

        integralBuffered(img, sum, intBuffer);
-        bindTexture("cv::gpu::surf::sumTex", (DevMem2D_<unsigned int>)sum);
+        sumTex.bind("cv::gpu::surf::sumTex", (DevMem2D_<unsigned int>)sum);

        if (use_mask)
        {
            min(mask, 1.0, mask1);
            integralBuffered(mask1, maskSum, intBuffer);
-            bindTexture("cv::gpu::surf::maskSumTex", (DevMem2D_<unsigned int>)maskSum);
+            maskSumTex.bind("cv::gpu::surf::maskSumTex", (DevMem2D_<unsigned int>)maskSum);
        }
    }

-    ~SURF_GPU_Invoker()
-    {
-        cudaSafeCall( cudaFree(d_counters) );
-
-        unbindTexture("cv::gpu::surf::imgTex");
-        unbindTexture("cv::gpu::surf::sumTex");
-        if (use_mask)
-            unbindTexture("cv::gpu::surf::maskSumTex");
-    }
-
    void detectKeypoints(GpuMat& keypoints)
    {
        ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, det);
@@ -162,11 +152,11 @@ namespace

            icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers);

-            icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr<int4>(), d_counters + 1 + octave,
+            icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
                img_rows, img_cols, octave, use_mask, nOctaveLayers);

            unsigned int maxCounter;
-            cudaSafeCall( cudaMemcpy(&maxCounter, d_counters + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr<unsigned int>() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
            maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));

            if (maxCounter > 0)
@@ -174,11 +164,11 @@ namespace
                icvInterpolateKeypoint_gpu(det, maxPosBuffer.ptr<int4>(), maxCounter,
                    keypoints.ptr<float>(SURF_GPU::SF_X), keypoints.ptr<float>(SURF_GPU::SF_Y),
                    keypoints.ptr<int>(SURF_GPU::SF_LAPLACIAN), keypoints.ptr<float>(SURF_GPU::SF_SIZE),
-                    keypoints.ptr<float>(SURF_GPU::SF_HESSIAN), d_counters);
+                    keypoints.ptr<float>(SURF_GPU::SF_HESSIAN), counters.ptr<unsigned int>());
            }
        }
        unsigned int featureCounter;
-        cudaSafeCall( cudaMemcpy(&featureCounter, d_counters, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+        cudaSafeCall( cudaMemcpy(&featureCounter, counters.ptr<unsigned int>(), sizeof(unsigned int), cudaMemcpyDeviceToHost) );
        featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));

        keypoints.cols = featureCounter;
@@ -226,7 +216,9 @@ namespace
        int maxCandidates;
        int maxFeatures;

-        unsigned int* d_counters;
+        GpuMat counters;
+
+        TextureBinder imgTex, sumTex, maskSumTex;
    };
 }
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index ef2e2b7a68..1b2cbbc30b 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -318,7 +318,7 @@ TEST(BruteForceMatcher)
    GPU_OFF;

    SUBTEST << "knnMatch";
-    int knn = 10;
+    int knn = 2;

    CPU_ON;
    matcher.knnMatch(query, train, matches, knn);
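Dropping the benchmark from k = 10 to k = 2 matches how knnMatch is used in practice: two neighbours are exactly what the Lowe ratio test needs. A sketch of that use (0.8 is a conventional threshold, an assumption of this sketch rather than anything the sample fixes):

    std::vector< std::vector<cv::DMatch> > knnMatches;
    matcher.knnMatch(query, train, knnMatches, 2);

    std::vector<cv::DMatch> good;
    for (size_t i = 0; i < knnMatches.size(); ++i)
    {
        if (knnMatches[i].size() == 2 &&
            knnMatches[i][0].distance < 0.8f * knnMatches[i][1].distance)
            good.push_back(knnMatches[i][0]);   // best match passes the ratio test
    }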