diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 269e11b635..53e118ad71 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -361,13 +361,13 @@ size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const
     return deviceProps.get(device_id_)->sharedMemPerBlock;
 }
 
-void cv::gpu::DeviceInfo::queryMemory(size_t& totalMemory, size_t& freeMemory) const
+void cv::gpu::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
 {
     int prevDeviceID = getDevice();
     if (prevDeviceID != device_id_)
         setDevice(device_id_);
 
-    cudaSafeCall( cudaMemGetInfo(&freeMemory, &totalMemory) );
+    cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
 
     if (prevDeviceID != device_id_)
         setDevice(prevDeviceID);
@@ -375,16 +375,16 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& totalMemory, size_t& freeMemory) c
 
 size_t cv::gpu::DeviceInfo::freeMemory() const
 {
-    size_t totalMemory, freeMemory;
-    queryMemory(totalMemory, freeMemory);
-    return freeMemory;
+    size_t _totalMemory, _freeMemory;
+    queryMemory(_totalMemory, _freeMemory);
+    return _freeMemory;
 }
 
 size_t cv::gpu::DeviceInfo::totalMemory() const
 {
-    size_t totalMemory, freeMemory;
-    queryMemory(totalMemory, freeMemory);
-    return totalMemory;
+    size_t _totalMemory, _freeMemory;
+    queryMemory(_totalMemory, _freeMemory);
+    return _totalMemory;
 }
 
 bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const
diff --git a/modules/gpu/doc/feature_detection_and_description.rst b/modules/gpu/doc/feature_detection_and_description.rst
index aafc35a50d..ec75248040 100644
--- a/modules/gpu/doc/feature_detection_and_description.rst
+++ b/modules/gpu/doc/feature_detection_and_description.rst
@@ -640,4 +640,3 @@ Converts matrices obtained via :ocv:func:`gpu::BruteForceMatcher_GPU_base::radiu
 .. ocv:function:: void gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches, std::vector< std::vector<DMatch> >& matches, bool compactResult = false)
 
 If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
-
diff --git a/modules/gpu/doc/object_detection.rst b/modules/gpu/doc/object_detection.rst
index 20e301300e..133660236a 100644
--- a/modules/gpu/doc/object_detection.rst
+++ b/modules/gpu/doc/object_detection.rst
@@ -271,7 +271,9 @@ gpu::CascadeClassifier_GPU::detectMultiScale
 ------------------------------------------------
 Detects objects of different sizes in the input image.
 
-.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.1, int minNeighbors=4, Size minSize=Size() )
+.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size())
+
+.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4)
 
     :param image: Matrix of type  ``CV_8U``  containing an image where objects should be detected.
 
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 7cc57e49af..afbe067a3e 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -804,31 +804,24 @@ private:
     GpuMat lab, l, ab;
 };
 
-
-struct CV_EXPORTS CannyBuf;
-
-CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-
 struct CV_EXPORTS CannyBuf
 {
-    CannyBuf() {}
-    explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}
-    CannyBuf(const GpuMat& dx_, const GpuMat& dy_);
-
     void create(const Size& image_size, int apperture_size = 3);
-
     void release();
 
     GpuMat dx, dy;
-    GpuMat dx_buf, dy_buf;
-    GpuMat edgeBuf;
-    GpuMat trackBuf1, trackBuf2;
+    GpuMat mag;
+    GpuMat map;
+    GpuMat st1, st2;
+    GpuMat unused;
     Ptr<FilterEngine_GPU> filterDX, filterDY;
 };
 
+CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+
 class CV_EXPORTS ImagePyramid
 {
 public:
@@ -1504,6 +1497,12 @@ public:
     explicit BruteForceMatcher_GPU(Hamming /*d*/) : BruteForceMatcher_GPU_base(HammingDist) {}
 };
 
+class CV_EXPORTS BFMatcher_GPU : public BruteForceMatcher_GPU_base // brute-force matcher configured by a NORM_* code instead of a DistType
+{
+public:
+    explicit BFMatcher_GPU(int norm = NORM_L2) : BruteForceMatcher_GPU_base(norm == NORM_L1 ? L1Dist : norm == NORM_L2 ? L2Dist : HammingDist) {} // NORM_L1 -> L1Dist, NORM_L2 -> L2Dist, every other value -> HammingDist
+};
+
 ////////////////////////////////// CascadeClassifier_GPU //////////////////////////////////////////
 // The cascade classifier class for object detection: supports old haar and new lbp xlm formats and nvbin for haar cascades olny.
 class CV_EXPORTS CascadeClassifier_GPU
@@ -1518,7 +1517,8 @@ public:
     void release();
 
     /* returns number of detected objects */
-    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4, Size minSize = Size());
+    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
+    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
 
     bool findLargestObject;
     bool visualizeInPlace;
@@ -1526,7 +1526,6 @@ public:
     Size getClassifierSize() const;
 
 private:
-
     struct CascadeClassifierImpl;
     CascadeClassifierImpl* impl;
     struct HaarCascade;
@@ -1858,64 +1857,33 @@ inline GoodFeaturesToTrackDetector_GPU::GoodFeaturesToTrackDetector_GPU(int maxC
 class CV_EXPORTS PyrLKOpticalFlow
 {
 public:
-    PyrLKOpticalFlow()
-    {
-        winSize = Size(21, 21);
-        maxLevel = 3;
-        iters = 30;
-        derivLambda = 0.5;
-        useInitialFlow = false;
-        minEigThreshold = 1e-4f;
-        getMinEigenVals = false;
-        isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
-    }
+    PyrLKOpticalFlow();
 
     void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
         GpuMat& status, GpuMat* err = 0);
 
     void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
 
+    void releaseMemory();
+
     Size winSize;
     int maxLevel;
     int iters;
-    double derivLambda;
+    double derivLambda; //unused
     bool useInitialFlow;
-    float minEigThreshold;
-    bool getMinEigenVals;
-
-    void releaseMemory()
-    {
-        dx_calcBuf_.release();
-        dy_calcBuf_.release();
-
-        prevPyr_.clear();
-        nextPyr_.clear();
-
-        dx_buf_.release();
-        dy_buf_.release();
-
-        uPyr_.clear();
-        vPyr_.clear();
-    }
+    float minEigThreshold; //unused
+    bool getMinEigenVals;  //unused
 
 private:
-    void calcSharrDeriv(const GpuMat& src, GpuMat& dx, GpuMat& dy);
-
-    void buildImagePyramid(const GpuMat& img0, vector<GpuMat>& pyr, bool withBorder);
-
-    GpuMat dx_calcBuf_;
-    GpuMat dy_calcBuf_;
-
+    GpuMat uPyr_[2];
     vector<GpuMat> prevPyr_;
     vector<GpuMat> nextPyr_;
+    GpuMat vPyr_[2];
+    vector<GpuMat> unused1;
+    vector<GpuMat> unused2;
+    bool unused3;
 
-    GpuMat dx_buf_;
-    GpuMat dy_buf_;
-
-    vector<GpuMat> uPyr_;
-    vector<GpuMat> vPyr_;
-
-    bool isDeviceArch11_;
+    GpuMat buf_;
 };
 
 
diff --git a/modules/gpu/perf/perf_features2d.cpp b/modules/gpu/perf/perf_features2d.cpp
index 7c966af302..a93cef9b33 100644
--- a/modules/gpu/perf/perf_features2d.cpp
+++ b/modules/gpu/perf/perf_features2d.cpp
@@ -161,8 +161,7 @@ PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(Values(64, 128, 256), Val
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::BruteForceMatcher_GPU_base d_matcher(
-            cv::gpu::BruteForceMatcher_GPU_base::DistType((normType -2) / 2));
+        cv::gpu::BFMatcher_GPU d_matcher(normType);
 
         cv::gpu::GpuMat d_query(query);
         cv::gpu::GpuMat d_train(train);
@@ -221,8 +220,7 @@ PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::BruteForceMatcher_GPU_base d_matcher(
-            cv::gpu::BruteForceMatcher_GPU_base::DistType((normType -2) / 2));
+        cv::gpu::BFMatcher_GPU d_matcher(normType);
 
         cv::gpu::GpuMat d_query(query);
         cv::gpu::GpuMat d_train(train);
@@ -275,8 +273,7 @@ PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(Values(64, 128, 256
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::BruteForceMatcher_GPU_base d_matcher(
-            cv::gpu::BruteForceMatcher_GPU_base::DistType((normType -2) / 2));
+        cv::gpu::BFMatcher_GPU d_matcher(normType);
 
         cv::gpu::GpuMat d_query(query);
         cv::gpu::GpuMat d_train(train);
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index cfaa753114..3603933979 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -58,6 +58,7 @@ bool cv::gpu::CascadeClassifier_GPU::load(const string&)              { throw_no
 Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const        { throw_nogpu(); return Size();}
 void cv::gpu::CascadeClassifier_GPU::release()                        { throw_nogpu(); }
 int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, double, int, Size)       {throw_nogpu(); return -1;}
+int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, Size, Size, double, int) {throw_nogpu(); return -1;}
 
 #else
 
@@ -682,6 +683,12 @@ int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMa
     return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, cv::Size());
 }
 
+int cv::gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize, double scaleFactor, int minNeighbors)
+{
+    CV_Assert( !this->empty()); // fails the assertion when empty() reports true; returns impl->process()'s result otherwise
+    return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, maxObjectSize); // maxObjectSize is forwarded as the last argument
+}
+
 bool cv::gpu::CascadeClassifier_GPU::load(const string& filename)
 {
     release();
@@ -771,6 +778,8 @@ NCVStatus loadFromXML(const std::string &filename,
     haar.bNeedsTiltedII = false;
     Ncv32u curMaxTreeDepth;
 
+    std::vector<char> xmlFileCont;
+
     std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
     haarStages.resize(0);
     haarClassifierNodes.resize(0);
diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp
index fecb717cd9..09cf01850e 100644
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@@ -121,9 +121,7 @@ void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, f
     buf.accum.setTo(Scalar::all(0));
 
     DeviceInfo devInfo;
-    cudaDeviceProp prop;
-    cudaSafeCall(cudaGetDeviceProperties(&prop, devInfo.deviceID()));
-    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, prop.sharedMemPerBlock, devInfo.supports(FEATURE_SET_COMPUTE_20));
+    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
 
     ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);
 
@@ -196,9 +194,7 @@ void cv::gpu::HoughLinesP(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf,
     buf.accum.setTo(Scalar::all(0));
 
     DeviceInfo devInfo;
-    cudaDeviceProp prop;
-    cudaSafeCall(cudaGetDeviceProperties(&prop, devInfo.deviceID()));
-    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, prop.sharedMemPerBlock, devInfo.supports(FEATURE_SET_COMPUTE_20));
+    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
 
     ensureSizeIsEnough(1, maxLines, CV_32SC4, lines);
 
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index 3e992b59c9..24f015ba7c 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -91,7 +91,6 @@ void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_n
 void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, double, bool) { throw_nogpu(); }
-cv::gpu::CannyBuf::CannyBuf(const GpuMat&, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::CannyBuf::create(const Size&, int) { throw_nogpu(); }
 void cv::gpu::CannyBuf::release() { throw_nogpu(); }
 
@@ -1429,12 +1428,6 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
 //////////////////////////////////////////////////////////////////////////////
 // Canny
 
-cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, const GpuMat& dy_)
-{
-    (void) dx_;
-    (void) dy_;
-}
-
 void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
 {
     if (apperture_size > 0)
@@ -1449,22 +1442,21 @@ void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
         }
     }
 
-    ensureSizeIsEnough(image_size, CV_32FC1, edgeBuf);
-    ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);
+    ensureSizeIsEnough(image_size, CV_32FC1, mag);
+    ensureSizeIsEnough(image_size, CV_32SC1, map);
 
-    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf1);
-    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf2);
+    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1);
+    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2);
 }
 
 void cv::gpu::CannyBuf::release()
 {
     dx.release();
     dy.release();
-    dx_buf.release();
-    dy_buf.release();
-    edgeBuf.release();
-    trackBuf1.release();
-    trackBuf2.release();
+    mag.release();
+    map.release();
+    st1.release();
+    st2.release();
 }
 
 namespace canny
@@ -1487,13 +1479,14 @@ namespace
     {
         using namespace canny;
 
-        calcMap(dx, dy, buf.edgeBuf, buf.dx_buf, low_thresh, high_thresh);
+        buf.map.setTo(Scalar::all(0));
+        calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh);
 
-        edgesHysteresisLocal(buf.dx_buf, buf.trackBuf1.ptr<ushort2>());
+        edgesHysteresisLocal(buf.map, buf.st1.ptr<ushort2>());
 
-        edgesHysteresisGlobal(buf.dx_buf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>());
+        edgesHysteresisGlobal(buf.map, buf.st1.ptr<ushort2>(), buf.st2.ptr<ushort2>());
 
-        getEdges(buf.dx_buf, dst);
+        getEdges(buf.map, dst);
     }
 }
 
@@ -1525,14 +1518,14 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th
         src.locateROI(wholeSize, ofs);
         GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step);
 
-        calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.edgeBuf, L2gradient);
+        calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.mag, L2gradient);
     }
     else
     {
         buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
         buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
 
-        calcMagnitude(buf.dx, buf.dy, buf.edgeBuf, L2gradient);
+        calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
     }
 
     CannyCaller(buf.dx, buf.dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
@@ -1557,7 +1550,7 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& d
     dst.create(dx.size(), CV_8U);
     buf.create(dx.size(), -1);
 
-    calcMagnitude(dx, dy, buf.edgeBuf, L2gradient);
+    calcMagnitude(dx, dy, buf.mag, L2gradient);
 
     CannyCaller(dx, dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
 }
diff --git a/modules/gpu/src/pyrlk.cpp b/modules/gpu/src/pyrlk.cpp
index d94341deea..49a6c5a88c 100644
--- a/modules/gpu/src/pyrlk.cpp
+++ b/modules/gpu/src/pyrlk.cpp
@@ -48,8 +48,10 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
+cv::gpu::PyrLKOpticalFlow::PyrLKOpticalFlow() { throw_nogpu(); }
 void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_nogpu(); }
 void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_nogpu(); }
+void cv::gpu::PyrLKOpticalFlow::releaseMemory() {}
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -66,6 +68,14 @@ namespace pyrlk
                PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
 }
 
+// Default constructor. The fields the header marks as unused (derivLambda, minEigThreshold,
+// getMinEigenVals, unused3) are still initialised so reading or copying them is well defined.
+cv::gpu::PyrLKOpticalFlow::PyrLKOpticalFlow()
+    : winSize(21, 21), maxLevel(3), iters(30), derivLambda(0.5), useInitialFlow(false),
+      minEigThreshold(1e-4f), getMinEigenVals(false), unused3(false)
+{
+}
+
 namespace
 {
     void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch)
@@ -137,11 +147,11 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
     }
     else
     {
-        cvtColor(prevImg, dx_calcBuf_, COLOR_BGR2BGRA);
-        dx_calcBuf_.convertTo(prevPyr_[0], CV_32F);
+        cvtColor(prevImg, buf_, COLOR_BGR2BGRA);
+        buf_.convertTo(prevPyr_[0], CV_32F);
 
-        cvtColor(nextImg, dx_calcBuf_, COLOR_BGR2BGRA);
-        dx_calcBuf_.convertTo(nextPyr_[0], CV_32F);
+        cvtColor(nextImg, buf_, COLOR_BGR2BGRA);
+        buf_.convertTo(nextPyr_[0], CV_32F);
     }
 
     for (int level = 1; level <= maxLevel; ++level)
@@ -193,9 +203,6 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
         pyrDown(nextPyr_[level - 1], nextPyr_[level]);
     }
 
-    uPyr_.resize(2);
-    vPyr_.resize(2);
-
     ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
     ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
     ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
@@ -225,4 +232,18 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
     vPyr_[idx].copyTo(v);
 }
 
+// Releases the cached image pyramids, the conversion buffer and both u/v flow buffers.
+void cv::gpu::PyrLKOpticalFlow::releaseMemory()
+{
+    buf_.release();
+    prevPyr_.clear();
+    nextPyr_.clear();
+
+    for (int i = 0; i < 2; ++i)
+    {
+        uPyr_[i].release();
+        vPyr_[i].release();
+    }
+}
+
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/test/test_features2d.cpp b/modules/gpu/test/test_features2d.cpp
index 1fa4fe7255..3879ac0534 100644
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@@ -583,8 +583,7 @@ PARAM_TEST_CASE(BruteForceMatcher, cv::gpu::DeviceInfo, NormCode, DescriptorSize
 
 GPU_TEST_P(BruteForceMatcher, Match_Single)
 {
-    cv::gpu::BruteForceMatcher_GPU_base matcher(
-                cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
+    cv::gpu::BFMatcher_GPU matcher(normCode);
 
     cv::gpu::GpuMat mask;
     if (useMask)
@@ -611,8 +610,7 @@ GPU_TEST_P(BruteForceMatcher, Match_Single)
 
 GPU_TEST_P(BruteForceMatcher, Match_Collection)
 {
-    cv::gpu::BruteForceMatcher_GPU_base matcher(
-                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
+    cv::gpu::BFMatcher_GPU matcher(normCode);
 
     cv::gpu::GpuMat d_train(train);
 
@@ -666,8 +664,7 @@ GPU_TEST_P(BruteForceMatcher, Match_Collection)
 
 GPU_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
 {
-    cv::gpu::BruteForceMatcher_GPU_base matcher(
-                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
+    cv::gpu::BFMatcher_GPU matcher(normCode);
 
     const int knn = 2;
 
@@ -706,8 +703,7 @@ GPU_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
 
 GPU_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
 {
-    cv::gpu::BruteForceMatcher_GPU_base matcher(
-                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
+    cv::gpu::BFMatcher_GPU matcher(normCode);
 
     const int knn = 3;
 
@@ -746,8 +742,7 @@ GPU_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
 
 GPU_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
 {
-    cv::gpu::BruteForceMatcher_GPU_base matcher(
-                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
+    cv::gpu::BFMatcher_GPU matcher(normCode);
 
     const int knn = 2;
 
@@ -809,8 +804,7 @@ GPU_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
 
 GPU_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
 {
-    cv::gpu::BruteForceMatcher_GPU_base matcher(
-                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
+    cv::gpu::BFMatcher_GPU matcher(normCode);
 
     const int knn = 3;
 
@@ -872,8 +866,7 @@ GPU_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
 
 GPU_TEST_P(BruteForceMatcher, RadiusMatch_Single)
 {
-    cv::gpu::BruteForceMatcher_GPU_base matcher(
-                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
+    cv::gpu::BFMatcher_GPU matcher(normCode);
 
     const float radius = 1.f / countFactor;
 
@@ -922,8 +915,7 @@ GPU_TEST_P(BruteForceMatcher, RadiusMatch_Single)
 
 GPU_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
 {
-    cv::gpu::BruteForceMatcher_GPU_base matcher(
-                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
+    cv::gpu::BFMatcher_GPU matcher(normCode);
 
     const int n = 3;
     const float radius = 1.f / countFactor * n;
diff --git a/modules/gpu/test/test_gpumat.cpp b/modules/gpu/test/test_gpumat.cpp
index fc57512e94..9ece87caa3 100644
--- a/modules/gpu/test/test_gpumat.cpp
+++ b/modules/gpu/test/test_gpumat.cpp
@@ -322,4 +322,38 @@ INSTANTIATE_TEST_CASE_P(GPU_GpuMat, ConvertTo, testing::Combine(
     ALL_DEPTH,
     WHOLE_SUBMAT));
 
+////////////////////////////////////////////////////////////////////////////////
+// ensureSizeIsEnough
+
+struct EnsureSizeIsEnough : testing::TestWithParam<cv::gpu::DeviceInfo>
+{
+    virtual void SetUp()
+    {
+        cv::gpu::DeviceInfo devInfo = GetParam();
+        cv::gpu::setDevice(devInfo.deviceID()); // make the device passed as the test parameter current
+    }
+};
+
+GPU_TEST_P(EnsureSizeIsEnough, BufferReuse)
+{
+    cv::gpu::GpuMat buffer(100, 100, CV_8U);
+    cv::gpu::GpuMat old = buffer; // keeps the original data pointer for the comparisons below
+
+    // 10x20 fits inside the initial 100x100 buffer: size changes, data pointer must stay the same
+    cv::gpu::ensureSizeIsEnough(10, 20, CV_8U, buffer);
+    EXPECT_EQ(10, buffer.rows);
+    EXPECT_EQ(20, buffer.cols);
+    EXPECT_EQ(CV_8UC1, buffer.type());
+    EXPECT_EQ(reinterpret_cast<intptr_t>(old.data), reinterpret_cast<intptr_t>(buffer.data));
+
+    // 20x30 exceeds the current 10x20 size but not the initial 100x100: still no reallocation expected
+    cv::gpu::ensureSizeIsEnough(20, 30, CV_8U, buffer);
+    EXPECT_EQ(20, buffer.rows);
+    EXPECT_EQ(30, buffer.cols);
+    EXPECT_EQ(CV_8UC1, buffer.type());
+    EXPECT_EQ(reinterpret_cast<intptr_t>(old.data), reinterpret_cast<intptr_t>(buffer.data));
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_GpuMat, EnsureSizeIsEnough, ALL_DEVICES);
+
 #endif // HAVE_CUDA
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index 9abfd6b9d3..dc6b5fa4db 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -219,7 +219,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
     descriptors1_.upload(features1.descriptors);
     descriptors2_.upload(features2.descriptors);
 
-    BruteForceMatcher_GPU_base matcher(BruteForceMatcher_GPU_base::L2Dist);
+    BFMatcher_GPU matcher(NORM_L2);
     MatchesSet matches;
 
     // Find 1->2 matches
diff --git a/samples/gpu/morfology.cpp b/samples/gpu/morphology.cpp
similarity index 100%
rename from samples/gpu/morfology.cpp
rename to samples/gpu/morphology.cpp
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index bc46353e8e..b35102d861 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -364,7 +364,7 @@ TEST(BruteForceMatcher)
 
     // Init GPU matcher
 
-    gpu::BruteForceMatcher_GPU_base d_matcher(gpu::BruteForceMatcher_GPU_base::L2Dist);
+    gpu::BFMatcher_GPU d_matcher(NORM_L2);
 
     gpu::GpuMat d_query(query);
     gpu::GpuMat d_train(train);
diff --git a/samples/gpu/surf_keypoint_matcher.cpp b/samples/gpu/surf_keypoint_matcher.cpp
index 76a3e6d377..617cda52bd 100644
--- a/samples/gpu/surf_keypoint_matcher.cpp
+++ b/samples/gpu/surf_keypoint_matcher.cpp
@@ -57,7 +57,7 @@ int main(int argc, char* argv[])
     cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
 
     // matching descriptors
-    gpu::BruteForceMatcher_GPU_base matcher(gpu::BruteForceMatcher_GPU_base::L2Dist);
+    BFMatcher_GPU matcher(NORM_L2);
     GpuMat trainIdx, distance;
     matcher.matchSingle(descriptors1GPU, descriptors2GPU, trainIdx, distance);
 
@@ -69,7 +69,7 @@ int main(int argc, char* argv[])
     surf.downloadKeypoints(keypoints2GPU, keypoints2);
     surf.downloadDescriptors(descriptors1GPU, descriptors1);
     surf.downloadDescriptors(descriptors2GPU, descriptors2);
-    BruteForceMatcher_GPU_base::matchDownload(trainIdx, distance, matches);
+    BFMatcher_GPU::matchDownload(trainIdx, distance, matches);
 
     // drawing the results
     Mat img_matches;