From de56163f9734b35550dafe6c0366635b49e277b8 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Tue, 30 Apr 2013 17:27:06 +0400
Subject: [PATCH] refactored gpu::matchTemplate (converted it into Algorithm)

---
 .../gpuimgproc/include/opencv2/gpuimgproc.hpp |  24 +-
 .../gpuimgproc/perf/perf_match_template.cpp   |   8 +-
 modules/gpuimgproc/src/match_template.cpp     | 472 +++++++++++++-----
 .../gpuimgproc/test/test_match_template.cpp   |  24 +-
 samples/gpu/performance/tests.cpp             |  16 +-
 5 files changed, 390 insertions(+), 154 deletions(-)
diff --git a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
index 24b8b64f0a..886e10cabc 100644
--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@@ -424,20 +424,24 @@ CV_EXPORTS void meanShiftSegmentation(InputArray src, OutputArray dst, int sp, i
 
 /////////////////////////// Match Template ////////////////////////////
 
-struct CV_EXPORTS MatchTemplateBuf
+//! Interface that computes the proximity map for a raster template and the image it is searched in; instances come from createTemplateMatching().
+class CV_EXPORTS TemplateMatching : public Algorithm
 {
-    Size user_block_size;
-    GpuMat imagef, templf;
-    std::vector<GpuMat> images;
-    std::vector<GpuMat> image_sums;
-    std::vector<GpuMat> image_sqsums;
+public:
+    virtual void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null()) = 0;
 };
 
-//! computes the proximity map for the raster template and the image where the template is searched for
-CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream &stream = Stream::Null());
+CV_EXPORTS Ptr<TemplateMatching> createTemplateMatching(int srcType, int method, Size user_block_size = Size());
 
-//! computes the proximity map for the raster template and the image where the template is searched for
-CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, MatchTemplateBuf &buf, Stream& stream = Stream::Null());
+// Obsolete free-function interface; the inline definition below forwards to TemplateMatching.
+
+__OPENCV_GPUIMGPROC_DEPR_BEFORE__ void matchTemplate(InputArray image, InputArray templ, OutputArray result,
+                                                     int method, Stream& stream = Stream::Null()) __OPENCV_GPUIMGPROC_DEPR_AFTER__;
+// Each call creates a new TemplateMatching object from image.type() and method, then runs match() on it.
+inline void matchTemplate(InputArray image, InputArray templ, OutputArray result, int method, Stream& stream)
+{
+    gpu::createTemplateMatching(image.type(), method)->match(image, templ, result, stream);
+}
 
 ////////////////////////// Bilateral Filter ///////////////////////////
 
diff --git a/modules/gpuimgproc/perf/perf_match_template.cpp b/modules/gpuimgproc/perf/perf_match_template.cpp
index f3af149144..35f36596c6 100644
--- a/modules/gpuimgproc/perf/perf_match_template.cpp
+++ b/modules/gpuimgproc/perf/perf_match_template.cpp
@@ -76,7 +76,9 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, MatchTemplate8U,
         const cv::gpu::GpuMat d_templ(templ);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() cv::gpu::matchTemplate(d_image, d_templ, dst, method);
+        cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
+
+        TEST_CYCLE() alg->match(d_image, d_templ, dst);
 
         GPU_SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE);
     }
@@ -116,7 +118,9 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, MatchTemplate32F,
         const cv::gpu::GpuMat d_templ(templ);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() cv::gpu::matchTemplate(d_image, d_templ, dst, method);
+        cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
+
+        TEST_CYCLE() alg->match(d_image, d_templ, dst);
 
         GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
     }
diff --git a/modules/gpuimgproc/src/match_template.cpp b/modules/gpuimgproc/src/match_template.cpp
index 059d41ca9f..2b5d5cb1ca 100644
--- a/modules/gpuimgproc/src/match_template.cpp
+++ b/modules/gpuimgproc/src/match_template.cpp
@@ -47,7 +47,7 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_GPUARITHM) || defined (CUDA_DISABLER)
 
-void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+Ptr<gpu::TemplateMatching> cv::gpu::createTemplateMatching(int, int, Size) { throw_no_cuda(); return Ptr<gpu::TemplateMatching>(); } // stub for builds without HAVE_CUDA / HAVE_OPENCV_GPUARITHM or with CUDA_DISABLER
 
 #else
 
@@ -137,11 +137,8 @@ namespace cv { namespace gpu { namespace cudev
     }
 }}}
 
-using namespace ::cv::gpu::cudev::match_template;
-
 namespace
 {
-
     // Evaluates optimal template's area threshold. If
     // template's area is less  than the threshold, we use naive match
     // template version, otherwise FFT-based (if available)
@@ -149,135 +146,317 @@ namespace
     {
         switch (method)
         {
-        case cv::TM_CCORR:
+        case TM_CCORR:
             if (depth == CV_32F) return 250;
             if (depth == CV_8U) return 300;
             break;
-        case cv::TM_SQDIFF:
+
+        case TM_SQDIFF:
             if (depth == CV_8U) return 300;
             break;
         }
-        CV_Error(cv::Error::StsBadArg, "getTemplateThreshold: unsupported match template mode");
+
+        CV_Error(Error::StsBadArg, "unsupported match template mode");
         return 0;
     }
 
+    ///////////////////////////////////////////////////////////////
+    // CCORR_32F
+
+    class Match_CCORR_32F : public TemplateMatching
+    {
+    public:
+        explicit Match_CCORR_32F(Size user_block_size);
+        // Produces a CV_32FC1 result of size (image.rows - templ.rows + 1) x (image.cols - templ.cols + 1).
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
 
-    void matchTemplate_CCORR_32F(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    private:
+        Ptr<gpu::Convolution> conv_; // created once in the constructor, reused by match()
+        GpuMat result_; // receives the convolution output when image has more than one channel
+    };
+    // Builds the convolution object from the user-supplied block size.
+    Match_CCORR_32F::Match_CCORR_32F(Size user_block_size)
+    {
+        conv_ = gpu::createConvolution(user_block_size);
+    }
+    // Uses the naive kernel for templates below the area threshold and conv_ otherwise.
+    void Match_CCORR_32F::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& _stream)
     {
-        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-        if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_32F))
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+        // Preconditions: 32F depth, identical types, template not larger than the image.
+        CV_Assert( image.depth() == CV_32F );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
+        GpuMat result = _result.getGpuMat();
+        // Small templates: run the naive kernel and return early.
+        if (templ.size().area() < getTemplateThreshold(TM_CCORR, CV_32F))
         {
-            matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
+            matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), stream);
             return;
         }
 
-        Ptr<gpu::Convolution> conv = gpu::createConvolution(buf.user_block_size);
-
         if (image.channels() == 1)
         {
-            conv->convolve(image.reshape(1), templ.reshape(1), result, true, stream);
+            conv_->convolve(image.reshape(1), templ.reshape(1), result, true, _stream);
         }
         else
         {
-            GpuMat result_;
-            conv->convolve(image.reshape(1), templ.reshape(1), result_, true, stream);
-            extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
+            conv_->convolve(image.reshape(1), templ.reshape(1), result_, true, _stream);
+            extractFirstChannel_32F(result_, result, image.channels(), stream);
         }
     }
 
+    ///////////////////////////////////////////////////////////////
+    // CCORR_8U
 
-    void matchTemplate_CCORR_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_CCORR_8U : public TemplateMatching
     {
-        if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_8U))
+    public:
+        explicit Match_CCORR_8U(Size user_block_size) : match32F_(user_block_size)
         {
-            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+        }
+        // Either runs the 8U naive kernel or converts both inputs to 32F and delegates to match32F_.
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+
+    private:
+        GpuMat imagef_, templf_; // 32F conversions of image and templ
+        Match_CCORR_32F match32F_; // handles the converted data
+    };
+
+    void Match_CCORR_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+        // Preconditions: 8U depth, identical types, template not larger than the image.
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+        // Small templates are matched directly on the 8U data.
+        if (templ.size().area() < getTemplateThreshold(TM_CCORR, CV_8U))
+        {
+            _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
+            GpuMat result = _result.getGpuMat();
+
             matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
             return;
         }
 
-        image.convertTo(buf.imagef, CV_32F, stream);
-        templ.convertTo(buf.templf, CV_32F, stream);
+        image.convertTo(imagef_, CV_32F, stream);
+        templ.convertTo(templf_, CV_32F, stream);
 
-        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
+        match32F_.match(imagef_, templf_, _result, stream);
     }
 
+    ///////////////////////////////////////////////////////////////
+    // CCORR_NORMED_8U
 
-    void matchTemplate_CCORR_NORMED_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_CCORR_NORMED_8U : public TemplateMatching
     {
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+    public:
+        explicit Match_CCORR_NORMED_8U(Size user_block_size) : match_CCORR_(user_block_size)
+        {
+        }
+        // Runs the CCORR_8U match first, then passes the result through normalize_8U.
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+
+    private:
+        Match_CCORR_8U match_CCORR_; // produces the unnormalized result
+        GpuMat image_sqsums_; // output of gpu::sqrIntegral on the image
+        GpuMat intBuffer_; // extra buffer argument handed to gpu::sqrIntegral
+    };
+
+    void Match_CCORR_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
 
-        buf.image_sqsums.resize(1);
-        gpu::sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
 
-        unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ.reshape(1))[0];
-        normalize_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+        match_CCORR_.match(image, templ, _result, stream);
+        GpuMat result = _result.getGpuMat();
+        // Squared integral of the image and squared sum of the template are the normalization inputs.
+        gpu::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
+
+        unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ.reshape(1))[0];
+
+        normalize_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
     }
 
+    ///////////////////////////////////////////////////////////////
+    // SQDIFF_32F
 
-    void matchTemplate_SQDIFF_32F(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_SQDIFF_32F : public TemplateMatching
     {
-        (void)buf;
-        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+    public:
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+    };
+    // This class has no members; match() always calls matchTemplateNaive_SQDIFF_32F.
+    void Match_SQDIFF_32F::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+        // Preconditions: 32F depth, identical types, template not larger than the image.
+        CV_Assert( image.depth() == CV_32F );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
+        GpuMat result = _result.getGpuMat();
+
         matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
     }
 
+    ///////////////////////////////////////////////////////////////
+    // SQDIFF_8U
 
-    void matchTemplate_SQDIFF_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_SQDIFF_8U : public TemplateMatching
     {
-        if (templ.size().area() < getTemplateThreshold(cv::TM_SQDIFF, CV_8U))
+    public:
+        explicit Match_SQDIFF_8U(Size user_block_size) : match_CCORR_(user_block_size)
         {
-            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+        }
+        // Small templates use the naive 8U kernel; otherwise the CCORR_8U output goes to matchTemplatePrepared_SQDIFF_8U.
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+
+    private:
+        GpuMat image_sqsums_; // output of gpu::sqrIntegral on the image
+        GpuMat intBuffer_; // extra buffer argument handed to gpu::sqrIntegral
+        Match_CCORR_8U match_CCORR_; // fills the result before the SQDIFF kernel runs
+    };
+
+    void Match_SQDIFF_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+        // Preconditions: 8U depth, identical types, template not larger than the image.
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+        // Small templates are matched directly and return early.
+        if (templ.size().area() < getTemplateThreshold(TM_SQDIFF, CV_8U))
+        {
+            _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
+            GpuMat result = _result.getGpuMat();
+
             matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
             return;
         }
 
-        buf.image_sqsums.resize(1);
-        gpu::sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
+        gpu::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
 
-        unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ.reshape(1))[0];
+        unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ.reshape(1))[0];
 
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
-        matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+        match_CCORR_.match(image, templ, _result, stream);
+        GpuMat result = _result.getGpuMat();
+        // The CCORR result and the squared sums computed above are passed to the SQDIFF kernel.
+        matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
     }
 
+    ///////////////////////////////////////////////////////////////
+    // SQDIFF_NORMED_8U
+
+    class Match_SQDIFF_NORMED_8U : public TemplateMatching
+    {
+    public:
+        explicit Match_SQDIFF_NORMED_8U(Size user_block_size) : match_CCORR_(user_block_size)
+        {
+        }
+        // Computes squared sums, runs CCORR_8U, then calls matchTemplatePrepared_SQDIFF_NORMED_8U.
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+
+    private:
+        GpuMat image_sqsums_; // output of gpu::sqrIntegral on the image
+        GpuMat intBuffer_; // extra buffer argument handed to gpu::sqrIntegral
+        Match_CCORR_8U match_CCORR_; // fills the result before the NORMED kernel runs
+    };
 
-    void matchTemplate_SQDIFF_NORMED_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    void Match_SQDIFF_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
     {
-        buf.image_sqsums.resize(1);
-        gpu::sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
+        using namespace cv::gpu::cudev::match_template;
 
-        unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ.reshape(1))[0];
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
 
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
-        matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+        // Unlike Match_SQDIFF_8U there is no small-template branch here.
+        gpu::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
+
+        unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ.reshape(1))[0];
+
+        match_CCORR_.match(image, templ, _result, stream);
+        GpuMat result = _result.getGpuMat();
+
+        matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
     }
 
+    ///////////////////////////////////////////////////////////////
+    // CCOEFF_8U
 
-    void matchTemplate_CCOFF_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_CCOEFF_8U : public TemplateMatching
     {
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+    public:
+        explicit Match_CCOEFF_8U(Size user_block_size) : match_CCORR_(user_block_size)
+        {
+        }
+        // Runs CCORR_8U, then a channel-count-specific matchTemplatePrepared_CCOFF_8U* kernel.
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+
+    private:
+        GpuMat intBuffer_; // extra buffer argument handed to gpu::integral
+        std::vector<GpuMat> images_; // per-channel planes from gpu::split (multi-channel input only)
+        std::vector<GpuMat> image_sums_; // one gpu::integral output per channel
+        Match_CCORR_8U match_CCORR_; // fills the result before the CCOFF kernel runs
+    };
+
+    void Match_CCOEFF_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        match_CCORR_.match(image, templ, _result, stream);
+        GpuMat result = _result.getGpuMat();
 
         if (image.channels() == 1)
         {
-            buf.image_sums.resize(1);
-            gpu::integral(image, buf.image_sums[0], stream);
+            image_sums_.resize(1);
+            gpu::integral(image, image_sums_[0], intBuffer_, stream);
+
+            unsigned int templ_sum = (unsigned int) gpu::sum(templ)[0];
 
-            unsigned int templ_sum = (unsigned int)gpu::sum(templ)[0];
-            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream));
+            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sums_[0], templ_sum, result, StreamAccessor::getStream(stream));
         }
         else
         {
-            gpu::split(image, buf.images);
-            buf.image_sums.resize(buf.images.size());
+            gpu::split(image, images_);
+
+            image_sums_.resize(images_.size());
             for (int i = 0; i < image.channels(); ++i)
-                gpu::integral(buf.images[i], buf.image_sums[i], stream);
+                gpu::integral(images_[i], image_sums_[i], intBuffer_, stream);
 
             Scalar templ_sum = gpu::sum(templ);
 
@@ -285,60 +464,91 @@ namespace
             {
             case 2:
                 matchTemplatePrepared_CCOFF_8UC2(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1],
+                        templ.cols, templ.rows, image_sums_[0], image_sums_[1],
+                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1],
                         result, StreamAccessor::getStream(stream));
                 break;
             case 3:
                 matchTemplatePrepared_CCOFF_8UC3(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
+                        templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2],
+                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2],
                         result, StreamAccessor::getStream(stream));
                 break;
             case 4:
                 matchTemplatePrepared_CCOFF_8UC4(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2], buf.image_sums[3],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
-                        (unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));
+                        templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2], image_sums_[3],
+                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2], (unsigned int) templ_sum[3],
+                        result, StreamAccessor::getStream(stream));
                 break;
             default:
-                CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
+                CV_Error(Error::StsBadArg, "unsupported number of channels");
             }
         }
     }
 
+    ///////////////////////////////////////////////////////////////
+    // CCOEFF_NORMED_8U
 
-    void matchTemplate_CCOFF_NORMED_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_CCOEFF_NORMED_8U : public TemplateMatching
     {
-        image.convertTo(buf.imagef, CV_32F, stream);
-        templ.convertTo(buf.templf, CV_32F, stream);
+    public:
+        explicit Match_CCOEFF_NORMED_8U(Size user_block_size) : match_CCORR_32F_(user_block_size)
+        {
+        }
+        // Converts inputs to 32F, runs CCORR_32F, then a matchTemplatePrepared_CCOFF_NORMED_8U* kernel.
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
 
-        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
+    private:
+        GpuMat imagef_, templf_; // 32F conversions of image and templ
+        Match_CCORR_32F match_CCORR_32F_; // fills the result from the converted data
+        GpuMat intBuffer_; // extra buffer argument shared by the gpu::integral and gpu::sqrIntegral calls
+        std::vector<GpuMat> images_; // per-channel planes from gpu::split (multi-channel input only)
+        std::vector<GpuMat> image_sums_; // one gpu::integral output per channel
+        std::vector<GpuMat> image_sqsums_; // one gpu::sqrIntegral output per channel
+    };
+
+    void Match_CCOEFF_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        image.convertTo(imagef_, CV_32F, stream);
+        templ.convertTo(templf_, CV_32F, stream);
+
+        match_CCORR_32F_.match(imagef_, templf_, _result, stream);
+        GpuMat result = _result.getGpuMat();
 
         if (image.channels() == 1)
         {
-            buf.image_sums.resize(1);
-            gpu::integral(image, buf.image_sums[0], stream);
-            buf.image_sqsums.resize(1);
-            gpu::sqrIntegral(image, buf.image_sqsums[0], stream);
+            image_sums_.resize(1);
+            gpu::integral(image, image_sums_[0], intBuffer_, stream);
+
+            image_sqsums_.resize(1);
+            gpu::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream);
 
-            unsigned int templ_sum = (unsigned int)gpu::sum(templ)[0];
-            unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ)[0];
+            unsigned int templ_sum = (unsigned int) gpu::sum(templ)[0];
+            unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ)[0];
 
             matchTemplatePrepared_CCOFF_NORMED_8U(
-                    templ.cols, templ.rows, buf.image_sums[0], buf.image_sqsums[0],
+                    templ.cols, templ.rows, image_sums_[0], image_sqsums_[0],
                     templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
         }
         else
         {
-            gpu::split(image, buf.images);
-            buf.image_sums.resize(buf.images.size());
-            buf.image_sqsums.resize(buf.images.size());
+            gpu::split(image, images_);
+
+            image_sums_.resize(images_.size());
+            image_sqsums_.resize(images_.size());
             for (int i = 0; i < image.channels(); ++i)
             {
-                gpu::integral(buf.images[i], buf.image_sums[i], stream);
-                gpu::sqrIntegral(buf.images[i], buf.image_sqsums[i], stream);
+                gpu::integral(images_[i], image_sums_[i], intBuffer_, stream);
+                gpu::sqrIntegral(images_[i], image_sqsums_[i], intBuffer_, stream);
             }
 
             Scalar templ_sum = gpu::sum(templ);
@@ -349,8 +559,8 @@ namespace
             case 2:
                 matchTemplatePrepared_CCOFF_NORMED_8UC2(
                         templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
+                        image_sums_[0], image_sqsums_[0],
+                        image_sums_[1], image_sqsums_[1],
                         (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                         (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                         result, StreamAccessor::getStream(stream));
@@ -358,9 +568,9 @@ namespace
             case 3:
                 matchTemplatePrepared_CCOFF_NORMED_8UC3(
                         templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
-                        buf.image_sums[2], buf.image_sqsums[2],
+                        image_sums_[0], image_sqsums_[0],
+                        image_sums_[1], image_sqsums_[1],
+                        image_sums_[2], image_sqsums_[2],
                         (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                         (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                         (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
@@ -369,10 +579,10 @@ namespace
             case 4:
                 matchTemplatePrepared_CCOFF_NORMED_8UC4(
                         templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
-                        buf.image_sums[2], buf.image_sqsums[2],
-                        buf.image_sums[3], buf.image_sqsums[3],
+                        image_sums_[0], image_sqsums_[0],
+                        image_sums_[1], image_sqsums_[1],
+                        image_sums_[2], image_sqsums_[2],
+                        image_sums_[3], image_sqsums_[3],
                         (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                         (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                         (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
@@ -380,46 +590,60 @@ namespace
                         result, StreamAccessor::getStream(stream));
                 break;
             default:
-                CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
+                CV_Error(Error::StsBadArg, "unsupported number of channels");
             }
         }
     }
 }
 
-
-void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream)
+Ptr<gpu::TemplateMatching> cv::gpu::createTemplateMatching(int srcType, int method, Size user_block_size) // factory: picks the implementation class from depth + method
 {
-    MatchTemplateBuf buf;
-    matchTemplate(image, templ, result, method, buf, stream);
-}
-
+    const int sdepth = CV_MAT_DEPTH(srcType); // only the depth of srcType is used for dispatch
 
-void cv::gpu::matchTemplate(
-        const GpuMat& image, const GpuMat& templ, GpuMat& result, int method,
-        MatchTemplateBuf &buf, Stream& stream)
-{
-    CV_Assert(image.type() == templ.type());
-    CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
+    CV_Assert( sdepth == CV_8U || sdepth == CV_32F );
 
-    typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, MatchTemplateBuf&, Stream& stream);
+    if (sdepth == CV_32F) // 32F input: only TM_SQDIFF and TM_CCORR have implementations
+    {
+        switch (method)
+        {
+        case TM_SQDIFF:
+            return new Match_SQDIFF_32F;
 
-    static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U,
-                                        ::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U,
-                                        ::matchTemplate_CCOFF_8U, ::matchTemplate_CCOFF_NORMED_8U };
-    static const Caller callers32F[] = { ::matchTemplate_SQDIFF_32F, 0,
-                                         ::matchTemplate_CCORR_32F, 0, 0, 0 };
+        case TM_CCORR:
+            return new Match_CCORR_32F(user_block_size);
 
-    const Caller* callers = 0;
-    switch (image.depth())
-    {
-        case CV_8U: callers = callers8U; break;
-        case CV_32F: callers = callers32F; break;
-        default: CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported data type");
+        default:
+            CV_Error( Error::StsBadFlag, "Unsupported method" );
+            return Ptr<gpu::TemplateMatching>(); // fallback return after the error report
+        }
     }
+    else // 8U input: all six TM_* methods are handled
+    {
+        switch (method)
+        {
+        case TM_SQDIFF:
+            return new Match_SQDIFF_8U(user_block_size);
+
+        case TM_SQDIFF_NORMED:
+            return new Match_SQDIFF_NORMED_8U(user_block_size);
 
-    Caller caller = callers[method];
-    CV_Assert(caller);
-    caller(image, templ, result, buf, stream);
+        case TM_CCORR:
+            return new Match_CCORR_8U(user_block_size);
+
+        case TM_CCORR_NORMED:
+            return new Match_CCORR_NORMED_8U(user_block_size);
+
+        case TM_CCOEFF:
+            return new Match_CCOEFF_8U(user_block_size);
+
+        case TM_CCOEFF_NORMED:
+            return new Match_CCOEFF_NORMED_8U(user_block_size);
+
+        default:
+            CV_Error( Error::StsBadFlag, "Unsupported method" );
+            return Ptr<gpu::TemplateMatching>(); // fallback return after the error report
+        }
+    }
 }
 
 #endif
diff --git a/modules/gpuimgproc/test/test_match_template.cpp b/modules/gpuimgproc/test/test_match_template.cpp
index d187579230..b6fd161408 100644
--- a/modules/gpuimgproc/test/test_match_template.cpp
+++ b/modules/gpuimgproc/test/test_match_template.cpp
@@ -82,8 +82,10 @@ GPU_TEST_P(MatchTemplate8U, Accuracy)
     cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
     cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
 
+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
+
     cv::gpu::GpuMat dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(templ), dst, method);
+    alg->match(loadMat(image), loadMat(templ), dst);
 
     cv::Mat dst_gold;
     cv::matchTemplate(image, templ, dst_gold, method);
@@ -128,8 +130,10 @@ GPU_TEST_P(MatchTemplate32F, Regression)
     cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
     cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));
 
+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
+
     cv::gpu::GpuMat dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(templ), dst, method);
+    alg->match(loadMat(image), loadMat(templ), dst);
 
     cv::Mat dst_gold;
     cv::matchTemplate(image, templ, dst_gold, method);
@@ -169,8 +173,10 @@ GPU_TEST_P(MatchTemplateBlackSource, Accuracy)
     cv::Mat pattern = readImage("matchtemplate/cat.png");
     ASSERT_FALSE(pattern.empty());
 
+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
+
     cv::gpu::GpuMat d_dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(pattern), d_dst, method);
+    alg->match(loadMat(image), loadMat(pattern), d_dst);
 
     cv::Mat dst(d_dst);
 
@@ -214,8 +220,10 @@ GPU_TEST_P(MatchTemplate_CCOEF_NORMED, Accuracy)
     cv::Mat pattern = readImage(patternName);
     ASSERT_FALSE(pattern.empty());
 
+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), cv::TM_CCOEFF_NORMED);
+
     cv::gpu::GpuMat d_dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(pattern), d_dst, cv::TM_CCOEFF_NORMED);
+    alg->match(loadMat(image), loadMat(pattern), d_dst);
 
     cv::Mat dst(d_dst);
 
@@ -263,8 +271,10 @@ GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF_NORMED)
     cv::Mat templ = readImage("matchtemplate/template.png");
     ASSERT_FALSE(templ.empty());
 
+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(scene.type(), cv::TM_SQDIFF_NORMED);
+
     cv::gpu::GpuMat d_result;
-    cv::gpu::matchTemplate(loadMat(scene), loadMat(templ), d_result, cv::TM_SQDIFF_NORMED);
+    alg->match(loadMat(scene), loadMat(templ), d_result);
 
     cv::Mat result(d_result);
 
@@ -286,8 +296,10 @@ GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF)
     cv::Mat templ = readImage("matchtemplate/template.png");
     ASSERT_FALSE(templ.empty());
 
+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(scene.type(), cv::TM_SQDIFF);
+
     cv::gpu::GpuMat d_result;
-    cv::gpu::matchTemplate(loadMat(scene), loadMat(templ), d_result, cv::TM_SQDIFF);
+    alg->match(loadMat(scene), loadMat(templ), d_result);
 
     cv::Mat result(d_result);
 
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index da2d4e2263..3c26d16b04 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -17,24 +17,16 @@
 using namespace std;
 using namespace cv;
 
-static void InitMatchTemplate()
-{
-    Mat src; gen(src, 500, 500, CV_32F, 0, 1);
-    Mat templ; gen(templ, 500, 500, CV_32F, 0, 1);
-    gpu::GpuMat d_src(src), d_templ(templ), d_dst;
-    gpu::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
-}
-
 
 TEST(matchTemplate)
 {
-    InitMatchTemplate();
-
     Mat src, templ, dst;
     gen(src, 3000, 3000, CV_32F, 0, 1);
 
     gpu::GpuMat d_src(src), d_templ, d_dst;
 
+    Ptr<gpu::TemplateMatching> alg = gpu::createTemplateMatching(src.type(), TM_CCORR);
+
     for (int templ_size = 5; templ_size < 200; templ_size *= 5)
     {
         SUBTEST << src.cols << 'x' << src.rows << ", 32FC1" << ", templ " << templ_size << 'x' << templ_size << ", CCORR";
@@ -47,10 +39,10 @@ TEST(matchTemplate)
         CPU_OFF;
 
         d_templ.upload(templ);
-        gpu::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
+        alg->match(d_src, d_templ, d_dst);
 
         GPU_ON;
-        gpu::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
+        alg->match(d_src, d_templ, d_dst);
         GPU_OFF;
     }
 }