From 71c391cd8dbac3b2203af757ddb1fbdeed41d87c Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 31 Dec 2014 11:27:31 +0300
Subject: [PATCH 01/10] remove unused memory transfer from TVL1 CUDA
 implementation

it caused runtime failures
---
 modules/cudaoptflow/src/tvl1flow.cpp | 1 -
 1 file changed, 1 deletion(-)
diff --git a/modules/cudaoptflow/src/tvl1flow.cpp b/modules/cudaoptflow/src/tvl1flow.cpp
index d0efffa57f..b8dfea56f1 100644
--- a/modules/cudaoptflow/src/tvl1flow.cpp
+++ b/modules/cudaoptflow/src/tvl1flow.cpp
@@ -248,7 +248,6 @@ void cv::cuda::OpticalFlowDual_TVL1_CUDA::procOneScale(const GpuMat& I0, const G
         {
             // some tweaks to make sum operation less frequently
             bool calcError = (epsilon > 0) && (n & 0x1) && (prevError < scaledEpsilon);
-            cv::Mat m1(u3);
             estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, diff, l_t, static_cast<float>(theta), gamma, calcError);
             if (calcError)
             {

From c4b2058d233347b83b53b3afd293b9eed80ed806 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 31 Dec 2014 11:27:51 +0300
Subject: [PATCH 02/10] simplify TVL1 accuracy test to reduce run time

---
 modules/cudaoptflow/test/test_optflow.cpp | 35 +++++++++++------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/modules/cudaoptflow/test/test_optflow.cpp b/modules/cudaoptflow/test/test_optflow.cpp
index 2b976563b0..dce9cc59bc 100644
--- a/modules/cudaoptflow/test/test_optflow.cpp
+++ b/modules/cudaoptflow/test/test_optflow.cpp
@@ -325,15 +325,20 @@ INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, FarnebackOpticalFlow, testing::Combine(
 //////////////////////////////////////////////////////
 // OpticalFlowDual_TVL1
 
-PARAM_TEST_CASE(OpticalFlowDual_TVL1, cv::cuda::DeviceInfo, UseRoi)
+namespace
+{
+    IMPLEMENT_PARAM_CLASS(Gamma, double)
+}
+
+PARAM_TEST_CASE(OpticalFlowDual_TVL1, cv::cuda::DeviceInfo, Gamma)
 {
     cv::cuda::DeviceInfo devInfo;
-    bool useRoi;
+    double gamma;
 
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
-        useRoi = GET_PARAM(1);
+        gamma = GET_PARAM(1);
 
         cv::cuda::setDevice(devInfo.deviceID());
     }
@@ -348,30 +353,22 @@ CUDA_TEST_P(OpticalFlowDual_TVL1, Accuracy)
     ASSERT_FALSE(frame1.empty());
 
     cv::cuda::OpticalFlowDual_TVL1_CUDA d_alg;
-    cv::cuda::GpuMat d_flowx = createMat(frame0.size(), CV_32FC1, useRoi);
-    cv::cuda::GpuMat d_flowy = createMat(frame0.size(), CV_32FC1, useRoi);
-    d_alg(loadMat(frame0, useRoi), loadMat(frame1, useRoi), d_flowx, d_flowy);
+    d_alg.iterations = 10;
+    d_alg.gamma = gamma;
+
+    cv::cuda::GpuMat d_flowx, d_flowy;
+    d_alg(loadMat(frame0), loadMat(frame1), d_flowx, d_flowy);
 
     cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
     alg->set("medianFiltering", 1);
     alg->set("innerIterations", 1);
     alg->set("outerIterations", d_alg.iterations);
+    alg->set("gamma", gamma);
+
     cv::Mat flow;
     alg->calc(frame0, frame1, flow);
     cv::Mat gold[2];
     cv::split(flow, gold);
-    cv::Mat mx(d_flowx);
-    cv::Mat my(d_flowx);
-
-    EXPECT_MAT_SIMILAR(gold[0], d_flowx, 4e-3);
-    EXPECT_MAT_SIMILAR(gold[1], d_flowy, 4e-3);
-    d_alg.gamma = 1;
-    alg->set("gamma", 1);
-    d_alg(loadMat(frame0, useRoi), loadMat(frame1, useRoi), d_flowx, d_flowy);
-    alg->calc(frame0, frame1, flow);
-    cv::split(flow, gold);
-    mx = cv::Mat(d_flowx);
-    my = cv::Mat(d_flowx);
 
     EXPECT_MAT_SIMILAR(gold[0], d_flowx, 4e-3);
     EXPECT_MAT_SIMILAR(gold[1], d_flowy, 4e-3);
@@ -379,7 +376,7 @@ CUDA_TEST_P(OpticalFlowDual_TVL1, Accuracy)
 
 INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, OpticalFlowDual_TVL1, testing::Combine(
     ALL_DEVICES,
-    WHOLE_SUBMAT));
+    testing::Values(Gamma(0.0), Gamma(1.0))));
 
 //////////////////////////////////////////////////////
 // FastOpticalFlowBM

From 19c6bbe7d91efc74039636b6ce00a26810f40ef9 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 31 Dec 2014 15:35:23 +0300
Subject: [PATCH 03/10] move obsolete algorithms from cudaoptflow to cudalegacy

---
 .../cudalegacy/include/opencv2/cudalegacy.hpp |  51 +++++-
 .../{cudaoptflow => cudalegacy}/src/bm.cpp    |   0
 .../src/bm_fast.cpp                           |   0
 .../src/cuda/bm.cu                            |   0
 .../src/cuda/bm_fast.cu                       |   0
 .../src/cuda/needle_map.cu                    |   0
 .../src/interpolate_frames.cpp                |   0
 .../src/needle_map.cpp                        |   0
 .../include/opencv2/cudaoptflow.hpp           |  41 -----
 modules/cudaoptflow/perf/perf_optflow.cpp     | 150 ------------------
 modules/cudaoptflow/test/test_optflow.cpp     | 118 --------------
 11 files changed, 49 insertions(+), 311 deletions(-)
 rename modules/{cudaoptflow => cudalegacy}/src/bm.cpp (100%)
 rename modules/{cudaoptflow => cudalegacy}/src/bm_fast.cpp (100%)
 rename modules/{cudaoptflow => cudalegacy}/src/cuda/bm.cu (100%)
 rename modules/{cudaoptflow => cudalegacy}/src/cuda/bm_fast.cu (100%)
 rename modules/{cudaoptflow => cudalegacy}/src/cuda/needle_map.cu (100%)
 rename modules/{cudaoptflow => cudalegacy}/src/interpolate_frames.cpp (100%)
 rename modules/{cudaoptflow => cudalegacy}/src/needle_map.cpp (100%)

diff --git a/modules/cudalegacy/include/opencv2/cudalegacy.hpp b/modules/cudalegacy/include/opencv2/cudalegacy.hpp
index 328836c530..f0107499d3 100644
--- a/modules/cudalegacy/include/opencv2/cudalegacy.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy.hpp
@@ -71,8 +71,9 @@ public:
 
 CV_EXPORTS Ptr<ImagePyramid> createImagePyramid(InputArray img, int nLayers = -1, Stream& stream = Stream::Null());
 
-////////////////////////////////////////////////////
+//
 // GMG
+//
 
 /** @brief Background/Foreground Segmentation Algorithm.
 
@@ -125,8 +126,9 @@ public:
 CV_EXPORTS Ptr<cuda::BackgroundSubtractorGMG>
     createBackgroundSubtractorGMG(int initializationFrames = 120, double decisionThreshold = 0.8);
 
-////////////////////////////////////////////////////
+//
 // FGD
+//
 
 /** @brief The class discriminates between foreground and background pixels by building and maintaining a model
 of the background.
@@ -180,6 +182,51 @@ struct CV_EXPORTS FGDParams
 CV_EXPORTS Ptr<cuda::BackgroundSubtractorFGD>
     createBackgroundSubtractorFGD(const FGDParams& params = FGDParams());
 
+//
+// Optical flow
+//
+
+//! Calculates optical flow for 2 images using block matching algorithm */
+CV_EXPORTS void calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr,
+                                  Size block_size, Size shift_size, Size max_range, bool use_previous,
+                                  GpuMat& velx, GpuMat& vely, GpuMat& buf,
+                                  Stream& stream = Stream::Null());
+
+class CV_EXPORTS FastOpticalFlowBM
+{
+public:
+    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window = 21, int block_window = 7, Stream& s = Stream::Null());
+
+private:
+    GpuMat buffer;
+    GpuMat extended_I0;
+    GpuMat extended_I1;
+};
+
+/** @brief Interpolates frames (images) using provided optical flow (displacement field).
+
+@param frame0 First frame (32-bit floating point images, single channel).
+@param frame1 Second frame. Must have the same type and size as frame0 .
+@param fu Forward horizontal displacement.
+@param fv Forward vertical displacement.
+@param bu Backward horizontal displacement.
+@param bv Backward vertical displacement.
+@param pos New frame position.
+@param newFrame Output image.
+@param buf Temporary buffer, will have width x 6\*height size, CV_32FC1 type and contain 6
+GpuMat: occlusion masks for first frame, occlusion masks for second, interpolated forward
+horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow,
+interpolated backward vertical flow.
+@param stream Stream for the asynchronous version.
+ */
+CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
+                                  const GpuMat& fu, const GpuMat& fv,
+                                  const GpuMat& bu, const GpuMat& bv,
+                                  float pos, GpuMat& newFrame, GpuMat& buf,
+                                  Stream& stream = Stream::Null());
+
+CV_EXPORTS void createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors);
+
 //! @}
 
 }}
diff --git a/modules/cudaoptflow/src/bm.cpp b/modules/cudalegacy/src/bm.cpp
similarity index 100%
rename from modules/cudaoptflow/src/bm.cpp
rename to modules/cudalegacy/src/bm.cpp
diff --git a/modules/cudaoptflow/src/bm_fast.cpp b/modules/cudalegacy/src/bm_fast.cpp
similarity index 100%
rename from modules/cudaoptflow/src/bm_fast.cpp
rename to modules/cudalegacy/src/bm_fast.cpp
diff --git a/modules/cudaoptflow/src/cuda/bm.cu b/modules/cudalegacy/src/cuda/bm.cu
similarity index 100%
rename from modules/cudaoptflow/src/cuda/bm.cu
rename to modules/cudalegacy/src/cuda/bm.cu
diff --git a/modules/cudaoptflow/src/cuda/bm_fast.cu b/modules/cudalegacy/src/cuda/bm_fast.cu
similarity index 100%
rename from modules/cudaoptflow/src/cuda/bm_fast.cu
rename to modules/cudalegacy/src/cuda/bm_fast.cu
diff --git a/modules/cudaoptflow/src/cuda/needle_map.cu b/modules/cudalegacy/src/cuda/needle_map.cu
similarity index 100%
rename from modules/cudaoptflow/src/cuda/needle_map.cu
rename to modules/cudalegacy/src/cuda/needle_map.cu
diff --git a/modules/cudaoptflow/src/interpolate_frames.cpp b/modules/cudalegacy/src/interpolate_frames.cpp
similarity index 100%
rename from modules/cudaoptflow/src/interpolate_frames.cpp
rename to modules/cudalegacy/src/interpolate_frames.cpp
diff --git a/modules/cudaoptflow/src/needle_map.cpp b/modules/cudalegacy/src/needle_map.cpp
similarity index 100%
rename from modules/cudaoptflow/src/needle_map.cpp
rename to modules/cudalegacy/src/needle_map.cpp
diff --git a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
index f65b1447b2..7882a8e622 100644
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
@@ -347,47 +347,6 @@ private:
     GpuMat norm_buf;
 };
 
-//! Calculates optical flow for 2 images using block matching algorithm */
-CV_EXPORTS void calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr,
-                                  Size block_size, Size shift_size, Size max_range, bool use_previous,
-                                  GpuMat& velx, GpuMat& vely, GpuMat& buf,
-                                  Stream& stream = Stream::Null());
-
-class CV_EXPORTS FastOpticalFlowBM
-{
-public:
-    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window = 21, int block_window = 7, Stream& s = Stream::Null());
-
-private:
-    GpuMat buffer;
-    GpuMat extended_I0;
-    GpuMat extended_I1;
-};
-
-/** @brief Interpolates frames (images) using provided optical flow (displacement field).
-
-@param frame0 First frame (32-bit floating point images, single channel).
-@param frame1 Second frame. Must have the same type and size as frame0 .
-@param fu Forward horizontal displacement.
-@param fv Forward vertical displacement.
-@param bu Backward horizontal displacement.
-@param bv Backward vertical displacement.
-@param pos New frame position.
-@param newFrame Output image.
-@param buf Temporary buffer, will have width x 6\*height size, CV_32FC1 type and contain 6
-GpuMat: occlusion masks for first frame, occlusion masks for second, interpolated forward
-horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow,
-interpolated backward vertical flow.
-@param stream Stream for the asynchronous version.
- */
-CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
-                                  const GpuMat& fu, const GpuMat& fv,
-                                  const GpuMat& bu, const GpuMat& bv,
-                                  float pos, GpuMat& newFrame, GpuMat& buf,
-                                  Stream& stream = Stream::Null());
-
-CV_EXPORTS void createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors);
-
 //! @}
 
 }} // namespace cv { namespace cuda {
diff --git a/modules/cudaoptflow/perf/perf_optflow.cpp b/modules/cudaoptflow/perf/perf_optflow.cpp
index d22eb7e60d..12612b0622 100644
--- a/modules/cudaoptflow/perf/perf_optflow.cpp
+++ b/modules/cudaoptflow/perf/perf_optflow.cpp
@@ -46,91 +46,10 @@ using namespace std;
 using namespace testing;
 using namespace perf;
 
-//////////////////////////////////////////////////////
-// InterpolateFrames
-
 typedef pair<string, string> pair_string;
 
 DEF_PARAM_TEST_1(ImagePair, pair_string);
 
-PERF_TEST_P(ImagePair, InterpolateFrames,
-    Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
-{
-    cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    frame0.convertTo(frame0, CV_32FC1, 1.0 / 255.0);
-    frame1.convertTo(frame1, CV_32FC1, 1.0 / 255.0);
-
-    if (PERF_RUN_CUDA())
-    {
-        const cv::cuda::GpuMat d_frame0(frame0);
-        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat d_fu, d_fv;
-        cv::cuda::GpuMat d_bu, d_bv;
-
-        cv::cuda::BroxOpticalFlow d_flow(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
-            10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
-
-        d_flow(d_frame0, d_frame1, d_fu, d_fv);
-        d_flow(d_frame1, d_frame0, d_bu, d_bv);
-
-        cv::cuda::GpuMat newFrame;
-        cv::cuda::GpuMat d_buf;
-
-        TEST_CYCLE() cv::cuda::interpolateFrames(d_frame0, d_frame1, d_fu, d_fv, d_bu, d_bv, 0.5f, newFrame, d_buf);
-
-        CUDA_SANITY_CHECK(newFrame, 1e-4);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////
-// CreateOpticalFlowNeedleMap
-
-PERF_TEST_P(ImagePair, CreateOpticalFlowNeedleMap,
-            Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
-{
-    cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    frame0.convertTo(frame0, CV_32FC1, 1.0 / 255.0);
-    frame1.convertTo(frame1, CV_32FC1, 1.0 / 255.0);
-
-    if (PERF_RUN_CUDA())
-    {
-        const cv::cuda::GpuMat d_frame0(frame0);
-        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u;
-        cv::cuda::GpuMat v;
-
-        cv::cuda::BroxOpticalFlow d_flow(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
-                                        10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
-
-        d_flow(d_frame0, d_frame1, u, v);
-
-        cv::cuda::GpuMat vertex, colors;
-
-        TEST_CYCLE() cv::cuda::createOpticalFlowNeedleMap(u, v, vertex, colors);
-
-        CUDA_SANITY_CHECK(vertex, 1e-6);
-        CUDA_SANITY_CHECK(colors);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
 //////////////////////////////////////////////////////
 // BroxOpticalFlow
 
@@ -383,72 +302,3 @@ PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1,
         CPU_SANITY_CHECK(flow);
     }
 }
-
-//////////////////////////////////////////////////////
-// OpticalFlowBM
-
-PERF_TEST_P(ImagePair, OpticalFlowBM,
-    Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
-{
-    declare.time(400);
-
-    const cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    const cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    const cv::Size block_size(16, 16);
-    const cv::Size shift_size(1, 1);
-    const cv::Size max_range(16, 16);
-
-    if (PERF_RUN_CUDA())
-    {
-        const cv::cuda::GpuMat d_frame0(frame0);
-        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u, v, buf;
-
-        TEST_CYCLE() cv::cuda::calcOpticalFlowBM(d_frame0, d_frame1, block_size, shift_size, max_range, false, u, v, buf);
-
-        CUDA_SANITY_CHECK(u);
-        CUDA_SANITY_CHECK(v);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-PERF_TEST_P(ImagePair, DISABLED_FastOpticalFlowBM,
-    Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
-{
-    declare.time(400);
-
-    const cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    const cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    const cv::Size block_size(16, 16);
-    const cv::Size shift_size(1, 1);
-    const cv::Size max_range(16, 16);
-
-    if (PERF_RUN_CUDA())
-    {
-        const cv::cuda::GpuMat d_frame0(frame0);
-        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u, v;
-
-        cv::cuda::FastOpticalFlowBM fastBM;
-
-        TEST_CYCLE() fastBM(d_frame0, d_frame1, u, v, max_range.width, block_size.width);
-
-        CUDA_SANITY_CHECK(u, 2);
-        CUDA_SANITY_CHECK(v, 2);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
diff --git a/modules/cudaoptflow/test/test_optflow.cpp b/modules/cudaoptflow/test/test_optflow.cpp
index dce9cc59bc..7a6e68310f 100644
--- a/modules/cudaoptflow/test/test_optflow.cpp
+++ b/modules/cudaoptflow/test/test_optflow.cpp
@@ -378,122 +378,4 @@ INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, OpticalFlowDual_TVL1, testing::Combine(
     ALL_DEVICES,
     testing::Values(Gamma(0.0), Gamma(1.0))));
 
-//////////////////////////////////////////////////////
-// FastOpticalFlowBM
-
-namespace
-{
-    void FastOpticalFlowBM_gold(const cv::Mat_<uchar>& I0, const cv::Mat_<uchar>& I1, cv::Mat_<float>& velx, cv::Mat_<float>& vely, int search_window, int block_window)
-    {
-        velx.create(I0.size());
-        vely.create(I0.size());
-
-        int search_radius = search_window / 2;
-        int block_radius = block_window / 2;
-
-        for (int y = 0; y < I0.rows; ++y)
-        {
-            for (int x = 0; x < I0.cols; ++x)
-            {
-                int bestDist = std::numeric_limits<int>::max();
-                int bestDx = 0;
-                int bestDy = 0;
-
-                for (int dy = -search_radius; dy <= search_radius; ++dy)
-                {
-                    for (int dx = -search_radius; dx <= search_radius; ++dx)
-                    {
-                        int dist = 0;
-
-                        for (int by = -block_radius; by <= block_radius; ++by)
-                        {
-                            for (int bx = -block_radius; bx <= block_radius; ++bx)
-                            {
-                                int I0_val = I0(cv::borderInterpolate(y + by, I0.rows, cv::BORDER_DEFAULT), cv::borderInterpolate(x + bx, I0.cols, cv::BORDER_DEFAULT));
-                                int I1_val = I1(cv::borderInterpolate(y + dy + by, I0.rows, cv::BORDER_DEFAULT), cv::borderInterpolate(x + dx + bx, I0.cols, cv::BORDER_DEFAULT));
-
-                                dist += std::abs(I0_val - I1_val);
-                            }
-                        }
-
-                        if (dist < bestDist)
-                        {
-                            bestDist = dist;
-                            bestDx = dx;
-                            bestDy = dy;
-                        }
-                    }
-                }
-
-                velx(y, x) = (float) bestDx;
-                vely(y, x) = (float) bestDy;
-            }
-        }
-    }
-
-    double calc_rmse(const cv::Mat_<float>& flow1, const cv::Mat_<float>& flow2)
-    {
-        double sum = 0.0;
-
-        for (int y = 0; y < flow1.rows; ++y)
-        {
-            for (int x = 0; x < flow1.cols; ++x)
-            {
-                double diff = flow1(y, x) - flow2(y, x);
-                sum += diff * diff;
-            }
-        }
-
-        return std::sqrt(sum / flow1.size().area());
-    }
-}
-
-struct FastOpticalFlowBM : testing::TestWithParam<cv::cuda::DeviceInfo>
-{
-};
-
-CUDA_TEST_P(FastOpticalFlowBM, Accuracy)
-{
-    const double MAX_RMSE = 0.6;
-
-    int search_window = 15;
-    int block_window = 5;
-
-    cv::cuda::DeviceInfo devInfo = GetParam();
-    cv::cuda::setDevice(devInfo.deviceID());
-
-    cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    cv::Size smallSize(320, 240);
-    cv::Mat frame0_small;
-    cv::Mat frame1_small;
-
-    cv::resize(frame0, frame0_small, smallSize);
-    cv::resize(frame1, frame1_small, smallSize);
-
-    cv::cuda::GpuMat d_flowx;
-    cv::cuda::GpuMat d_flowy;
-    cv::cuda::FastOpticalFlowBM fastBM;
-
-    fastBM(loadMat(frame0_small), loadMat(frame1_small), d_flowx, d_flowy, search_window, block_window);
-
-    cv::Mat_<float> flowx;
-    cv::Mat_<float> flowy;
-    FastOpticalFlowBM_gold(frame0_small, frame1_small, flowx, flowy, search_window, block_window);
-
-    double err;
-
-    err = calc_rmse(flowx, cv::Mat(d_flowx));
-    EXPECT_LE(err, MAX_RMSE);
-
-    err = calc_rmse(flowy, cv::Mat(d_flowy));
-    EXPECT_LE(err, MAX_RMSE);
-}
-
-INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, FastOpticalFlowBM, ALL_DEVICES);
-
 #endif // HAVE_CUDA

From 381216aa5411fd20faf897cc8900c0358e7dc695 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 31 Dec 2014 15:36:15 +0300
Subject: [PATCH 04/10] refactor cudaoptflow public API:

* use opaque algorithm interfaces
* add stream support
---
 .../include/opencv2/cudaoptflow.hpp           | 397 ++++++-------
 modules/cudaoptflow/src/brox.cpp              | 168 ++++--
 modules/cudaoptflow/src/cuda/pyrlk.cu         |  12 +-
 modules/cudaoptflow/src/cuda/tvl1flow.cu      |  35 +-
 modules/cudaoptflow/src/farneback.cpp         | 559 ++++++++++--------
 modules/cudaoptflow/src/pyrlk.cpp             | 350 +++++++----
 modules/cudaoptflow/src/tvl1flow.cpp          | 474 +++++++++------
 7 files changed, 1164 insertions(+), 831 deletions(-)

diff --git a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
index 7882a8e622..6ea75594d2 100644
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
@@ -61,49 +61,94 @@ namespace cv { namespace cuda {
 //! @addtogroup cudaoptflow
 //! @{
 
-/** @brief Class computing the optical flow for two images using Brox et al Optical Flow algorithm
-(@cite Brox2004). :
+//
+// Interface
+//
+
+/** @brief Base interface for dense optical flow algorithms.
  */
-class CV_EXPORTS BroxOpticalFlow
+class CV_EXPORTS DenseOpticalFlow : public Algorithm
 {
 public:
-    BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) :
-        alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_),
-        inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_)
-    {
-    }
+    /** @brief Calculates a dense optical flow.
 
-    //! Compute optical flow
-    //! frame0 - source frame (supports only CV_32FC1 type)
-    //! frame1 - frame to track (with the same size and type as frame0)
-    //! u      - flow horizontal component (along x axis)
-    //! v      - flow vertical component (along y axis)
-    void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null());
+    @param I0 first input image.
+    @param I1 second input image of the same size and the same type as I0.
+    @param flow computed flow image that has the same size as I0 and type CV_32FC2.
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream = Stream::Null()) = 0;
+};
 
-    //! flow smoothness
-    float alpha;
+/** @brief Base interface for sparse optical flow algorithms.
+ */
+class CV_EXPORTS SparseOpticalFlow : public Algorithm
+{
+public:
+    /** @brief Calculates a sparse optical flow.
+
+    @param prevImg First input image.
+    @param nextImg Second input image of the same size and the same type as prevImg.
+    @param prevPts Vector of 2D points for which the flow needs to be found.
+    @param nextPts Output vector of 2D points containing the calculated new positions of input features in the second image.
+    @param status Output status vector. Each element of the vector is set to 1 if the
+                  flow for the corresponding features has been found. Otherwise, it is set to 0.
+    @param err Optional output vector that contains error response for each point (inverse confidence).
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void calc(InputArray prevImg, InputArray nextImg,
+                      InputArray prevPts, InputOutputArray nextPts,
+                      OutputArray status,
+                      OutputArray err = cv::noArray(),
+                      Stream& stream = Stream::Null()) = 0;
+};
 
-    //! gradient constancy importance
-    float gamma;
+//
+// BroxOpticalFlow
+//
 
-    //! pyramid scale factor
-    float scale_factor;
+/** @brief Class computing the optical flow for two images using Brox et al Optical Flow algorithm (@cite Brox2004).
+ */
+class CV_EXPORTS BroxOpticalFlow : public DenseOpticalFlow
+{
+public:
+    virtual double getFlowSmoothness() const = 0;
+    virtual void setFlowSmoothness(double alpha) = 0;
+
+    virtual double getGradientConstancyImportance() const = 0;
+    virtual void setGradientConstancyImportance(double gamma) = 0;
+
+    virtual double getPyramidScaleFactor() const = 0;
+    virtual void setPyramidScaleFactor(double scale_factor) = 0;
 
     //! number of lagged non-linearity iterations (inner loop)
-    int inner_iterations;
+    virtual int getInnerIterations() const = 0;
+    virtual void setInnerIterations(int inner_iterations) = 0;
 
     //! number of warping iterations (number of pyramid levels)
-    int outer_iterations;
+    virtual int getOuterIterations() const = 0;
+    virtual void setOuterIterations(int outer_iterations) = 0;
 
     //! number of linear system solver iterations
-    int solver_iterations;
-
-    GpuMat buf;
+    virtual int getSolverIterations() const = 0;
+    virtual void setSolverIterations(int solver_iterations) = 0;
+
+    static Ptr<BroxOpticalFlow> create(
+            double alpha = 0.197,
+            double gamma = 50.0,
+            double scale_factor = 0.8,
+            int inner_iterations = 5,
+            int outer_iterations = 150,
+            int solver_iterations = 10);
 };
 
-/** @brief Class used for calculating an optical flow.
+//
+// PyrLKOpticalFlow
+//
+
+/** @brief Class used for calculating a sparse optical flow.
 
-The class can calculate an optical flow for a sparse feature set or dense optical flow using the
+The class can calculate an optical flow for a sparse feature set using the
 iterative Lucas-Kanade method with pyramids.
 
 @sa calcOpticalFlowPyrLK
@@ -112,158 +157,116 @@ iterative Lucas-Kanade method with pyramids.
    -   An example of the Lucas Kanade optical flow algorithm can be found at
         opencv_source_code/samples/gpu/pyrlk_optical_flow.cpp
  */
-class CV_EXPORTS PyrLKOpticalFlow
+class CV_EXPORTS SparsePyrLKOpticalFlow : public SparseOpticalFlow
 {
 public:
-    PyrLKOpticalFlow();
-
-    /** @brief Calculate an optical flow for a sparse feature set.
-
-    @param prevImg First 8-bit input image (supports both grayscale and color images).
-    @param nextImg Second input image of the same size and the same type as prevImg .
-    @param prevPts Vector of 2D points for which the flow needs to be found. It must be one row matrix
-    with CV_32FC2 type.
-    @param nextPts Output vector of 2D points (with single-precision floating-point coordinates)
-    containing the calculated new positions of input features in the second image. When useInitialFlow
-    is true, the vector must have the same size as in the input.
-    @param status Output status vector (CV_8UC1 type). Each element of the vector is set to 1 if the
-    flow for the corresponding features has been found. Otherwise, it is set to 0.
-    @param err Output vector (CV_32FC1 type) that contains the difference between patches around the
-    original and moved points or min eigen value if getMinEigenVals is checked. It can be NULL, if not
-    needed.
-
-    @sa calcOpticalFlowPyrLK
-     */
-    void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
-        GpuMat& status, GpuMat* err = 0);
-
-    /** @brief Calculate dense optical flow.
-
-    @param prevImg First 8-bit grayscale input image.
-    @param nextImg Second input image of the same size and the same type as prevImg .
-    @param u Horizontal component of the optical flow of the same size as input images, 32-bit
-    floating-point, single-channel
-    @param v Vertical component of the optical flow of the same size as input images, 32-bit
-    floating-point, single-channel
-    @param err Output vector (CV_32FC1 type) that contains the difference between patches around the
-    original and moved points or min eigen value if getMinEigenVals is checked. It can be NULL, if not
-    needed.
-     */
-    void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
-
-    /** @brief Releases inner buffers memory.
-    */
-    void releaseMemory();
+    virtual Size getWinSize() const = 0;
+    virtual void setWinSize(Size winSize) = 0;
 
-    Size winSize;
-    int maxLevel;
-    int iters;
-    bool useInitialFlow;
+    virtual int getMaxLevel() const = 0;
+    virtual void setMaxLevel(int maxLevel) = 0;
 
-private:
-    std::vector<GpuMat> prevPyr_;
-    std::vector<GpuMat> nextPyr_;
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;
 
-    GpuMat buf_;
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;
 
-    GpuMat uPyr_[2];
-    GpuMat vPyr_[2];
+    static Ptr<SparsePyrLKOpticalFlow> create(
+            Size winSize = Size(21, 21),
+            int maxLevel = 3,
+            int iters = 30,
+            bool useInitialFlow = false);
 };
 
-/** @brief Class computing a dense optical flow using the Gunnar Farneback’s algorithm. :
+/** @brief Class used for calculating a dense optical flow.
+
+The class can calculate an optical flow for a dense optical flow using the
+iterative Lucas-Kanade method with pyramids.
  */
-class CV_EXPORTS FarnebackOpticalFlow
+class CV_EXPORTS DensePyrLKOpticalFlow : public DenseOpticalFlow
 {
 public:
-    FarnebackOpticalFlow()
-    {
-        numLevels = 5;
-        pyrScale = 0.5;
-        fastPyramids = false;
-        winSize = 13;
-        numIters = 10;
-        polyN = 5;
-        polySigma = 1.1;
-        flags = 0;
-    }
-
-    int numLevels;
-    double pyrScale;
-    bool fastPyramids;
-    int winSize;
-    int numIters;
-    int polyN;
-    double polySigma;
-    int flags;
-
-    /** @brief Computes a dense optical flow using the Gunnar Farneback’s algorithm.
-
-    @param frame0 First 8-bit gray-scale input image
-    @param frame1 Second 8-bit gray-scale input image
-    @param flowx Flow horizontal component
-    @param flowy Flow vertical component
-    @param s Stream
-
-    @sa calcOpticalFlowFarneback
-     */
-    void operator ()(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s = Stream::Null());
+    virtual Size getWinSize() const = 0;
+    virtual void setWinSize(Size winSize) = 0;
 
-    /** @brief Releases unused auxiliary memory buffers.
-     */
-    void releaseMemory()
-    {
-        frames_[0].release();
-        frames_[1].release();
-        pyrLevel_[0].release();
-        pyrLevel_[1].release();
-        M_.release();
-        bufM_.release();
-        R_[0].release();
-        R_[1].release();
-        blurredFrame_[0].release();
-        blurredFrame_[1].release();
-        pyramid0_.clear();
-        pyramid1_.clear();
-    }
-
-private:
-    void prepareGaussian(
-            int n, double sigma, float *g, float *xg, float *xxg,
-            double &ig11, double &ig03, double &ig33, double &ig55);
-
-    void setPolynomialExpansionConsts(int n, double sigma);
-
-    void updateFlow_boxFilter(
-            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
-            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
-
-    void updateFlow_gaussianBlur(
-            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
-            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
-
-    GpuMat frames_[2];
-    GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
-    std::vector<GpuMat> pyramid0_, pyramid1_;
+    virtual int getMaxLevel() const = 0;
+    virtual void setMaxLevel(int maxLevel) = 0;
+
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;
+
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;
+
+    static Ptr<DensePyrLKOpticalFlow> create(
+            Size winSize = Size(13, 13),
+            int maxLevel = 3,
+            int iters = 30,
+            bool useInitialFlow = false);
 };
 
-// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
 //
-// see reference:
-//   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
-//   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
-class CV_EXPORTS OpticalFlowDual_TVL1_CUDA
+// FarnebackOpticalFlow
+//
+
+/** @brief Class computing a dense optical flow using the Gunnar Farneback’s algorithm.
+ */
+class CV_EXPORTS FarnebackOpticalFlow : public DenseOpticalFlow
 {
 public:
-    OpticalFlowDual_TVL1_CUDA();
+    virtual int getNumLevels() const = 0;
+    virtual void setNumLevels(int numLevels) = 0;
+
+    virtual double getPyrScale() const = 0;
+    virtual void setPyrScale(double pyrScale) = 0;
+
+    virtual bool getFastPyramids() const = 0;
+    virtual void setFastPyramids(bool fastPyramids) = 0;
 
-    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy);
+    virtual int getWinSize() const = 0;
+    virtual void setWinSize(int winSize) = 0;
 
-    void collectGarbage();
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int numIters) = 0;
+
+    virtual int getPolyN() const = 0;
+    virtual void setPolyN(int polyN) = 0;
+
+    virtual double getPolySigma() const = 0;
+    virtual void setPolySigma(double polySigma) = 0;
+
+    virtual int getFlags() const = 0;
+    virtual void setFlags(int flags) = 0;
+
+    static Ptr<FarnebackOpticalFlow> create(
+            int numLevels = 5,
+            double pyrScale = 0.5,
+            bool fastPyramids = false,
+            int winSize = 13,
+            int numIters = 10,
+            int polyN = 5,
+            double polySigma = 1.1,
+            int flags = 0);
+};
+
+//
+// OpticalFlowDual_TVL1
+//
 
+/** @brief Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method.
+ *
+ * @sa C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
+ * @sa Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
+ */
+class CV_EXPORTS OpticalFlowDual_TVL1 : public DenseOpticalFlow
+{
+public:
     /**
      * Time step of the numerical scheme.
      */
-    double tau;
+    virtual double getTau() const = 0;
+    virtual void setTau(double tau) = 0;
 
     /**
      * Weight parameter for the data term, attachment parameter.
@@ -271,7 +274,8 @@ public:
      * The smaller this parameter is, the smoother the solutions we obtain.
      * It depends on the range of motions of the images, so its value should be adapted to each image sequence.
      */
-    double lambda;
+    virtual double getLambda() const = 0;
+    virtual void setLambda(double lambda) = 0;
 
     /**
      * Weight parameter for (u - v)^2, tightness parameter.
@@ -279,20 +283,23 @@ public:
      * In theory, it should have a small value in order to maintain both parts in correspondence.
      * The method is stable for a large range of values of this parameter.
      */
+    virtual double getGamma() const = 0;
+    virtual void setGamma(double gamma) = 0;
 
-    double gamma;
     /**
-    * parameter used for motion estimation. It adds a variable allowing for illumination variations
-    * Set this parameter to 1. if you have varying illumination.
-    * See: Chambolle et al, A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging
-    * Journal of Mathematical imaging and vision, may 2011 Vol 40 issue 1, pp 120-145
-    */
-    double theta;
+     * parameter used for motion estimation. It adds a variable allowing for illumination variations
+     * Set this parameter to 1. if you have varying illumination.
+     * See: Chambolle et al, A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging
+     * Journal of Mathematical imaging and vision, may 2011 Vol 40 issue 1, pp 120-145
+     */
+    virtual double getTheta() const = 0;
+    virtual void setTheta(double theta) = 0;
 
     /**
      * Number of scales used to create the pyramid of images.
      */
-    int nscales;
+    virtual int getNumScales() const = 0;
+    virtual void setNumScales(int nscales) = 0;
 
     /**
      * Number of warpings per scale.
@@ -300,51 +307,39 @@ public:
      * This is a parameter that assures the stability of the method.
      * It also affects the running time, so it is a compromise between speed and accuracy.
      */
-    int warps;
+    virtual int getNumWarps() const = 0;
+    virtual void setNumWarps(int warps) = 0;
 
     /**
      * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
      * A small value will yield more accurate solutions at the expense of a slower convergence.
      */
-    double epsilon;
+    virtual double getEpsilon() const = 0;
+    virtual void setEpsilon(double epsilon) = 0;
 
     /**
      * Stopping criterion iterations number used in the numerical scheme.
      */
-    int iterations;
-
-    double scaleStep;
-
-    bool useInitialFlow;
-
-private:
-    void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3);
-
-    std::vector<GpuMat> I0s;
-    std::vector<GpuMat> I1s;
-    std::vector<GpuMat> u1s;
-    std::vector<GpuMat> u2s;
-    std::vector<GpuMat> u3s;
-
-    GpuMat I1x_buf;
-    GpuMat I1y_buf;
-
-    GpuMat I1w_buf;
-    GpuMat I1wx_buf;
-    GpuMat I1wy_buf;
-
-    GpuMat grad_buf;
-    GpuMat rho_c_buf;
-
-    GpuMat p11_buf;
-    GpuMat p12_buf;
-    GpuMat p21_buf;
-    GpuMat p22_buf;
-    GpuMat p31_buf;
-    GpuMat p32_buf;
-
-    GpuMat diff_buf;
-    GpuMat norm_buf;
+    virtual int getNumIterations() const = 0;
+    virtual void setNumIterations(int iterations) = 0;
+
+    virtual double getScaleStep() const = 0;
+    virtual void setScaleStep(double scaleStep) = 0;
+
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;
+
+    static Ptr<OpticalFlowDual_TVL1> create(
+            double tau = 0.25,
+            double lambda = 0.15,
+            double theta = 0.3,
+            int nscales = 5,
+            int warps = 5,
+            double epsilon = 0.01,
+            int iterations = 300,
+            double scaleStep = 0.8,
+            double gamma = 0.0,
+            bool useInitialFlow = false);
 };
 
 //! @}
diff --git a/modules/cudaoptflow/src/brox.cpp b/modules/cudaoptflow/src/brox.cpp
index 39eae9a8ab..11c541906b 100644
--- a/modules/cudaoptflow/src/brox.cpp
+++ b/modules/cudaoptflow/src/brox.cpp
@@ -47,84 +47,148 @@ using namespace cv::cuda;
 
 #if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_CUDALEGACY) || defined (CUDA_DISABLER)
 
-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double, double, double, int, int, int) { throw_no_cuda(); return Ptr<BroxOpticalFlow>(); }
 
 #else
 
-namespace
-{
-    size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
-                      NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
+namespace {
+
+    class BroxOpticalFlowImpl : public BroxOpticalFlow
     {
-        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
+    public:
+        BroxOpticalFlowImpl(double alpha, double gamma, double scale_factor,
+                            int inner_iterations, int outer_iterations, int solver_iterations) :
+            alpha_(alpha), gamma_(gamma), scale_factor_(scale_factor),
+            inner_iterations_(inner_iterations), outer_iterations_(outer_iterations),
+            solver_iterations_(solver_iterations)
+        {
+        }
+
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
+
+        virtual double getFlowSmoothness() const { return alpha_; }
+        virtual void setFlowSmoothness(double alpha) { alpha_ = static_cast<float>(alpha); }
+
+        virtual double getGradientConstancyImportance() const { return gamma_; }
+        virtual void setGradientConstancyImportance(double gamma) { gamma_ = static_cast<float>(gamma); }
+
+        virtual double getPyramidScaleFactor() const { return scale_factor_; }
+        virtual void setPyramidScaleFactor(double scale_factor) { scale_factor_ = static_cast<float>(scale_factor); }
+
+        //! number of lagged non-linearity iterations (inner loop)
+        virtual int getInnerIterations() const { return inner_iterations_; }
+        virtual void setInnerIterations(int inner_iterations) { inner_iterations_ = inner_iterations; }
+
+        //! number of warping iterations (number of pyramid levels)
+        virtual int getOuterIterations() const { return outer_iterations_; }
+        virtual void setOuterIterations(int outer_iterations) { outer_iterations_ = outer_iterations; }
+
+        //! number of linear system solver iterations
+        virtual int getSolverIterations() const { return solver_iterations_; }
+        virtual void setSolverIterations(int solver_iterations) { solver_iterations_ = solver_iterations; }
+
+    private:
+        //! flow smoothness
+        float alpha_;
+
+        //! gradient constancy importance
+        float gamma_;
+
+        //! pyramid scale factor
+        float scale_factor_;
+
+        //! number of lagged non-linearity iterations (inner loop)
+        int inner_iterations_;
+
+        //! number of warping iterations (number of pyramid levels)
+        int outer_iterations_;
+
+        //! number of linear system solver iterations
+        int solver_iterations_;
+    };
+
+    static size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc,
+                             const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
+                             NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v,
+                             size_t textureAlignment)
+    {
+        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(textureAlignment));
 
         ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );
 
         return gpuCounter.maxSize();
     }
-}
 
-namespace
-{
-    static void outputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
-}
+    static void outputHandler(const String &msg)
+    {
+        CV_Error(cv::Error::GpuApiCallError, msg.c_str());
+    }
 
-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)
-{
-    ncvSetDebugOutputHandler(outputHandler);
+    void BroxOpticalFlowImpl::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow, Stream& stream)
+    {
+        const GpuMat frame0 = _I0.getGpuMat();
+        const GpuMat frame1 = _I1.getGpuMat();
 
-    CV_Assert(frame0.type() == CV_32FC1);
-    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
+        CV_Assert( frame0.type() == CV_32FC1 );
+        CV_Assert( frame1.size() == frame0.size() && frame1.type() == frame0.type() );
 
-    u.create(frame0.size(), CV_32FC1);
-    v.create(frame0.size(), CV_32FC1);
+        ncvSetDebugOutputHandler(outputHandler);
 
-    cudaDeviceProp devProp;
-    cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
+        BufferPool pool(stream);
+        GpuMat u = pool.getBuffer(frame0.size(), CV_32FC1);
+        GpuMat v = pool.getBuffer(frame0.size(), CV_32FC1);
 
-    NCVBroxOpticalFlowDescriptor desc;
+        NCVBroxOpticalFlowDescriptor desc;
+        desc.alpha = alpha_;
+        desc.gamma = gamma_;
+        desc.scale_factor = scale_factor_;
+        desc.number_of_inner_iterations = inner_iterations_;
+        desc.number_of_outer_iterations = outer_iterations_;
+        desc.number_of_solver_iterations = solver_iterations_;
 
-    desc.alpha = alpha;
-    desc.gamma = gamma;
-    desc.scale_factor = scale_factor;
-    desc.number_of_inner_iterations = inner_iterations;
-    desc.number_of_outer_iterations = outer_iterations;
-    desc.number_of_solver_iterations = solver_iterations;
+        NCVMemSegment frame0MemSeg;
+        frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
+        frame0MemSeg.size = frame0.step * frame0.rows;
 
-    NCVMemSegment frame0MemSeg;
-    frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
-    frame0MemSeg.size = frame0.step * frame0.rows;
+        NCVMemSegment frame1MemSeg;
+        frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
+        frame1MemSeg.size = frame1.step * frame1.rows;
 
-    NCVMemSegment frame1MemSeg;
-    frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
-    frame1MemSeg.size = frame1.step * frame1.rows;
+        NCVMemSegment uMemSeg;
+        uMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        uMemSeg.begin.ptr = u.ptr();
+        uMemSeg.size = u.step * u.rows;
 
-    NCVMemSegment uMemSeg;
-    uMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    uMemSeg.begin.ptr = u.ptr();
-    uMemSeg.size = u.step * u.rows;
+        NCVMemSegment vMemSeg;
+        vMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        vMemSeg.begin.ptr = v.ptr();
+        vMemSeg.size = v.step * v.rows;
 
-    NCVMemSegment vMemSeg;
-    vMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    vMemSeg.begin.ptr = v.ptr();
-    vMemSeg.size = v.step * v.rows;
+        DeviceInfo devInfo;
+        size_t textureAlignment = devInfo.textureAlignment();
 
-    NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
-    NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
-    NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
-    NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
+        NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
+        NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
+        NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
+        NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+        size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, textureAlignment);
+        GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1);
 
-    size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
+        NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(textureAlignment), buf.ptr());
 
-    ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, buf);
+        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, StreamAccessor::getStream(stream)) );
 
-    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
+        GpuMat flows[] = {u, v};
+        cuda::merge(flows, 2, _flow, stream);
+    }
+}
 
-    ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double alpha, double gamma, double scale_factor, int inner_iterations, int outer_iterations, int solver_iterations)
+{
+    return makePtr<BroxOpticalFlowImpl>(alpha, gamma, scale_factor, inner_iterations, outer_iterations, solver_iterations);
 }
 
 #endif /* HAVE_CUDA */
diff --git a/modules/cudaoptflow/src/cuda/pyrlk.cu b/modules/cudaoptflow/src/cuda/pyrlk.cu
index d4606f2281..7693551fca 100644
--- a/modules/cudaoptflow/src/cuda/pyrlk.cu
+++ b/modules/cudaoptflow/src/cuda/pyrlk.cu
@@ -472,16 +472,16 @@ namespace pyrlk
         }
     }
 
-    void loadConstants(int2 winSize, int iters)
+    void loadConstants(int2 winSize, int iters, cudaStream_t stream)
     {
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
 
         int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
 
-        cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
     }
 
     void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
diff --git a/modules/cudaoptflow/src/cuda/tvl1flow.cu b/modules/cudaoptflow/src/cuda/tvl1flow.cu
index 2b66c972bc..66f0d664a0 100644
--- a/modules/cudaoptflow/src/cuda/tvl1flow.cu
+++ b/modules/cudaoptflow/src/cuda/tvl1flow.cu
@@ -66,15 +66,16 @@ namespace tvl1flow
         dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
     }
 
-    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
 
-        centeredGradientKernel<<<grid, block>>>(src, dx, dy);
+        centeredGradientKernel<<<grid, block, 0, stream>>>(src, dx, dy);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
@@ -164,7 +165,10 @@ namespace tvl1flow
         rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
     }
 
-    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y,
+                      PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx,
+                      PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
+                      cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
@@ -173,10 +177,11 @@ namespace tvl1flow
         bindTexture(&tex_I1x, I1x);
         bindTexture(&tex_I1y, I1y);
 
-        warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
+        warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
@@ -292,15 +297,17 @@ namespace tvl1flow
                    PtrStepSzf grad, PtrStepSzf rho_c,
                    PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
                    PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
-                   float l_t, float theta, float gamma, bool calcError)
+                   float l_t, float theta, float gamma, bool calcError,
+                   cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));
 
-        estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
+        estimateUKernel<<<grid, block, 0, stream>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
@@ -346,15 +353,19 @@ namespace tvl1flow
         }
     }
 
-    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, float gamma)
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3,
+                               PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
+                               float taut, float gamma,
+                               cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));
 
-        estimateDualVariablesKernel<<<grid, block>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
+        estimateDualVariablesKernel<<<grid, block, 0, stream>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
diff --git a/modules/cudaoptflow/src/farneback.cpp b/modules/cudaoptflow/src/farneback.cpp
index 6b74432632..b7fefeb191 100644
--- a/modules/cudaoptflow/src/farneback.cpp
+++ b/modules/cudaoptflow/src/farneback.cpp
@@ -42,23 +42,21 @@
 
 #include "precomp.hpp"
 
-#define MIN_SIZE 32
-
-#define S(x) StreamAccessor::getStream(x)
-
-// CUDA resize() is fast, but it differs from the CPU analog. Disabling this flag
-// leads to an inefficient code. It's for debug purposes only.
-#define ENABLE_CUDA_RESIZE 1
-
 using namespace cv;
 using namespace cv::cuda;
 
 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
 
-void cv::cuda::FarnebackOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<FarnebackOpticalFlow> cv::cuda::FarnebackOpticalFlow::create(int, double, bool, int, int, int, double, int) { throw_no_cuda(); return Ptr<BroxOpticalFlow>(); }
 
 #else
 
+#define MIN_SIZE 32
+
+// CUDA resize() is fast, but it differs from the CPU analog. Disabling this flag
+// leads to an inefficient code. It's for debug purposes only.
+#define ENABLE_CUDA_RESIZE 1
+
 namespace cv { namespace cuda { namespace device { namespace optflow_farneback
 {
     void setPolynomialExpansionConsts(
@@ -76,8 +74,6 @@ namespace cv { namespace cuda { namespace device { namespace optflow_farneback
     void updateFlowGpu(
             const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream);
 
-    /*void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);*/
-
     void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);
 
     void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);
@@ -93,296 +89,381 @@ namespace cv { namespace cuda { namespace device { namespace optflow_farneback
     void gaussianBlur5Gpu_CC11(
             const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);
 
-}}}} // namespace cv { namespace cuda { namespace cudev { namespace optflow_farneback
+}}}}
 
 namespace
 {
-    GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
+    class FarnebackOpticalFlowImpl : public FarnebackOpticalFlow
     {
-        if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
-            return mat(Rect(0, 0, cols, rows));
+    public:
+        FarnebackOpticalFlowImpl(int numLevels, double pyrScale, bool fastPyramids, int winSize,
+                                 int numIters, int polyN, double polySigma, int flags) :
+            numLevels_(numLevels), pyrScale_(pyrScale), fastPyramids_(fastPyramids), winSize_(winSize),
+            numIters_(numIters), polyN_(polyN), polySigma_(polySigma), flags_(flags)
+        {
+        }
 
-        return mat = GpuMat(rows, cols, type);
-    }
-}
+        virtual int getNumLevels() const { return numLevels_; }
+        virtual void setNumLevels(int numLevels) { numLevels_ = numLevels; }
 
-void cv::cuda::FarnebackOpticalFlow::prepareGaussian(
-        int n, double sigma, float *g, float *xg, float *xxg,
-        double &ig11, double &ig03, double &ig33, double &ig55)
-{
-    double s = 0.;
-    for (int x = -n; x <= n; x++)
-    {
-        g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
-        s += g[x];
-    }
+        virtual double getPyrScale() const { return pyrScale_; }
+        virtual void setPyrScale(double pyrScale) { pyrScale_ = pyrScale; }
 
-    s = 1./s;
-    for (int x = -n; x <= n; x++)
-    {
-        g[x] = (float)(g[x]*s);
-        xg[x] = (float)(x*g[x]);
-        xxg[x] = (float)(x*x*g[x]);
-    }
+        virtual bool getFastPyramids() const { return fastPyramids_; }
+        virtual void setFastPyramids(bool fastPyramids) { fastPyramids_ = fastPyramids; }
 
-    Mat_<double> G(6, 6);
-    G.setTo(0);
+        virtual int getWinSize() const { return winSize_; }
+        virtual void setWinSize(int winSize) { winSize_ = winSize; }
 
-    for (int y = -n; y <= n; y++)
-    {
-        for (int x = -n; x <= n; x++)
-        {
-            G(0,0) += g[y]*g[x];
-            G(1,1) += g[y]*g[x]*x*x;
-            G(3,3) += g[y]*g[x]*x*x*x*x;
-            G(5,5) += g[y]*g[x]*x*x*y*y;
-        }
-    }
+        virtual int getNumIters() const { return numIters_; }
+        virtual void setNumIters(int numIters) { numIters_ = numIters; }
 
-    //G[0][0] = 1.;
-    G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
-    G(4,4) = G(3,3);
-    G(3,4) = G(4,3) = G(5,5);
-
-    // invG:
-    // [ x        e  e    ]
-    // [    y             ]
-    // [       y          ]
-    // [ e        z       ]
-    // [ e           z    ]
-    // [                u ]
-    Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
-
-    ig11 = invG(1,1);
-    ig03 = invG(0,3);
-    ig33 = invG(3,3);
-    ig55 = invG(5,5);
-}
+        virtual int getPolyN() const { return polyN_; }
+        virtual void setPolyN(int polyN) { polyN_ = polyN; }
 
+        virtual double getPolySigma() const { return polySigma_; }
+        virtual void setPolySigma(double polySigma) { polySigma_ = polySigma; }
 
-void cv::cuda::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double sigma)
-{
-    std::vector<float> buf(n*6 + 3);
-    float* g = &buf[0] + n;
-    float* xg = g + n*2 + 1;
-    float* xxg = xg + n*2 + 1;
+        virtual int getFlags() const { return flags_; }
+        virtual void setFlags(int flags) { flags_ = flags; }
 
-    if (sigma < FLT_EPSILON)
-        sigma = n*0.3;
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
 
-    double ig11, ig03, ig33, ig55;
-    prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
+    private:
+        int numLevels_;
+        double pyrScale_;
+        bool fastPyramids_;
+        int winSize_;
+        int numIters_;
+        int polyN_;
+        double polySigma_;
+        int flags_;
 
-    device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
-}
+    private:
+        void prepareGaussian(
+                int n, double sigma, float *g, float *xg, float *xxg,
+                double &ig11, double &ig03, double &ig33, double &ig55);
 
+        void setPolynomialExpansionConsts(int n, double sigma);
 
-void cv::cuda::FarnebackOpticalFlow::updateFlow_boxFilter(
-        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
-        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
-{
-    if (deviceSupports(FEATURE_SET_COMPUTE_12))
-        device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
-    else
-        device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
-    swap(M, bufM);
-
-    for (int i = 1; i < 5; ++i)
-        streams[i].waitForCompletion();
-    device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
-
-    if (updateMatrices)
-        device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
-}
+        void updateFlow_boxFilter(
+                const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
+                GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
 
+        void updateFlow_gaussianBlur(
+                const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
+                GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
 
-void cv::cuda::FarnebackOpticalFlow::updateFlow_gaussianBlur(
-        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
-        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
-{
-    if (deviceSupports(FEATURE_SET_COMPUTE_12))
-        device::optflow_farneback::gaussianBlur5Gpu(
-                    M, blockSize/2, bufM, BORDER_REPLICATE, S(streams[0]));
-    else
-        device::optflow_farneback::gaussianBlur5Gpu_CC11(
-                    M, blockSize/2, bufM, BORDER_REPLICATE, S(streams[0]));
-    swap(M, bufM);
-
-    device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
-
-    if (updateMatrices)
-        device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
-}
+        void calcImpl(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &stream);
 
+        GpuMat frames_[2];
+        GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
+        std::vector<GpuMat> pyramid0_, pyramid1_;
+    };
 
-void cv::cuda::FarnebackOpticalFlow::operator ()(
-        const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
-{
-    CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
-    CV_Assert(frame0.size() == frame1.size());
-    CV_Assert(polyN == 5 || polyN == 7);
-    CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);
-
-    Stream streams[5];
-    if (S(s))
-        streams[0] = s;
-
-    Size size = frame0.size();
-    GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;
-
-    flowx.create(size, CV_32F);
-    flowy.create(size, CV_32F);
-    GpuMat flowx0 = flowx;
-    GpuMat flowy0 = flowy;
-
-    // Crop unnecessary levels
-    double scale = 1;
-    int numLevelsCropped = 0;
-    for (; numLevelsCropped < numLevels; numLevelsCropped++)
+    void FarnebackOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow, Stream& stream)
     {
-        scale *= pyrScale;
-        if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
-            break;
+        const GpuMat frame0 = _frame0.getGpuMat();
+        const GpuMat frame1 = _frame1.getGpuMat();
+
+        BufferPool pool(stream);
+        GpuMat flowx = pool.getBuffer(frame0.size(), CV_32FC1);
+        GpuMat flowy = pool.getBuffer(frame0.size(), CV_32FC1);
+
+        calcImpl(frame0, frame1, flowx, flowy, stream);
+
+        GpuMat flows[] = {flowx, flowy};
+        cuda::merge(flows, 2, _flow, stream);
     }
 
-    frame0.convertTo(frames_[0], CV_32F, streams[0]);
-    frame1.convertTo(frames_[1], CV_32F, streams[1]);
+    GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
+    {
+        if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
+            return mat(Rect(0, 0, cols, rows));
+
+        return mat = GpuMat(rows, cols, type);
+    }
 
-    if (fastPyramids)
+    void FarnebackOpticalFlowImpl::prepareGaussian(
+            int n, double sigma, float *g, float *xg, float *xxg,
+            double &ig11, double &ig03, double &ig33, double &ig55)
     {
-        // Build Gaussian pyramids using pyrDown()
-        pyramid0_.resize(numLevelsCropped + 1);
-        pyramid1_.resize(numLevelsCropped + 1);
-        pyramid0_[0] = frames_[0];
-        pyramid1_[0] = frames_[1];
-        for (int i = 1; i <= numLevelsCropped; ++i)
+        double s = 0.;
+        for (int x = -n; x <= n; x++)
         {
-            cuda::pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
-            cuda::pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
+            g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
+            s += g[x];
+        }
+
+        s = 1./s;
+        for (int x = -n; x <= n; x++)
+        {
+            g[x] = (float)(g[x]*s);
+            xg[x] = (float)(x*g[x]);
+            xxg[x] = (float)(x*x*g[x]);
+        }
+
+        Mat_<double> G(6, 6);
+        G.setTo(0);
+
+        for (int y = -n; y <= n; y++)
+        {
+            for (int x = -n; x <= n; x++)
+            {
+                G(0,0) += g[y]*g[x];
+                G(1,1) += g[y]*g[x]*x*x;
+                G(3,3) += g[y]*g[x]*x*x*x*x;
+                G(5,5) += g[y]*g[x]*x*x*y*y;
+            }
         }
+
+        //G[0][0] = 1.;
+        G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
+        G(4,4) = G(3,3);
+        G(3,4) = G(4,3) = G(5,5);
+
+        // invG:
+        // [ x        e  e    ]
+        // [    y             ]
+        // [       y          ]
+        // [ e        z       ]
+        // [ e           z    ]
+        // [                u ]
+        Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
+
+        ig11 = invG(1,1);
+        ig03 = invG(0,3);
+        ig33 = invG(3,3);
+        ig55 = invG(5,5);
     }
 
-    setPolynomialExpansionConsts(polyN, polySigma);
-    device::optflow_farneback::setUpdateMatricesConsts();
+    void FarnebackOpticalFlowImpl::setPolynomialExpansionConsts(int n, double sigma)
+    {
+        std::vector<float> buf(n*6 + 3);
+        float* g = &buf[0] + n;
+        float* xg = g + n*2 + 1;
+        float* xxg = xg + n*2 + 1;
+
+        if (sigma < FLT_EPSILON)
+            sigma = n*0.3;
+
+        double ig11, ig03, ig33, ig55;
+        prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
 
-    for (int k = numLevelsCropped; k >= 0; k--)
+        device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
+    }
+
+    void FarnebackOpticalFlowImpl::updateFlow_boxFilter(
+            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
+            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
     {
-        streams[0].waitForCompletion();
+        if (deviceSupports(FEATURE_SET_COMPUTE_12))
+            device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, StreamAccessor::getStream(streams[0]));
+        else
+            device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, StreamAccessor::getStream(streams[0]));
+        swap(M, bufM);
 
-        scale = 1;
-        for (int i = 0; i < k; i++)
-            scale *= pyrScale;
+        for (int i = 1; i < 5; ++i)
+            streams[i].waitForCompletion();
+        device::optflow_farneback::updateFlowGpu(M, flowx, flowy, StreamAccessor::getStream(streams[0]));
 
-        double sigma = (1./scale - 1) * 0.5;
-        int smoothSize = cvRound(sigma*5) | 1;
-        smoothSize = std::max(smoothSize, 3);
+        if (updateMatrices)
+            device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, StreamAccessor::getStream(streams[0]));
+    }
 
-        int width = cvRound(size.width*scale);
-        int height = cvRound(size.height*scale);
+    void FarnebackOpticalFlowImpl::updateFlow_gaussianBlur(
+            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
+            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
+    {
+        if (deviceSupports(FEATURE_SET_COMPUTE_12))
+            device::optflow_farneback::gaussianBlur5Gpu(
+                        M, blockSize/2, bufM, BORDER_REPLICATE, StreamAccessor::getStream(streams[0]));
+        else
+            device::optflow_farneback::gaussianBlur5Gpu_CC11(
+                        M, blockSize/2, bufM, BORDER_REPLICATE, StreamAccessor::getStream(streams[0]));
+        swap(M, bufM);
 
-        if (fastPyramids)
-        {
-            width = pyramid0_[k].cols;
-            height = pyramid0_[k].rows;
-        }
+        device::optflow_farneback::updateFlowGpu(M, flowx, flowy, StreamAccessor::getStream(streams[0]));
 
-        if (k > 0)
+        if (updateMatrices)
+            device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, StreamAccessor::getStream(streams[0]));
+    }
+
+    void FarnebackOpticalFlowImpl::calcImpl(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &stream)
+    {
+        CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
+        CV_Assert(frame0.size() == frame1.size());
+        CV_Assert(polyN_ == 5 || polyN_ == 7);
+        CV_Assert(!fastPyramids_ || std::abs(pyrScale_ - 0.5) < 1e-6);
+
+        Stream streams[5];
+        if (stream)
+            streams[0] = stream;
+
+        Size size = frame0.size();
+        GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;
+
+        flowx.create(size, CV_32F);
+        flowy.create(size, CV_32F);
+        GpuMat flowx0 = flowx;
+        GpuMat flowy0 = flowy;
+
+        // Crop unnecessary levels
+        double scale = 1;
+        int numLevelsCropped = 0;
+        for (; numLevelsCropped < numLevels_; numLevelsCropped++)
         {
-            curFlowX.create(height, width, CV_32F);
-            curFlowY.create(height, width, CV_32F);
+            scale *= pyrScale_;
+            if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
+                break;
         }
-        else
+
+        frame0.convertTo(frames_[0], CV_32F, streams[0]);
+        frame1.convertTo(frames_[1], CV_32F, streams[1]);
+
+        if (fastPyramids_)
         {
-            curFlowX = flowx0;
-            curFlowY = flowy0;
+            // Build Gaussian pyramids using pyrDown()
+            pyramid0_.resize(numLevelsCropped + 1);
+            pyramid1_.resize(numLevelsCropped + 1);
+            pyramid0_[0] = frames_[0];
+            pyramid1_[0] = frames_[1];
+            for (int i = 1; i <= numLevelsCropped; ++i)
+            {
+                cuda::pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
+                cuda::pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
+            }
         }
 
-        if (!prevFlowX.data)
+        setPolynomialExpansionConsts(polyN_, polySigma_);
+        device::optflow_farneback::setUpdateMatricesConsts();
+
+        for (int k = numLevelsCropped; k >= 0; k--)
         {
-            if (flags & OPTFLOW_USE_INITIAL_FLOW)
+            streams[0].waitForCompletion();
+
+            scale = 1;
+            for (int i = 0; i < k; i++)
+                scale *= pyrScale_;
+
+            double sigma = (1./scale - 1) * 0.5;
+            int smoothSize = cvRound(sigma*5) | 1;
+            smoothSize = std::max(smoothSize, 3);
+
+            int width = cvRound(size.width*scale);
+            int height = cvRound(size.height*scale);
+
+            if (fastPyramids_)
             {
-                cuda::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
-                cuda::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-                curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
-                curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
+                width = pyramid0_[k].cols;
+                height = pyramid0_[k].rows;
+            }
+
+            if (k > 0)
+            {
+                curFlowX.create(height, width, CV_32F);
+                curFlowY.create(height, width, CV_32F);
             }
             else
             {
-                curFlowX.setTo(0, streams[0]);
-                curFlowY.setTo(0, streams[1]);
+                curFlowX = flowx0;
+                curFlowY = flowy0;
             }
-        }
-        else
-        {
-            cuda::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
-            cuda::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-            curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale, streams[0]);
-            curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale, streams[1]);
-        }
 
-        GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
-        GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
-        GpuMat R[2] =
-        {
-            allocMatFromBuf(5*height, width, CV_32F, R_[0]),
-            allocMatFromBuf(5*height, width, CV_32F, R_[1])
-        };
+            if (!prevFlowX.data)
+            {
+                if (flags_ & OPTFLOW_USE_INITIAL_FLOW)
+                {
+                    cuda::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
+                    cuda::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
+                    curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
+                    curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
+                }
+                else
+                {
+                    curFlowX.setTo(0, streams[0]);
+                    curFlowY.setTo(0, streams[1]);
+                }
+            }
+            else
+            {
+                cuda::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
+                cuda::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
+                curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale_, streams[0]);
+                curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale_, streams[1]);
+            }
 
-        if (fastPyramids)
-        {
-            device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0]));
-            device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1]));
-        }
-        else
-        {
-            GpuMat blurredFrame[2] =
+            GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
+            GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
+            GpuMat R[2] =
             {
-                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
-                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
+                allocMatFromBuf(5*height, width, CV_32F, R_[0]),
+                allocMatFromBuf(5*height, width, CV_32F, R_[1])
             };
-            GpuMat pyrLevel[2] =
+
+            if (fastPyramids_)
             {
-                allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
-                allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
-            };
+                device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN_, R[0], StreamAccessor::getStream(streams[0]));
+                device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN_, R[1], StreamAccessor::getStream(streams[1]));
+            }
+            else
+            {
+                GpuMat blurredFrame[2] =
+                {
+                    allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
+                    allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
+                };
+                GpuMat pyrLevel[2] =
+                {
+                    allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
+                    allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
+                };
+
+                Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
+                device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
+
+                for (int i = 0; i < 2; i++)
+                {
+                    device::optflow_farneback::gaussianBlurGpu(
+                            frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, StreamAccessor::getStream(streams[i]));
+                    cuda::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
+                    device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN_, R[i], StreamAccessor::getStream(streams[i]));
+                }
+            }
 
-            Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
-            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
+            streams[1].waitForCompletion();
+            device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, StreamAccessor::getStream(streams[0]));
 
-            for (int i = 0; i < 2; i++)
+            if (flags_ & OPTFLOW_FARNEBACK_GAUSSIAN)
             {
-                device::optflow_farneback::gaussianBlurGpu(
-                        frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, S(streams[i]));
-                cuda::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
-                device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
+                Mat g = getGaussianKernel(winSize_, winSize_/2*0.3f, CV_32F);
+                device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize_/2), winSize_/2);
+            }
+            for (int i = 0; i < numIters_; i++)
+            {
+                if (flags_ & OPTFLOW_FARNEBACK_GAUSSIAN)
+                    updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize_, i < numIters_-1, streams);
+                else
+                    updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize_, i < numIters_-1, streams);
             }
-        }
-
-        streams[1].waitForCompletion();
-        device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0]));
 
-        if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
-        {
-            Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F);
-            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2);
-        }
-        for (int i = 0; i < numIters; i++)
-        {
-            if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
-                updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
-            else
-                updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
+            prevFlowX = curFlowX;
+            prevFlowY = curFlowY;
         }
 
-        prevFlowX = curFlowX;
-        prevFlowY = curFlowY;
-    }
+        flowx = curFlowX;
+        flowy = curFlowY;
 
-    flowx = curFlowX;
-    flowy = curFlowY;
+        if (!stream)
+            streams[0].waitForCompletion();
+    }
+}
 
-    if (!S(s))
-        streams[0].waitForCompletion();
+Ptr<FarnebackOpticalFlow> cv::cuda::FarnebackOpticalFlow::create(int numLevels, double pyrScale, bool fastPyramids, int winSize,
+                                                                 int numIters, int polyN, double polySigma, int flags)
+{
+    return makePtr<FarnebackOpticalFlowImpl>(numLevels, pyrScale, fastPyramids, winSize,
+                                             numIters, polyN, polySigma, flags);
 }
 
 #endif
diff --git a/modules/cudaoptflow/src/pyrlk.cpp b/modules/cudaoptflow/src/pyrlk.cpp
index 52ee91f2fe..f4182743c0 100644
--- a/modules/cudaoptflow/src/pyrlk.cpp
+++ b/modules/cudaoptflow/src/pyrlk.cpp
@@ -47,37 +47,54 @@ using namespace cv::cuda;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-cv::cuda::PyrLKOpticalFlow::PyrLKOpticalFlow() { throw_no_cuda(); }
-void cv::cuda::PyrLKOpticalFlow::sparse(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
-void cv::cuda::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
-void cv::cuda::PyrLKOpticalFlow::releaseMemory() {}
+Ptr<SparsePyrLKOpticalFlow> cv::cuda::SparsePyrLKOpticalFlow::create(Size, int, int, bool) { throw_no_cuda(); return Ptr<SparsePyrLKOpticalFlow>(); }
+
+Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size, int, int, bool) { throw_no_cuda(); return Ptr<SparsePyrLKOpticalFlow>(); }
 
 #else /* !defined (HAVE_CUDA) */
 
 namespace pyrlk
 {
-    void loadConstants(int2 winSize, int iters);
+    void loadConstants(int2 winSize, int iters, cudaStream_t stream);
 
     void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
+                 int level, dim3 block, dim3 patch, cudaStream_t stream);
     void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
+                 int level, dim3 block, dim3 patch, cudaStream_t stream);
 
     void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
-               PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
-}
-
-cv::cuda::PyrLKOpticalFlow::PyrLKOpticalFlow()
-{
-    winSize = Size(21, 21);
-    maxLevel = 3;
-    iters = 30;
-    useInitialFlow = false;
+               PtrStepSzf err, int2 winSize, cudaStream_t stream);
 }
 
 namespace
 {
-    void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch)
+    class PyrLKOpticalFlowBase
+    {
+    public:
+        PyrLKOpticalFlowBase(Size winSize, int maxLevel, int iters, bool useInitialFlow);
+
+        void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
+            GpuMat& status, GpuMat* err, Stream& stream);
+
+        void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream);
+
+    protected:
+        Size winSize_;
+        int maxLevel_;
+        int iters_;
+        bool useInitialFlow_;
+
+    private:
+        std::vector<GpuMat> prevPyr_;
+        std::vector<GpuMat> nextPyr_;
+    };
+
+    PyrLKOpticalFlowBase::PyrLKOpticalFlowBase(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
+        winSize_(winSize), maxLevel_(maxLevel), iters_(iters), useInitialFlow_(useInitialFlow)
+    {
+    }
+
+    void calcPatchSize(Size winSize, dim3& block, dim3& patch)
     {
         if (winSize.width > 32 && winSize.width > 2 * winSize.height)
         {
@@ -95,156 +112,239 @@ namespace
 
         block.z = patch.z = 1;
     }
-}
 
-void cv::cuda::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
-{
-    if (prevPts.empty())
+    void PyrLKOpticalFlowBase::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err, Stream& stream)
     {
-        nextPts.release();
-        status.release();
-        if (err) err->release();
-        return;
-    }
+        if (prevPts.empty())
+        {
+            nextPts.release();
+            status.release();
+            if (err) err->release();
+            return;
+        }
 
-    dim3 block, patch;
-    calcPatchSize(winSize, block, patch);
+        dim3 block, patch;
+        calcPatchSize(winSize_, block, patch);
 
-    CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4);
-    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
-    CV_Assert(maxLevel >= 0);
-    CV_Assert(winSize.width > 2 && winSize.height > 2);
-    CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
-    CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
+        CV_Assert( prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4 );
+        CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
+        CV_Assert( maxLevel_ >= 0 );
+        CV_Assert( winSize_.width > 2 && winSize_.height > 2 );
+        CV_Assert( patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6 );
+        CV_Assert( prevPts.rows == 1 && prevPts.type() == CV_32FC2 );
 
-    if (useInitialFlow)
-        CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
-    else
-        ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
+        if (useInitialFlow_)
+            CV_Assert( nextPts.size() == prevPts.size() && nextPts.type() == prevPts.type() );
+        else
+            ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
 
-    GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
-    GpuMat temp2 = nextPts.reshape(1);
-    cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2);
+        GpuMat temp1 = (useInitialFlow_ ? nextPts : prevPts).reshape(1);
+        GpuMat temp2 = nextPts.reshape(1);
+        cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel_) / 2.0), temp2, 1, -1, stream);
 
-    ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
-    status.setTo(Scalar::all(1));
+        ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
+        status.setTo(Scalar::all(1), stream);
 
-    if (err)
-        ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
+        if (err)
+            ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
 
-    // build the image pyramids.
+        // build the image pyramids.
 
-    prevPyr_.resize(maxLevel + 1);
-    nextPyr_.resize(maxLevel + 1);
+        BufferPool pool(stream);
 
-    int cn = prevImg.channels();
+        prevPyr_.resize(maxLevel_ + 1);
+        nextPyr_.resize(maxLevel_ + 1);
 
-    if (cn == 1 || cn == 4)
-    {
-        prevImg.convertTo(prevPyr_[0], CV_32F);
-        nextImg.convertTo(nextPyr_[0], CV_32F);
-    }
-    else
-    {
-        cuda::cvtColor(prevImg, buf_, COLOR_BGR2BGRA);
-        buf_.convertTo(prevPyr_[0], CV_32F);
+        int cn = prevImg.channels();
+
+        if (cn == 1 || cn == 4)
+        {
+            prevImg.convertTo(prevPyr_[0], CV_32F, stream);
+            nextImg.convertTo(nextPyr_[0], CV_32F, stream);
+        }
+        else
+        {
+            GpuMat buf = pool.getBuffer(prevImg.size(), CV_MAKE_TYPE(prevImg.depth(), 4));
 
-        cuda::cvtColor(nextImg, buf_, COLOR_BGR2BGRA);
-        buf_.convertTo(nextPyr_[0], CV_32F);
+            cuda::cvtColor(prevImg, buf, COLOR_BGR2BGRA, 0, stream);
+            buf.convertTo(prevPyr_[0], CV_32F, stream);
+
+            cuda::cvtColor(nextImg, buf, COLOR_BGR2BGRA, 0, stream);
+            buf.convertTo(nextPyr_[0], CV_32F, stream);
+        }
+
+        for (int level = 1; level <= maxLevel_; ++level)
+        {
+            cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level], stream);
+            cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level], stream);
+        }
+
+        pyrlk::loadConstants(make_int2(winSize_.width, winSize_.height), iters_, StreamAccessor::getStream(stream));
+
+        for (int level = maxLevel_; level >= 0; level--)
+        {
+            if (cn == 1)
+            {
+                pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
+                               prevPts.ptr<float2>(), nextPts.ptr<float2>(),
+                               status.ptr(),
+                               level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
+                               level, block, patch,
+                               StreamAccessor::getStream(stream));
+            }
+            else
+            {
+                pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
+                               prevPts.ptr<float2>(), nextPts.ptr<float2>(),
+                               status.ptr(),
+                               level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
+                               level, block, patch,
+                               StreamAccessor::getStream(stream));
+            }
+        }
     }
 
-    for (int level = 1; level <= maxLevel; ++level)
+    void PyrLKOpticalFlowBase::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream)
     {
-        cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level]);
-        cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level]);
-    }
+        CV_Assert( prevImg.type() == CV_8UC1 );
+        CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
+        CV_Assert( maxLevel_ >= 0 );
+        CV_Assert( winSize_.width > 2 && winSize_.height > 2 );
 
-    pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);
+        // build the image pyramids.
 
-    for (int level = maxLevel; level >= 0; level--)
-    {
-        if (cn == 1)
+        prevPyr_.resize(maxLevel_ + 1);
+        nextPyr_.resize(maxLevel_ + 1);
+
+        prevPyr_[0] = prevImg;
+        nextImg.convertTo(nextPyr_[0], CV_32F, stream);
+
+        for (int level = 1; level <= maxLevel_; ++level)
         {
-            pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
-                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
-                level, block, patch);
+            cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level], stream);
+            cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level], stream);
         }
-        else
+
+        BufferPool pool(stream);
+
+        GpuMat uPyr[] = {
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+        };
+        GpuMat vPyr[] = {
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+        };
+
+        uPyr[0].setTo(Scalar::all(0), stream);
+        vPyr[0].setTo(Scalar::all(0), stream);
+        uPyr[1].setTo(Scalar::all(0), stream);
+        vPyr[1].setTo(Scalar::all(0), stream);
+
+        int2 winSize2i = make_int2(winSize_.width, winSize_.height);
+        pyrlk::loadConstants(winSize2i, iters_, StreamAccessor::getStream(stream));
+
+        int idx = 0;
+
+        for (int level = maxLevel_; level >= 0; level--)
         {
-            pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
-                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
-                level, block, patch);
+            int idx2 = (idx + 1) & 1;
+
+            pyrlk::dense(prevPyr_[level], nextPyr_[level],
+                         uPyr[idx], vPyr[idx], uPyr[idx2], vPyr[idx2],
+                         PtrStepSzf(), winSize2i,
+                         StreamAccessor::getStream(stream));
+
+            if (level > 0)
+                idx = idx2;
         }
+
+        uPyr[idx].copyTo(u, stream);
+        vPyr[idx].copyTo(v, stream);
     }
-}
 
-void cv::cuda::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
-{
-    CV_Assert(prevImg.type() == CV_8UC1);
-    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
-    CV_Assert(maxLevel >= 0);
-    CV_Assert(winSize.width > 2 && winSize.height > 2);
+    class SparsePyrLKOpticalFlowImpl : public SparsePyrLKOpticalFlow, private PyrLKOpticalFlowBase
+    {
+    public:
+        SparsePyrLKOpticalFlowImpl(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
+            PyrLKOpticalFlowBase(winSize, maxLevel, iters, useInitialFlow)
+        {
+        }
 
-    if (err)
-        err->create(prevImg.size(), CV_32FC1);
+        virtual Size getWinSize() const { return winSize_; }
+        virtual void setWinSize(Size winSize) { winSize_ = winSize; }
 
-    // build the image pyramids.
+        virtual int getMaxLevel() const { return maxLevel_; }
+        virtual void setMaxLevel(int maxLevel) { maxLevel_ = maxLevel; }
 
-    prevPyr_.resize(maxLevel + 1);
-    nextPyr_.resize(maxLevel + 1);
+        virtual int getNumIters() const { return iters_; }
+        virtual void setNumIters(int iters) { iters_ = iters; }
 
-    prevPyr_[0] = prevImg;
-    nextImg.convertTo(nextPyr_[0], CV_32F);
+        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
+        virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }
 
-    for (int level = 1; level <= maxLevel; ++level)
+        virtual void calc(InputArray _prevImg, InputArray _nextImg,
+                          InputArray _prevPts, InputOutputArray _nextPts,
+                          OutputArray _status,
+                          OutputArray _err,
+                          Stream& stream)
+        {
+            const GpuMat prevImg = _prevImg.getGpuMat();
+            const GpuMat nextImg = _nextImg.getGpuMat();
+            const GpuMat prevPts = _prevPts.getGpuMat();
+            GpuMat& nextPts = _nextPts.getGpuMatRef();
+            GpuMat& status = _status.getGpuMatRef();
+            GpuMat* err = _err.needed() ? &(_err.getGpuMatRef()) : NULL;
+
+            sparse(prevImg, nextImg, prevPts, nextPts, status, err, stream);
+        }
+    };
+
+    class DensePyrLKOpticalFlowImpl : public DensePyrLKOpticalFlow, private PyrLKOpticalFlowBase
     {
-        cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level]);
-        cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level]);
-    }
+    public:
+        DensePyrLKOpticalFlowImpl(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
+            PyrLKOpticalFlowBase(winSize, maxLevel, iters, useInitialFlow)
+        {
+        }
 
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
-    uPyr_[0].setTo(Scalar::all(0));
-    vPyr_[0].setTo(Scalar::all(0));
-    uPyr_[1].setTo(Scalar::all(0));
-    vPyr_[1].setTo(Scalar::all(0));
+        virtual Size getWinSize() const { return winSize_; }
+        virtual void setWinSize(Size winSize) { winSize_ = winSize; }
 
-    int2 winSize2i = make_int2(winSize.width, winSize.height);
-    pyrlk::loadConstants(winSize2i, iters);
+        virtual int getMaxLevel() const { return maxLevel_; }
+        virtual void setMaxLevel(int maxLevel) { maxLevel_ = maxLevel; }
 
-    PtrStepSzf derr = err ? *err : PtrStepSzf();
+        virtual int getNumIters() const { return iters_; }
+        virtual void setNumIters(int iters) { iters_ = iters; }
 
-    int idx = 0;
+        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
+        virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }
 
-    for (int level = maxLevel; level >= 0; level--)
-    {
-        int idx2 = (idx + 1) & 1;
+        virtual void calc(InputArray _prevImg, InputArray _nextImg, InputOutputArray _flow, Stream& stream)
+        {
+            const GpuMat prevImg = _prevImg.getGpuMat();
+            const GpuMat nextImg = _nextImg.getGpuMat();
 
-        pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
-            level == 0 ? derr : PtrStepSzf(), winSize2i);
+            BufferPool pool(stream);
+            GpuMat u = pool.getBuffer(prevImg.size(), CV_32FC1);
+            GpuMat v = pool.getBuffer(prevImg.size(), CV_32FC1);
 
-        if (level > 0)
-            idx = idx2;
-    }
+            dense(prevImg, nextImg, u, v, stream);
 
-    uPyr_[idx].copyTo(u);
-    vPyr_[idx].copyTo(v);
+            GpuMat flows[] = {u, v};
+            cuda::merge(flows, 2, _flow, stream);
+        }
+    };
 }
 
-void cv::cuda::PyrLKOpticalFlow::releaseMemory()
+Ptr<SparsePyrLKOpticalFlow> cv::cuda::SparsePyrLKOpticalFlow::create(Size winSize, int maxLevel, int iters, bool useInitialFlow)
 {
-    prevPyr_.clear();
-    nextPyr_.clear();
-
-    buf_.release();
-
-    uPyr_[0].release();
-    vPyr_[0].release();
+    return makePtr<SparsePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow);
+}
 
-    uPyr_[1].release();
-    vPyr_[1].release();
+Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size winSize, int maxLevel, int iters, bool useInitialFlow)
+{
+    return makePtr<DensePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow);
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/cudaoptflow/src/tvl1flow.cpp b/modules/cudaoptflow/src/tvl1flow.cpp
index b8dfea56f1..e2ef07b0d1 100644
--- a/modules/cudaoptflow/src/tvl1flow.cpp
+++ b/modules/cudaoptflow/src/tvl1flow.cpp
@@ -44,256 +44,338 @@
 
 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
 
-cv::cuda::OpticalFlowDual_TVL1_CUDA::OpticalFlowDual_TVL1_CUDA() { throw_no_cuda(); }
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::collectGarbage() {}
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::procOneScale(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
+Ptr<OpticalFlowDual_TVL1> cv::cuda::OpticalFlowDual_TVL1::create(double, double, double, int, int, double, int, double, double, bool) { throw_no_cuda(); return Ptr<OpticalFlowDual_TVL1>(); }
 
 #else
 
 using namespace cv;
 using namespace cv::cuda;
 
-cv::cuda::OpticalFlowDual_TVL1_CUDA::OpticalFlowDual_TVL1_CUDA()
+namespace tvl1flow
 {
-    tau            = 0.25;
-    lambda         = 0.15;
-    theta          = 0.3;
-    nscales        = 5;
-    warps          = 5;
-    epsilon        = 0.01;
-    iterations     = 300;
-    scaleStep      = 0.8;
-    gamma           = 0.0;
-    useInitialFlow = false;
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy, cudaStream_t stream);
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y,
+                      PtrStepSzf u1, PtrStepSzf u2,
+                      PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy,
+                      PtrStepSzf grad, PtrStepSzf rho,
+                      cudaStream_t stream);
+    void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
+                   PtrStepSzf grad, PtrStepSzf rho_c,
+                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
+                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
+                   float l_t, float theta, float gamma, bool calcError,
+                   cudaStream_t stream);
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3,
+                               PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
+                               float taut, float gamma,
+                               cudaStream_t stream);
 }
 
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy)
+namespace
 {
-    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
-    CV_Assert( I0.size() == I1.size() );
-    CV_Assert( I0.type() == I1.type() );
-    CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
-    CV_Assert( nscales > 0 );
-
-    // allocate memory for the pyramid structure
-    I0s.resize(nscales);
-    I1s.resize(nscales);
-    u1s.resize(nscales);
-    u2s.resize(nscales);
-    u3s.resize(nscales);
-
-    I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0);
-    I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
-
-    if (!useInitialFlow)
+    class OpticalFlowDual_TVL1_Impl : public OpticalFlowDual_TVL1
     {
-        flowx.create(I0.size(), CV_32FC1);
-        flowy.create(I0.size(), CV_32FC1);
-    }
+    public:
+        OpticalFlowDual_TVL1_Impl(double tau, double lambda, double theta, int nscales, int warps, double epsilon,
+                                  int iterations, double scaleStep, double gamma, bool useInitialFlow) :
+            tau_(tau), lambda_(lambda), gamma_(gamma), theta_(theta), nscales_(nscales), warps_(warps),
+            epsilon_(epsilon), iterations_(iterations), scaleStep_(scaleStep), useInitialFlow_(useInitialFlow)
+        {
+        }
+
+        virtual double getTau() const { return tau_; }
+        virtual void setTau(double tau) { tau_ = tau; }
+
+        virtual double getLambda() const { return lambda_; }
+        virtual void setLambda(double lambda) { lambda_ = lambda; }
+
+        virtual double getGamma() const { return gamma_; }
+        virtual void setGamma(double gamma) { gamma_ = gamma; }
+
+        virtual double getTheta() const { return theta_; }
+        virtual void setTheta(double theta) { theta_ = theta; }
+
+        virtual int getNumScales() const { return nscales_; }
+        virtual void setNumScales(int nscales) { nscales_ = nscales; }
+
+        virtual int getNumWarps() const { return warps_; }
+        virtual void setNumWarps(int warps) { warps_ = warps; }
+
+        virtual double getEpsilon() const { return epsilon_; }
+        virtual void setEpsilon(double epsilon) { epsilon_ = epsilon; }
+
+        virtual int getNumIterations() const { return iterations_; }
+        virtual void setNumIterations(int iterations) { iterations_ = iterations; }
+
+        virtual double getScaleStep() const { return scaleStep_; }
+        virtual void setScaleStep(double scaleStep) { scaleStep_ = scaleStep; }
+
+        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
+        virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }
+
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
+
+    private:
+        double tau_;
+        double lambda_;
+        double gamma_;
+        double theta_;
+        int nscales_;
+        int warps_;
+        double epsilon_;
+        int iterations_;
+        double scaleStep_;
+        bool useInitialFlow_;
+
+    private:
+        void calcImpl(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, Stream& stream);
+        void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3, Stream& stream);
 
-    u1s[0] = flowx;
-    u2s[0] = flowy;
-    if (gamma)
-        u3s[0].create(I0.size(), CV_32FC1);
+        std::vector<GpuMat> I0s;
+        std::vector<GpuMat> I1s;
+        std::vector<GpuMat> u1s;
+        std::vector<GpuMat> u2s;
+        std::vector<GpuMat> u3s;
 
-    I1x_buf.create(I0.size(), CV_32FC1);
-    I1y_buf.create(I0.size(), CV_32FC1);
+        GpuMat I1x_buf;
+        GpuMat I1y_buf;
 
-    I1w_buf.create(I0.size(), CV_32FC1);
-    I1wx_buf.create(I0.size(), CV_32FC1);
-    I1wy_buf.create(I0.size(), CV_32FC1);
+        GpuMat I1w_buf;
+        GpuMat I1wx_buf;
+        GpuMat I1wy_buf;
 
-    grad_buf.create(I0.size(), CV_32FC1);
-    rho_c_buf.create(I0.size(), CV_32FC1);
+        GpuMat grad_buf;
+        GpuMat rho_c_buf;
 
-    p11_buf.create(I0.size(), CV_32FC1);
-    p12_buf.create(I0.size(), CV_32FC1);
-    p21_buf.create(I0.size(), CV_32FC1);
-    p22_buf.create(I0.size(), CV_32FC1);
-    if (gamma)
+        GpuMat p11_buf;
+        GpuMat p12_buf;
+        GpuMat p21_buf;
+        GpuMat p22_buf;
+        GpuMat p31_buf;
+        GpuMat p32_buf;
+
+        GpuMat diff_buf;
+        GpuMat norm_buf;
+    };
+
+    void OpticalFlowDual_TVL1_Impl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow, Stream& stream)
     {
-        p31_buf.create(I0.size(), CV_32FC1);
-        p32_buf.create(I0.size(), CV_32FC1);
+        const GpuMat frame0 = _frame0.getGpuMat();
+        const GpuMat frame1 = _frame1.getGpuMat();
+
+        BufferPool pool(stream);
+        GpuMat flowx = pool.getBuffer(frame0.size(), CV_32FC1);
+        GpuMat flowy = pool.getBuffer(frame0.size(), CV_32FC1);
+
+        calcImpl(frame0, frame1, flowx, flowy, stream);
+
+        GpuMat flows[] = {flowx, flowy};
+        cuda::merge(flows, 2, _flow, stream);
     }
-    diff_buf.create(I0.size(), CV_32FC1);
 
-    // create the scales
-    for (int s = 1; s < nscales; ++s)
+    void OpticalFlowDual_TVL1_Impl::calcImpl(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, Stream& stream)
     {
-        cuda::resize(I0s[s-1], I0s[s], Size(), scaleStep, scaleStep);
-        cuda::resize(I1s[s-1], I1s[s], Size(), scaleStep, scaleStep);
-
-        if (I0s[s].cols < 16 || I0s[s].rows < 16)
+        CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
+        CV_Assert( I0.size() == I1.size() );
+        CV_Assert( I0.type() == I1.type() );
+        CV_Assert( !useInitialFlow_ || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
+        CV_Assert( nscales_ > 0 );
+
+        // allocate memory for the pyramid structure
+        I0s.resize(nscales_);
+        I1s.resize(nscales_);
+        u1s.resize(nscales_);
+        u2s.resize(nscales_);
+        u3s.resize(nscales_);
+
+        I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0, stream);
+        I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0, stream);
+
+        if (!useInitialFlow_)
         {
-            nscales = s;
-            break;
+            flowx.create(I0.size(), CV_32FC1);
+            flowy.create(I0.size(), CV_32FC1);
         }
 
-        if (useInitialFlow)
+        u1s[0] = flowx;
+        u2s[0] = flowy;
+        if (gamma_)
         {
-            cuda::resize(u1s[s-1], u1s[s], Size(), scaleStep, scaleStep);
-            cuda::resize(u2s[s-1], u2s[s], Size(), scaleStep, scaleStep);
+            u3s[0].create(I0.size(), CV_32FC1);
+        }
+
+        I1x_buf.create(I0.size(), CV_32FC1);
+        I1y_buf.create(I0.size(), CV_32FC1);
+
+        I1w_buf.create(I0.size(), CV_32FC1);
+        I1wx_buf.create(I0.size(), CV_32FC1);
+        I1wy_buf.create(I0.size(), CV_32FC1);
 
-            cuda::multiply(u1s[s], Scalar::all(scaleStep), u1s[s]);
-            cuda::multiply(u2s[s], Scalar::all(scaleStep), u2s[s]);
+        grad_buf.create(I0.size(), CV_32FC1);
+        rho_c_buf.create(I0.size(), CV_32FC1);
+
+        p11_buf.create(I0.size(), CV_32FC1);
+        p12_buf.create(I0.size(), CV_32FC1);
+        p21_buf.create(I0.size(), CV_32FC1);
+        p22_buf.create(I0.size(), CV_32FC1);
+        if (gamma_)
+        {
+            p31_buf.create(I0.size(), CV_32FC1);
+            p32_buf.create(I0.size(), CV_32FC1);
         }
-        else
+        diff_buf.create(I0.size(), CV_32FC1);
+
+        // create the scales
+        for (int s = 1; s < nscales_; ++s)
         {
-            u1s[s].create(I0s[s].size(), CV_32FC1);
-            u2s[s].create(I0s[s].size(), CV_32FC1);
+            cuda::resize(I0s[s-1], I0s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+            cuda::resize(I1s[s-1], I1s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+
+            if (I0s[s].cols < 16 || I0s[s].rows < 16)
+            {
+                nscales_ = s;
+                break;
+            }
+
+            if (useInitialFlow_)
+            {
+                cuda::resize(u1s[s-1], u1s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+                cuda::resize(u2s[s-1], u2s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+
+                cuda::multiply(u1s[s], Scalar::all(scaleStep_), u1s[s], 1, -1, stream);
+                cuda::multiply(u2s[s], Scalar::all(scaleStep_), u2s[s], 1, -1, stream);
+            }
+            else
+            {
+                u1s[s].create(I0s[s].size(), CV_32FC1);
+                u2s[s].create(I0s[s].size(), CV_32FC1);
+            }
+            if (gamma_)
+            {
+                u3s[s].create(I0s[s].size(), CV_32FC1);
+            }
         }
-        if (gamma)
-            u3s[s].create(I0s[s].size(), CV_32FC1);
-    }
 
-    if (!useInitialFlow)
-    {
-        u1s[nscales-1].setTo(Scalar::all(0));
-        u2s[nscales-1].setTo(Scalar::all(0));
-    }
-    if (gamma)
-        u3s[nscales - 1].setTo(Scalar::all(0));
+        if (!useInitialFlow_)
+        {
+            u1s[nscales_-1].setTo(Scalar::all(0), stream);
+            u2s[nscales_-1].setTo(Scalar::all(0), stream);
+        }
+        if (gamma_)
+        {
+            u3s[nscales_ - 1].setTo(Scalar::all(0), stream);
+        }
 
-    // pyramidal structure for computing the optical flow
-    for (int s = nscales - 1; s >= 0; --s)
-    {
-        // compute the optical flow at the current scale
-        procOneScale(I0s[s], I1s[s], u1s[s], u2s[s], u3s[s]);
+        // pyramidal structure for computing the optical flow
+        for (int s = nscales_ - 1; s >= 0; --s)
+        {
+            // compute the optical flow at the current scale
+            procOneScale(I0s[s], I1s[s], u1s[s], u2s[s], u3s[s], stream);
 
-        // if this was the last scale, finish now
-        if (s == 0)
-            break;
+            // if this was the last scale, finish now
+            if (s == 0)
+                break;
 
-        // otherwise, upsample the optical flow
+            // otherwise, upsample the optical flow
 
-        // zoom the optical flow for the next finer scale
-        cuda::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
-        cuda::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
-        if (gamma)
-            cuda::resize(u3s[s], u3s[s - 1], I0s[s - 1].size());
+            // zoom the optical flow for the next finer scale
+            cuda::resize(u1s[s], u1s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
+            cuda::resize(u2s[s], u2s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
+            if (gamma_)
+            {
+                cuda::resize(u3s[s], u3s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
+            }
 
-        // scale the optical flow with the appropriate zoom factor
-        cuda::multiply(u1s[s - 1], Scalar::all(1/scaleStep), u1s[s - 1]);
-        cuda::multiply(u2s[s - 1], Scalar::all(1/scaleStep), u2s[s - 1]);
+            // scale the optical flow with the appropriate zoom factor
+            cuda::multiply(u1s[s - 1], Scalar::all(1/scaleStep_), u1s[s - 1], 1, -1, stream);
+            cuda::multiply(u2s[s - 1], Scalar::all(1/scaleStep_), u2s[s - 1], 1, -1, stream);
+        }
     }
-}
-
-namespace tvl1flow
-{
-    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy);
-    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho);
-    void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
-                   PtrStepSzf grad, PtrStepSzf rho_c,
-                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
-                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
-                   float l_t, float theta, float gamma, bool calcError);
-    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, const float gamma);
-}
 
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3)
-{
-    using namespace tvl1flow;
+    void OpticalFlowDual_TVL1_Impl::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3, Stream& _stream)
+    {
+        using namespace tvl1flow;
 
-    const double scaledEpsilon = epsilon * epsilon * I0.size().area();
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
 
-    CV_DbgAssert( I1.size() == I0.size() );
-    CV_DbgAssert( I1.type() == I0.type() );
-    CV_DbgAssert( u1.size() == I0.size() );
-    CV_DbgAssert( u2.size() == u1.size() );
+        const double scaledEpsilon = epsilon_ * epsilon_ * I0.size().area();
 
-    GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
-    centeredGradient(I1, I1x, I1y);
+        CV_DbgAssert( I1.size() == I0.size() );
+        CV_DbgAssert( I1.type() == I0.type() );
+        CV_DbgAssert( u1.size() == I0.size() );
+        CV_DbgAssert( u2.size() == u1.size() );
 
-    GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
+        centeredGradient(I1, I1x, I1y, stream);
 
-    GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
 
-    GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p31, p32;
-    if (gamma)
-    {
-        p31 = p31_buf(Rect(0, 0, I0.cols, I0.rows));
-        p32 = p32_buf(Rect(0, 0, I0.cols, I0.rows));
-    }
-    p11.setTo(Scalar::all(0));
-    p12.setTo(Scalar::all(0));
-    p21.setTo(Scalar::all(0));
-    p22.setTo(Scalar::all(0));
-    if (gamma)
-    {
-        p31.setTo(Scalar::all(0));
-        p32.setTo(Scalar::all(0));
-    }
+        GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
 
-    GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p31, p32;
+        if (gamma_)
+        {
+            p31 = p31_buf(Rect(0, 0, I0.cols, I0.rows));
+            p32 = p32_buf(Rect(0, 0, I0.cols, I0.rows));
+        }
+        p11.setTo(Scalar::all(0), _stream);
+        p12.setTo(Scalar::all(0), _stream);
+        p21.setTo(Scalar::all(0), _stream);
+        p22.setTo(Scalar::all(0), _stream);
+        if (gamma_)
+        {
+            p31.setTo(Scalar::all(0), _stream);
+            p32.setTo(Scalar::all(0), _stream);
+        }
 
-    const float l_t = static_cast<float>(lambda * theta);
-    const float taut = static_cast<float>(tau / theta);
+        GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
 
-    for (int warpings = 0; warpings < warps; ++warpings)
-    {
-        warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
+        const float l_t = static_cast<float>(lambda_ * theta_);
+        const float taut = static_cast<float>(tau_ / theta_);
 
-        double error = std::numeric_limits<double>::max();
-        double prevError = 0.0;
-        for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
+        for (int warpings = 0; warpings < warps_; ++warpings)
         {
-            // some tweaks to make sum operation less frequently
-            bool calcError = (epsilon > 0) && (n & 0x1) && (prevError < scaledEpsilon);
-            estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, diff, l_t, static_cast<float>(theta), gamma, calcError);
-            if (calcError)
-            {
-                error = cuda::sum(diff, norm_buf)[0];
-                prevError = error;
-            }
-            else
+            warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c, stream);
+
+            double error = std::numeric_limits<double>::max();
+            double prevError = 0.0;
+            for (int n = 0; error > scaledEpsilon && n < iterations_; ++n)
             {
-                error = std::numeric_limits<double>::max();
-                prevError -= scaledEpsilon;
+                // some tweaks to make sum operation less frequently
+                bool calcError = (epsilon_ > 0) && (n & 0x1) && (prevError < scaledEpsilon);
+                estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, diff, l_t, static_cast<float>(theta_), gamma_, calcError, stream);
+                if (calcError)
+                {
+                    _stream.waitForCompletion();
+                    error = cuda::sum(diff, norm_buf)[0];
+                    prevError = error;
+                }
+                else
+                {
+                    error = std::numeric_limits<double>::max();
+                    prevError -= scaledEpsilon;
+                }
+
+                estimateDualVariables(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma_, stream);
             }
-
-            estimateDualVariables(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
         }
     }
 }
 
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::collectGarbage()
+Ptr<OpticalFlowDual_TVL1> cv::cuda::OpticalFlowDual_TVL1::create(
+            double tau, double lambda, double theta, int nscales, int warps,
+            double epsilon, int iterations, double scaleStep, double gamma, bool useInitialFlow)
 {
-    I0s.clear();
-    I1s.clear();
-    u1s.clear();
-    u2s.clear();
-    u3s.clear();
-
-    I1x_buf.release();
-    I1y_buf.release();
-
-    I1w_buf.release();
-    I1wx_buf.release();
-    I1wy_buf.release();
-
-    grad_buf.release();
-    rho_c_buf.release();
-
-    p11_buf.release();
-    p12_buf.release();
-    p21_buf.release();
-    p22_buf.release();
-    if (gamma)
-    {
-        p31_buf.release();
-        p32_buf.release();
-    }
-    diff_buf.release();
-    norm_buf.release();
+    return makePtr<OpticalFlowDual_TVL1_Impl>(tau, lambda, theta, nscales, warps,
+                                              epsilon, iterations, scaleStep, gamma, useInitialFlow);
 }
 
 #endif // !defined HAVE_CUDA || defined(CUDA_DISABLER)

From 2dc3b0f7f95135648d6a05f61e7e47787b3d63f2 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 31 Dec 2014 15:36:31 +0300
Subject: [PATCH 05/10] fix cudaoptflow tests build

---
 modules/cudaoptflow/perf/perf_optflow.cpp | 81 ++++++++++++---------
 modules/cudaoptflow/perf/perf_precomp.hpp |  1 +
 modules/cudaoptflow/test/test_optflow.cpp | 87 ++++++++++++-----------
 modules/cudaoptflow/test/test_precomp.hpp |  1 +
 4 files changed, 98 insertions(+), 72 deletions(-)

diff --git a/modules/cudaoptflow/perf/perf_optflow.cpp b/modules/cudaoptflow/perf/perf_optflow.cpp
index 12612b0622..32040f282c 100644
--- a/modules/cudaoptflow/perf/perf_optflow.cpp
+++ b/modules/cudaoptflow/perf/perf_optflow.cpp
@@ -71,13 +71,19 @@ PERF_TEST_P(ImagePair, BroxOpticalFlow,
     {
         const cv::cuda::GpuMat d_frame0(frame0);
         const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u;
-        cv::cuda::GpuMat v;
+        cv::cuda::GpuMat flow;
 
-        cv::cuda::BroxOpticalFlow d_flow(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
-                                        10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
+        cv::Ptr<cv::cuda::BroxOpticalFlow> d_alg =
+                cv::cuda::BroxOpticalFlow::create(0.197 /*alpha*/, 50.0 /*gamma*/, 0.8 /*scale_factor*/,
+                                                  10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
 
-        TEST_CYCLE() d_flow(d_frame0, d_frame1, u, v);
+        TEST_CYCLE() d_alg->calc(d_frame0, d_frame1, flow);
+
+        cv::cuda::GpuMat flows[2];
+        cv::cuda::split(flow, flows);
+
+        cv::cuda::GpuMat u = flows[0];
+        cv::cuda::GpuMat v = flows[1];
 
         CUDA_SANITY_CHECK(u, 1e-1);
         CUDA_SANITY_CHECK(v, 1e-1);
@@ -129,17 +135,17 @@ PERF_TEST_P(ImagePair_Gray_NPts_WinSz_Levels_Iters, PyrLKOpticalFlowSparse,
     {
         const cv::cuda::GpuMat d_pts(pts.reshape(2, 1));
 
-        cv::cuda::PyrLKOpticalFlow d_pyrLK;
-        d_pyrLK.winSize = cv::Size(winSize, winSize);
-        d_pyrLK.maxLevel = levels - 1;
-        d_pyrLK.iters = iters;
+        cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> d_pyrLK =
+                cv::cuda::SparsePyrLKOpticalFlow::create(cv::Size(winSize, winSize),
+                                                         levels - 1,
+                                                         iters);
 
         const cv::cuda::GpuMat d_frame0(frame0);
         const cv::cuda::GpuMat d_frame1(frame1);
         cv::cuda::GpuMat nextPts;
         cv::cuda::GpuMat status;
 
-        TEST_CYCLE() d_pyrLK.sparse(d_frame0, d_frame1, d_pts, nextPts, status);
+        TEST_CYCLE() d_pyrLK->calc(d_frame0, d_frame1, d_pts, nextPts, status);
 
         CUDA_SANITY_CHECK(nextPts);
         CUDA_SANITY_CHECK(status);
@@ -189,15 +195,20 @@ PERF_TEST_P(ImagePair_WinSz_Levels_Iters, PyrLKOpticalFlowDense,
     {
         const cv::cuda::GpuMat d_frame0(frame0);
         const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u;
-        cv::cuda::GpuMat v;
+        cv::cuda::GpuMat flow;
+
+        cv::Ptr<cv::cuda::DensePyrLKOpticalFlow> d_pyrLK =
+                cv::cuda::DensePyrLKOpticalFlow::create(cv::Size(winSize, winSize),
+                                                        levels - 1,
+                                                        iters);
+
+        TEST_CYCLE() d_pyrLK->calc(d_frame0, d_frame1, flow);
 
-        cv::cuda::PyrLKOpticalFlow d_pyrLK;
-        d_pyrLK.winSize = cv::Size(winSize, winSize);
-        d_pyrLK.maxLevel = levels - 1;
-        d_pyrLK.iters = iters;
+        cv::cuda::GpuMat flows[2];
+        cv::cuda::split(flow, flows);
 
-        TEST_CYCLE() d_pyrLK.dense(d_frame0, d_frame1, u, v);
+        cv::cuda::GpuMat u = flows[0];
+        cv::cuda::GpuMat v = flows[1];
 
         CUDA_SANITY_CHECK(u);
         CUDA_SANITY_CHECK(v);
@@ -234,19 +245,19 @@ PERF_TEST_P(ImagePair, FarnebackOpticalFlow,
     {
         const cv::cuda::GpuMat d_frame0(frame0);
         const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u;
-        cv::cuda::GpuMat v;
+        cv::cuda::GpuMat flow;
 
-        cv::cuda::FarnebackOpticalFlow d_farneback;
-        d_farneback.numLevels = numLevels;
-        d_farneback.pyrScale = pyrScale;
-        d_farneback.winSize = winSize;
-        d_farneback.numIters = numIters;
-        d_farneback.polyN = polyN;
-        d_farneback.polySigma = polySigma;
-        d_farneback.flags = flags;
+        cv::Ptr<cv::cuda::FarnebackOpticalFlow> d_farneback =
+                cv::cuda::FarnebackOpticalFlow::create(numLevels, pyrScale, false, winSize,
+                                                       numIters, polyN, polySigma, flags);
 
-        TEST_CYCLE() d_farneback(d_frame0, d_frame1, u, v);
+        TEST_CYCLE() d_farneback->calc(d_frame0, d_frame1, flow);
+
+        cv::cuda::GpuMat flows[2];
+        cv::cuda::split(flow, flows);
+
+        cv::cuda::GpuMat u = flows[0];
+        cv::cuda::GpuMat v = flows[1];
 
         CUDA_SANITY_CHECK(u, 1e-4);
         CUDA_SANITY_CHECK(v, 1e-4);
@@ -279,12 +290,18 @@ PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1,
     {
         const cv::cuda::GpuMat d_frame0(frame0);
         const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u;
-        cv::cuda::GpuMat v;
+        cv::cuda::GpuMat flow;
+
+        cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> d_alg =
+                cv::cuda::OpticalFlowDual_TVL1::create();
+
+        TEST_CYCLE() d_alg->calc(d_frame0, d_frame1, flow);
 
-        cv::cuda::OpticalFlowDual_TVL1_CUDA d_alg;
+        cv::cuda::GpuMat flows[2];
+        cv::cuda::split(flow, flows);
 
-        TEST_CYCLE() d_alg(d_frame0, d_frame1, u, v);
+        cv::cuda::GpuMat u = flows[0];
+        cv::cuda::GpuMat v = flows[1];
 
         CUDA_SANITY_CHECK(u, 1e-1);
         CUDA_SANITY_CHECK(v, 1e-1);
diff --git a/modules/cudaoptflow/perf/perf_precomp.hpp b/modules/cudaoptflow/perf/perf_precomp.hpp
index 1dc00ae4b7..d7761a587a 100644
--- a/modules/cudaoptflow/perf/perf_precomp.hpp
+++ b/modules/cudaoptflow/perf/perf_precomp.hpp
@@ -55,6 +55,7 @@
 #include "opencv2/ts/cuda_perf.hpp"
 
 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"
 #include "opencv2/video.hpp"
 
 #ifdef GTEST_CREATE_SHARED_LIBRARY
diff --git a/modules/cudaoptflow/test/test_optflow.cpp b/modules/cudaoptflow/test/test_optflow.cpp
index 7a6e68310f..c5b2ad8478 100644
--- a/modules/cudaoptflow/test/test_optflow.cpp
+++ b/modules/cudaoptflow/test/test_optflow.cpp
@@ -71,12 +71,18 @@ CUDA_TEST_P(BroxOpticalFlow, Regression)
     cv::Mat frame1 = readImageType("opticalflow/frame1.png", CV_32FC1);
     ASSERT_FALSE(frame1.empty());
 
-    cv::cuda::BroxOpticalFlow brox(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
-                                  10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
+    cv::Ptr<cv::cuda::BroxOpticalFlow> brox =
+            cv::cuda::BroxOpticalFlow::create(0.197 /*alpha*/, 50.0 /*gamma*/, 0.8 /*scale_factor*/,
+                                              10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
 
-    cv::cuda::GpuMat u;
-    cv::cuda::GpuMat v;
-    brox(loadMat(frame0), loadMat(frame1), u, v);
+    cv::cuda::GpuMat flow;
+    brox->calc(loadMat(frame0), loadMat(frame1), flow);
+
+    cv::cuda::GpuMat flows[2];
+    cv::cuda::split(flow, flows);
+
+    cv::cuda::GpuMat u = flows[0];
+    cv::cuda::GpuMat v = flows[1];
 
     std::string fname(cvtest::TS::ptr()->get_data_path());
     if (devInfo.majorVersion() >= 2)
@@ -133,12 +139,18 @@ CUDA_TEST_P(BroxOpticalFlow, OpticalFlowNan)
     cv::resize(frame0, r_frame0, cv::Size(1380,1000));
     cv::resize(frame1, r_frame1, cv::Size(1380,1000));
 
-    cv::cuda::BroxOpticalFlow brox(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
-                                  5 /*inner_iterations*/, 150 /*outer_iterations*/, 10 /*solver_iterations*/);
+    cv::Ptr<cv::cuda::BroxOpticalFlow> brox =
+            cv::cuda::BroxOpticalFlow::create(0.197 /*alpha*/, 50.0 /*gamma*/, 0.8 /*scale_factor*/,
+                                              10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
+
+    cv::cuda::GpuMat flow;
+    brox->calc(loadMat(frame0), loadMat(frame1), flow);
 
-    cv::cuda::GpuMat u;
-    cv::cuda::GpuMat v;
-    brox(loadMat(r_frame0), loadMat(r_frame1), u, v);
+    cv::cuda::GpuMat flows[2];
+    cv::cuda::split(flow, flows);
+
+    cv::cuda::GpuMat u = flows[0];
+    cv::cuda::GpuMat v = flows[1];
 
     cv::Mat h_u, h_v;
     u.download(h_u);
@@ -193,11 +205,12 @@ CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
     cv::Mat pts_mat(1, (int) pts.size(), CV_32FC2, (void*) &pts[0]);
     d_pts.upload(pts_mat);
 
-    cv::cuda::PyrLKOpticalFlow pyrLK;
+    cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> pyrLK =
+            cv::cuda::SparsePyrLKOpticalFlow::create();
 
     cv::cuda::GpuMat d_nextPts;
     cv::cuda::GpuMat d_status;
-    pyrLK.sparse(loadMat(frame0), loadMat(frame1), d_pts, d_nextPts, d_status);
+    pyrLK->calc(loadMat(frame0), loadMat(frame1), d_pts, d_nextPts, d_status);
 
     std::vector<cv::Point2f> nextPts(d_nextPts.cols);
     cv::Mat nextPts_mat(1, d_nextPts.cols, CV_32FC2, (void*) &nextPts[0]);
@@ -285,34 +298,30 @@ CUDA_TEST_P(FarnebackOpticalFlow, Accuracy)
 
     double polySigma = polyN <= 5 ? 1.1 : 1.5;
 
-    cv::cuda::FarnebackOpticalFlow farn;
-    farn.pyrScale = pyrScale;
-    farn.polyN = polyN;
-    farn.polySigma = polySigma;
-    farn.flags = flags;
+    cv::Ptr<cv::cuda::FarnebackOpticalFlow> farn =
+            cv::cuda::FarnebackOpticalFlow::create();
+    farn->setPyrScale(pyrScale);
+    farn->setPolyN(polyN);
+    farn->setPolySigma(polySigma);
+    farn->setFlags(flags);
 
-    cv::cuda::GpuMat d_flowx, d_flowy;
-    farn(loadMat(frame0), loadMat(frame1), d_flowx, d_flowy);
+    cv::cuda::GpuMat d_flow;
+    farn->calc(loadMat(frame0), loadMat(frame1), d_flow);
 
     cv::Mat flow;
     if (useInitFlow)
     {
-        cv::Mat flowxy[] = {cv::Mat(d_flowx), cv::Mat(d_flowy)};
-        cv::merge(flowxy, 2, flow);
+        d_flow.download(flow);
 
-        farn.flags |= cv::OPTFLOW_USE_INITIAL_FLOW;
-        farn(loadMat(frame0), loadMat(frame1), d_flowx, d_flowy);
+        farn->setFlags(farn->getFlags() | cv::OPTFLOW_USE_INITIAL_FLOW);
+        farn->calc(loadMat(frame0), loadMat(frame1), d_flow);
     }
 
     cv::calcOpticalFlowFarneback(
-        frame0, frame1, flow, farn.pyrScale, farn.numLevels, farn.winSize,
-        farn.numIters, farn.polyN, farn.polySigma, farn.flags);
-
-    std::vector<cv::Mat> flowxy;
-    cv::split(flow, flowxy);
+        frame0, frame1, flow, farn->getPyrScale(), farn->getNumLevels(), farn->getWinSize(),
+        farn->getNumIters(), farn->getPolyN(), farn->getPolySigma(), farn->getFlags());
 
-    EXPECT_MAT_SIMILAR(flowxy[0], d_flowx, 0.1);
-    EXPECT_MAT_SIMILAR(flowxy[1], d_flowy, 0.1);
+    EXPECT_MAT_SIMILAR(flow, d_flow, 0.1);
 }
 
 INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, FarnebackOpticalFlow, testing::Combine(
@@ -352,26 +361,24 @@ CUDA_TEST_P(OpticalFlowDual_TVL1, Accuracy)
     cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(frame1.empty());
 
-    cv::cuda::OpticalFlowDual_TVL1_CUDA d_alg;
-    d_alg.iterations = 10;
-    d_alg.gamma = gamma;
+    cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> d_alg =
+            cv::cuda::OpticalFlowDual_TVL1::create();
+    d_alg->setNumIterations(10);
+    d_alg->setGamma(gamma);
 
-    cv::cuda::GpuMat d_flowx, d_flowy;
-    d_alg(loadMat(frame0), loadMat(frame1), d_flowx, d_flowy);
+    cv::cuda::GpuMat d_flow;
+    d_alg->calc(loadMat(frame0), loadMat(frame1), d_flow);
 
     cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
     alg->set("medianFiltering", 1);
     alg->set("innerIterations", 1);
-    alg->set("outerIterations", d_alg.iterations);
+    alg->set("outerIterations", d_alg->getNumIterations());
     alg->set("gamma", gamma);
 
     cv::Mat flow;
     alg->calc(frame0, frame1, flow);
-    cv::Mat gold[2];
-    cv::split(flow, gold);
 
-    EXPECT_MAT_SIMILAR(gold[0], d_flowx, 4e-3);
-    EXPECT_MAT_SIMILAR(gold[1], d_flowy, 4e-3);
+    EXPECT_MAT_SIMILAR(flow, d_flow, 4e-3);
 }
 
 INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, OpticalFlowDual_TVL1, testing::Combine(
diff --git a/modules/cudaoptflow/test/test_precomp.hpp b/modules/cudaoptflow/test/test_precomp.hpp
index 54812022a6..2dc36abf5c 100644
--- a/modules/cudaoptflow/test/test_precomp.hpp
+++ b/modules/cudaoptflow/test/test_precomp.hpp
@@ -57,6 +57,7 @@
 #include "opencv2/ts/cuda_test.hpp"
 
 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"
 #include "opencv2/video.hpp"
 
 #include "cvconfig.h"

From 03ae1e5aae21bc12bfbefaf78b353a3b83c96ff4 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 31 Dec 2014 15:36:44 +0300
Subject: [PATCH 06/10] fix superres module compilation

---
 modules/superres/src/optical_flow.cpp | 161 ++++++++++++++++----------
 1 file changed, 98 insertions(+), 63 deletions(-)

diff --git a/modules/superres/src/optical_flow.cpp b/modules/superres/src/optical_flow.cpp
index fcc9bef347..52fc2648e2 100644
--- a/modules/superres/src/optical_flow.cpp
+++ b/modules/superres/src/optical_flow.cpp
@@ -341,7 +341,7 @@ namespace
         int iterations_;
         bool useInitialFlow_;
 
-        Ptr<DenseOpticalFlow> alg_;
+        Ptr<cv::DenseOpticalFlow> alg_;
     };
 
     CV_INIT_ALGORITHM(DualTVL1, "DenseOpticalFlowExt.DualTVL1",
@@ -514,7 +514,7 @@ namespace
         int outerIterations_;
         int solverIterations_;
 
-        BroxOpticalFlow alg_;
+        Ptr<cuda::BroxOpticalFlow> alg_;
     };
 
     CV_INIT_ALGORITHM(Brox_CUDA, "DenseOpticalFlowExt.Brox_CUDA",
@@ -525,31 +525,40 @@ namespace
                       obj.info()->addParam(obj, "outerIterations", obj.outerIterations_, false, 0, 0, "Number of warping iterations (number of pyramid levels)");
                       obj.info()->addParam(obj, "solverIterations", obj.solverIterations_, false, 0, 0, "Number of linear system solver iterations"))
 
-    Brox_CUDA::Brox_CUDA() : GpuOpticalFlow(CV_32FC1), alg_(0.197f, 50.0f, 0.8f, 10, 77, 10)
+    Brox_CUDA::Brox_CUDA() : GpuOpticalFlow(CV_32FC1)
     {
-        alpha_ = alg_.alpha;
-        gamma_ = alg_.gamma;
-        scaleFactor_ = alg_.scale_factor;
-        innerIterations_ = alg_.inner_iterations;
-        outerIterations_ = alg_.outer_iterations;
-        solverIterations_ = alg_.solver_iterations;
+        alg_ = cuda::BroxOpticalFlow::create(0.197f, 50.0f, 0.8f, 10, 77, 10);
+
+        alpha_ = alg_->getFlowSmoothness();
+        gamma_ = alg_->getGradientConstancyImportance();
+        scaleFactor_ = alg_->getPyramidScaleFactor();
+        innerIterations_ = alg_->getInnerIterations();
+        outerIterations_ = alg_->getOuterIterations();
+        solverIterations_ = alg_->getSolverIterations();
     }
 
     void Brox_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
     {
-        alg_.alpha = static_cast<float>(alpha_);
-        alg_.gamma = static_cast<float>(gamma_);
-        alg_.scale_factor = static_cast<float>(scaleFactor_);
-        alg_.inner_iterations = innerIterations_;
-        alg_.outer_iterations = outerIterations_;
-        alg_.solver_iterations = solverIterations_;
+        alg_->setFlowSmoothness(alpha_);
+        alg_->setGradientConstancyImportance(gamma_);
+        alg_->setPyramidScaleFactor(scaleFactor_);
+        alg_->setInnerIterations(innerIterations_);
+        alg_->setOuterIterations(outerIterations_);
+        alg_->setSolverIterations(solverIterations_);
+
+        GpuMat flow;
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);
 
-        alg_(input0, input1, dst1, dst2);
+        dst1 = flows[0];
+        dst2 = flows[1];
     }
 
     void Brox_CUDA::collectGarbage()
     {
-        alg_.buf.release();
+        alg_ = cuda::BroxOpticalFlow::create(alpha_, gamma_, scaleFactor_, innerIterations_, outerIterations_, solverIterations_);
         GpuOpticalFlow::collectGarbage();
     }
 }
@@ -581,7 +590,7 @@ namespace
         int maxLevel_;
         int iterations_;
 
-        PyrLKOpticalFlow alg_;
+        Ptr<cuda::DensePyrLKOpticalFlow> alg_;
     };
 
     CV_INIT_ALGORITHM(PyrLK_CUDA, "DenseOpticalFlowExt.PyrLK_CUDA",
@@ -591,24 +600,32 @@ namespace
 
     PyrLK_CUDA::PyrLK_CUDA() : GpuOpticalFlow(CV_8UC1)
     {
-        winSize_ = alg_.winSize.width;
-        maxLevel_ = alg_.maxLevel;
-        iterations_ = alg_.iters;
+        alg_ = cuda::DensePyrLKOpticalFlow::create();
+
+        winSize_ = alg_->getWinSize().width;
+        maxLevel_ = alg_->getMaxLevel();
+        iterations_ = alg_->getNumIters();
     }
 
     void PyrLK_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
     {
-        alg_.winSize.width = winSize_;
-        alg_.winSize.height = winSize_;
-        alg_.maxLevel = maxLevel_;
-        alg_.iters = iterations_;
+        alg_->setWinSize(Size(winSize_, winSize_));
+        alg_->setMaxLevel(maxLevel_);
+        alg_->setNumIters(iterations_);
+
+        GpuMat flow;
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);
 
-        alg_.dense(input0, input1, dst1, dst2);
+        dst1 = flows[0];
+        dst2 = flows[1];
     }
 
     void PyrLK_CUDA::collectGarbage()
     {
-        alg_.releaseMemory();
+        alg_ = cuda::DensePyrLKOpticalFlow::create();
         GpuOpticalFlow::collectGarbage();
     }
 }
@@ -644,7 +661,7 @@ namespace
         double polySigma_;
         int flags_;
 
-        FarnebackOpticalFlow alg_;
+        Ptr<cuda::FarnebackOpticalFlow> alg_;
     };
 
     CV_INIT_ALGORITHM(Farneback_CUDA, "DenseOpticalFlowExt.Farneback_CUDA",
@@ -658,31 +675,40 @@ namespace
 
     Farneback_CUDA::Farneback_CUDA() : GpuOpticalFlow(CV_8UC1)
     {
-        pyrScale_ = alg_.pyrScale;
-        numLevels_ = alg_.numLevels;
-        winSize_ = alg_.winSize;
-        numIters_ = alg_.numIters;
-        polyN_ = alg_.polyN;
-        polySigma_ = alg_.polySigma;
-        flags_ = alg_.flags;
+        alg_ = cuda::FarnebackOpticalFlow::create();
+
+        pyrScale_ = alg_->getPyrScale();
+        numLevels_ = alg_->getNumLevels();
+        winSize_ = alg_->getWinSize();
+        numIters_ = alg_->getNumIters();
+        polyN_ = alg_->getPolyN();
+        polySigma_ = alg_->getPolySigma();
+        flags_ = alg_->getFlags();
     }
 
     void Farneback_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
     {
-        alg_.pyrScale = pyrScale_;
-        alg_.numLevels = numLevels_;
-        alg_.winSize = winSize_;
-        alg_.numIters = numIters_;
-        alg_.polyN = polyN_;
-        alg_.polySigma = polySigma_;
-        alg_.flags = flags_;
+        alg_->setPyrScale(pyrScale_);
+        alg_->setNumLevels(numLevels_);
+        alg_->setWinSize(winSize_);
+        alg_->setNumIters(numIters_);
+        alg_->setPolyN(polyN_);
+        alg_->setPolySigma(polySigma_);
+        alg_->setFlags(flags_);
+
+        GpuMat flow;
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);
 
-        alg_(input0, input1, dst1, dst2);
+        dst1 = flows[0];
+        dst2 = flows[1];
     }
 
     void Farneback_CUDA::collectGarbage()
     {
-        alg_.releaseMemory();
+        alg_ = cuda::FarnebackOpticalFlow::create();
         GpuOpticalFlow::collectGarbage();
     }
 }
@@ -719,7 +745,7 @@ namespace
         int iterations_;
         bool useInitialFlow_;
 
-        OpticalFlowDual_TVL1_CUDA alg_;
+        Ptr<cuda::OpticalFlowDual_TVL1> alg_;
     };
 
     CV_INIT_ALGORITHM(DualTVL1_CUDA, "DenseOpticalFlowExt.DualTVL1_CUDA",
@@ -734,33 +760,42 @@ namespace
 
     DualTVL1_CUDA::DualTVL1_CUDA() : GpuOpticalFlow(CV_8UC1)
     {
-        tau_ = alg_.tau;
-        lambda_ = alg_.lambda;
-        theta_ = alg_.theta;
-        nscales_ = alg_.nscales;
-        warps_ = alg_.warps;
-        epsilon_ = alg_.epsilon;
-        iterations_ = alg_.iterations;
-        useInitialFlow_ = alg_.useInitialFlow;
+        alg_ = cuda::OpticalFlowDual_TVL1::create();
+
+        tau_ = alg_->getTau();
+        lambda_ = alg_->getLambda();
+        theta_ = alg_->getTheta();
+        nscales_ = alg_->getNumScales();
+        warps_ = alg_->getNumWarps();
+        epsilon_ = alg_->getEpsilon();
+        iterations_ = alg_->getNumIterations();
+        useInitialFlow_ = alg_->getUseInitialFlow();
     }
 
     void DualTVL1_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
     {
-        alg_.tau = tau_;
-        alg_.lambda = lambda_;
-        alg_.theta = theta_;
-        alg_.nscales = nscales_;
-        alg_.warps = warps_;
-        alg_.epsilon = epsilon_;
-        alg_.iterations = iterations_;
-        alg_.useInitialFlow = useInitialFlow_;
+        alg_->setTau(tau_);
+        alg_->setLambda(lambda_);
+        alg_->setTheta(theta_);
+        alg_->setNumScales(nscales_);
+        alg_->setNumWarps(warps_);
+        alg_->setEpsilon(epsilon_);
+        alg_->setNumIterations(iterations_);
+        alg_->setUseInitialFlow(useInitialFlow_);
+
+        GpuMat flow;
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);
 
-        alg_(input0, input1, dst1, dst2);
+        dst1 = flows[0];
+        dst2 = flows[1];
     }
 
     void DualTVL1_CUDA::collectGarbage()
     {
-        alg_.collectGarbage();
+        alg_ = cuda::OpticalFlowDual_TVL1::create();
         GpuOpticalFlow::collectGarbage();
     }
 }

From 62f8240b004fe2d1386be8003082b42673fa7c55 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 31 Dec 2014 15:36:51 +0300
Subject: [PATCH 07/10] fix videostab module compilation

---
 .../opencv2/videostab/optical_flow.hpp        |  4 +--
 modules/videostab/src/optical_flow.cpp        | 36 +++++++++++++------
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/modules/videostab/include/opencv2/videostab/optical_flow.hpp b/modules/videostab/include/opencv2/videostab/optical_flow.hpp
index a34a82e3f8..41d1953549 100644
--- a/modules/videostab/include/opencv2/videostab/optical_flow.hpp
+++ b/modules/videostab/include/opencv2/videostab/optical_flow.hpp
@@ -121,7 +121,7 @@ public:
              cuda::GpuMat &status);
 
 private:
-    cuda::PyrLKOpticalFlow optFlowEstimator_;
+    Ptr<cuda::SparsePyrLKOpticalFlow> optFlowEstimator_;
     cuda::GpuMat frame0_, frame1_, points0_, points1_, status_, errors_;
 };
 
@@ -136,7 +136,7 @@ public:
             OutputArray errors);
 
 private:
-    cuda::PyrLKOpticalFlow optFlowEstimator_;
+    Ptr<cuda::DensePyrLKOpticalFlow> optFlowEstimator_;
     cuda::GpuMat frame0_, frame1_, flowX_, flowY_, errors_;
 };
 
diff --git a/modules/videostab/src/optical_flow.cpp b/modules/videostab/src/optical_flow.cpp
index d8a059c1fd..32c8133a7d 100644
--- a/modules/videostab/src/optical_flow.cpp
+++ b/modules/videostab/src/optical_flow.cpp
@@ -45,6 +45,10 @@
 #include "opencv2/videostab/optical_flow.hpp"
 #include "opencv2/videostab/ring_buffer.hpp"
 
+#ifdef HAVE_OPENCV_CUDAARITHM
+  #include "opencv2/cudaarithm.hpp"
+#endif
+
 namespace cv
 {
 namespace videostab
@@ -63,6 +67,7 @@ void SparsePyrLkOptFlowEstimator::run(
 SparsePyrLkOptFlowEstimatorGpu::SparsePyrLkOptFlowEstimatorGpu()
 {
     CV_Assert(cuda::getCudaEnabledDeviceCount() > 0);
+    optFlowEstimator_ = cuda::SparsePyrLKOpticalFlow::create();
 }
 
 
@@ -91,9 +96,9 @@ void SparsePyrLkOptFlowEstimatorGpu::run(
         const cuda::GpuMat &frame0, const cuda::GpuMat &frame1, const cuda::GpuMat &points0,
         cuda::GpuMat &points1, cuda::GpuMat &status, cuda::GpuMat &errors)
 {
-    optFlowEstimator_.winSize = winSize_;
-    optFlowEstimator_.maxLevel = maxLevel_;
-    optFlowEstimator_.sparse(frame0, frame1, points0, points1, status, &errors);
+    optFlowEstimator_->setWinSize(winSize_);
+    optFlowEstimator_->setMaxLevel(maxLevel_);
+    optFlowEstimator_->calc(frame0, frame1, points0, points1, status, errors);
 }
 
 
@@ -101,15 +106,16 @@ void SparsePyrLkOptFlowEstimatorGpu::run(
         const cuda::GpuMat &frame0, const cuda::GpuMat &frame1, const cuda::GpuMat &points0,
         cuda::GpuMat &points1, cuda::GpuMat &status)
 {
-    optFlowEstimator_.winSize = winSize_;
-    optFlowEstimator_.maxLevel = maxLevel_;
-    optFlowEstimator_.sparse(frame0, frame1, points0, points1, status);
+    optFlowEstimator_->setWinSize(winSize_);
+    optFlowEstimator_->setMaxLevel(maxLevel_);
+    optFlowEstimator_->calc(frame0, frame1, points0, points1, status);
 }
 
 
 DensePyrLkOptFlowEstimatorGpu::DensePyrLkOptFlowEstimatorGpu()
 {
     CV_Assert(cuda::getCudaEnabledDeviceCount() > 0);
+    optFlowEstimator_ = cuda::DensePyrLKOpticalFlow::create();
 }
 
 
@@ -120,16 +126,24 @@ void DensePyrLkOptFlowEstimatorGpu::run(
     frame0_.upload(frame0.getMat());
     frame1_.upload(frame1.getMat());
 
-    optFlowEstimator_.winSize = winSize_;
-    optFlowEstimator_.maxLevel = maxLevel_;
+    optFlowEstimator_->setWinSize(winSize_);
+    optFlowEstimator_->setMaxLevel(maxLevel_);
 
     if (errors.needed())
     {
-        optFlowEstimator_.dense(frame0_, frame1_, flowX_, flowY_, &errors_);
-        errors_.download(errors.getMatRef());
+        CV_Error(Error::StsNotImplemented, "DensePyrLkOptFlowEstimatorGpu doesn't support errors calculation");
     }
     else
-        optFlowEstimator_.dense(frame0_, frame1_, flowX_, flowY_);
+    {
+        cuda::GpuMat flow;
+        optFlowEstimator_->calc(frame0_, frame1_, flow);
+
+        cuda::GpuMat flows[2];
+        cuda::split(flow, flows);
+
+        flowX_ = flows[0];
+        flowY_ = flows[1];
+    }
 
     flowX_.download(flowX.getMatRef());
     flowY_.download(flowY.getMatRef());

From b3a743f09a94bddcde0344ae37f1754c887a1685 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 31 Dec 2014 15:37:09 +0300
Subject: [PATCH 08/10] fix gpu samples compilation

---
 samples/gpu/farneback_optical_flow.cpp | 19 +++++---
 samples/gpu/optical_flow.cpp           | 61 +++++++++-----------------
 samples/gpu/pyrlk_optical_flow.cpp     | 24 ++--------
 3 files changed, 35 insertions(+), 69 deletions(-)

diff --git a/samples/gpu/farneback_optical_flow.cpp b/samples/gpu/farneback_optical_flow.cpp
index b8ed55ea6c..798b108a62 100644
--- a/samples/gpu/farneback_optical_flow.cpp
+++ b/samples/gpu/farneback_optical_flow.cpp
@@ -7,6 +7,7 @@
 #include "opencv2/highgui.hpp"
 #include "opencv2/video.hpp"
 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"
 
 using namespace std;
 using namespace cv;
@@ -70,8 +71,8 @@ int main(int argc, char **argv)
     if (frameL.empty() || frameR.empty()) return -1;
 
     GpuMat d_frameL(frameL), d_frameR(frameR);
-    GpuMat d_flowx, d_flowy;
-    FarnebackOpticalFlow d_calc;
+    GpuMat d_flow;
+    Ptr<cuda::FarnebackOpticalFlow> d_calc = cuda::FarnebackOpticalFlow::create();
     Mat flowxy, flowx, flowy, image;
 
     bool running = true, gpuMode = true;
@@ -86,17 +87,21 @@ int main(int argc, char **argv)
         if (gpuMode)
         {
             tc0 = getTickCount();
-            d_calc(d_frameL, d_frameR, d_flowx, d_flowy);
+            d_calc->calc(d_frameL, d_frameR, d_flow);
             tc1 = getTickCount();
-            d_flowx.download(flowx);
-            d_flowy.download(flowy);
+
+            GpuMat planes[2];
+            cuda::split(d_flow, planes);
+
+            planes[0].download(flowx);
+            planes[1].download(flowy);
         }
         else
         {
             tc0 = getTickCount();
             calcOpticalFlowFarneback(
-                        frameL, frameR, flowxy, d_calc.pyrScale, d_calc.numLevels, d_calc.winSize,
-                        d_calc.numIters, d_calc.polyN, d_calc.polySigma, d_calc.flags);
+                        frameL, frameR, flowxy, d_calc->getPyrScale(), d_calc->getNumLevels(), d_calc->getWinSize(),
+                        d_calc->getNumIters(), d_calc->getPolyN(), d_calc->getPolySigma(), d_calc->getFlags());
             tc1 = getTickCount();
 
             Mat planes[] = {flowx, flowy};
diff --git a/samples/gpu/optical_flow.cpp b/samples/gpu/optical_flow.cpp
index 7d625de85b..b1b3c8de1e 100644
--- a/samples/gpu/optical_flow.cpp
+++ b/samples/gpu/optical_flow.cpp
@@ -5,6 +5,7 @@
 #include <opencv2/core/utility.hpp>
 #include "opencv2/highgui.hpp"
 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"
 
 using namespace std;
 using namespace cv;
@@ -122,10 +123,13 @@ static void drawOpticalFlow(const Mat_<float>& flowx, const Mat_<float>& flowy,
     }
 }
 
-static void showFlow(const char* name, const GpuMat& d_flowx, const GpuMat& d_flowy)
+static void showFlow(const char* name, const GpuMat& d_flow)
 {
-    Mat flowx(d_flowx);
-    Mat flowy(d_flowy);
+    GpuMat planes[2];
+    cuda::split(d_flow, planes);
+
+    Mat flowx(planes[0]);
+    Mat flowy(planes[1]);
 
     Mat out;
     drawOpticalFlow(flowx, flowy, out, 10);
@@ -171,14 +175,12 @@ int main(int argc, const char* argv[])
     GpuMat d_frame0(frame0);
     GpuMat d_frame1(frame1);
 
-    GpuMat d_flowx(frame0.size(), CV_32FC1);
-    GpuMat d_flowy(frame0.size(), CV_32FC1);
+    GpuMat d_flow(frame0.size(), CV_32FC2);
 
-    BroxOpticalFlow brox(0.197f, 50.0f, 0.8f, 10, 77, 10);
-    PyrLKOpticalFlow lk; lk.winSize = Size(7, 7);
-    FarnebackOpticalFlow farn;
-    OpticalFlowDual_TVL1_CUDA tvl1;
-    FastOpticalFlowBM fastBM;
+    Ptr<cuda::BroxOpticalFlow> brox = cuda::BroxOpticalFlow::create(0.197f, 50.0f, 0.8f, 10, 77, 10);
+    Ptr<cuda::DensePyrLKOpticalFlow> lk = cuda::DensePyrLKOpticalFlow::create(Size(7, 7));
+    Ptr<cuda::FarnebackOpticalFlow> farn = cuda::FarnebackOpticalFlow::create();
+    Ptr<cuda::OpticalFlowDual_TVL1> tvl1 = cuda::OpticalFlowDual_TVL1::create();
 
     {
         GpuMat d_frame0f;
@@ -189,68 +191,45 @@ int main(int argc, const char* argv[])
 
         const int64 start = getTickCount();
 
-        brox(d_frame0f, d_frame1f, d_flowx, d_flowy);
+        brox->calc(d_frame0f, d_frame1f, d_flow);
 
         const double timeSec = (getTickCount() - start) / getTickFrequency();
         cout << "Brox : " << timeSec << " sec" << endl;
 
-        showFlow("Brox", d_flowx, d_flowy);
+        showFlow("Brox", d_flow);
     }
 
     {
         const int64 start = getTickCount();
 
-        lk.dense(d_frame0, d_frame1, d_flowx, d_flowy);
+        lk->calc(d_frame0, d_frame1, d_flow);
 
         const double timeSec = (getTickCount() - start) / getTickFrequency();
         cout << "LK : " << timeSec << " sec" << endl;
 
-        showFlow("LK", d_flowx, d_flowy);
+        showFlow("LK", d_flow);
     }
 
     {
         const int64 start = getTickCount();
 
-        farn(d_frame0, d_frame1, d_flowx, d_flowy);
+        farn->calc(d_frame0, d_frame1, d_flow);
 
         const double timeSec = (getTickCount() - start) / getTickFrequency();
         cout << "Farn : " << timeSec << " sec" << endl;
 
-        showFlow("Farn", d_flowx, d_flowy);
+        showFlow("Farn", d_flow);
     }
 
     {
         const int64 start = getTickCount();
 
-        tvl1(d_frame0, d_frame1, d_flowx, d_flowy);
+        tvl1->calc(d_frame0, d_frame1, d_flow);
 
         const double timeSec = (getTickCount() - start) / getTickFrequency();
         cout << "TVL1 : " << timeSec << " sec" << endl;
 
-        showFlow("TVL1", d_flowx, d_flowy);
-    }
-
-    {
-        const int64 start = getTickCount();
-
-        GpuMat buf;
-        calcOpticalFlowBM(d_frame0, d_frame1, Size(7, 7), Size(1, 1), Size(21, 21), false, d_flowx, d_flowy, buf);
-
-        const double timeSec = (getTickCount() - start) / getTickFrequency();
-        cout << "BM : " << timeSec << " sec" << endl;
-
-        showFlow("BM", d_flowx, d_flowy);
-    }
-
-    {
-        const int64 start = getTickCount();
-
-        fastBM(d_frame0, d_frame1, d_flowx, d_flowy);
-
-        const double timeSec = (getTickCount() - start) / getTickFrequency();
-        cout << "Fast BM : " << timeSec << " sec" << endl;
-
-        showFlow("Fast BM", d_flowx, d_flowy);
+        showFlow("TVL1", d_flow);
     }
 
     imshow("Frame 0", frame0);
diff --git a/samples/gpu/pyrlk_optical_flow.cpp b/samples/gpu/pyrlk_optical_flow.cpp
index febc28f28d..9074c47b68 100644
--- a/samples/gpu/pyrlk_optical_flow.cpp
+++ b/samples/gpu/pyrlk_optical_flow.cpp
@@ -186,12 +186,8 @@ int main(int argc, const char* argv[])
 
     // Sparse
 
-    PyrLKOpticalFlow d_pyrLK;
-
-    d_pyrLK.winSize.width = winSize;
-    d_pyrLK.winSize.height = winSize;
-    d_pyrLK.maxLevel = maxLevel;
-    d_pyrLK.iters = iters;
+    Ptr<cuda::SparsePyrLKOpticalFlow> d_pyrLK = cuda::SparsePyrLKOpticalFlow::create(
+                Size(winSize, winSize), maxLevel, iters);
 
     GpuMat d_frame0(frame0);
     GpuMat d_frame1(frame1);
@@ -199,7 +195,7 @@ int main(int argc, const char* argv[])
     GpuMat d_nextPts;
     GpuMat d_status;
 
-    d_pyrLK.sparse(useGray ? d_frame0Gray : d_frame0, useGray ? d_frame1Gray : d_frame1, d_prevPts, d_nextPts, d_status);
+    d_pyrLK->calc(useGray ? d_frame0Gray : d_frame0, useGray ? d_frame1Gray : d_frame1, d_prevPts, d_nextPts, d_status);
 
     // Draw arrows
 
@@ -216,20 +212,6 @@ int main(int argc, const char* argv[])
 
     imshow("PyrLK [Sparse]", frame0);
 
-    // Dense
-
-    GpuMat d_u;
-    GpuMat d_v;
-
-    d_pyrLK.dense(d_frame0Gray, d_frame1Gray, d_u, d_v);
-
-    // Draw flow field
-
-    Mat flowField;
-    getFlowField(Mat(d_u), Mat(d_v), flowField);
-
-    imshow("PyrLK [Dense] Flow Field", flowField);
-
     waitKey();
 
     return 0;

From 63ff39f9f3e1e486002266b75410ad988be9e9ec Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 31 Dec 2014 15:37:25 +0300
Subject: [PATCH 09/10] remove obsolete gpu optical flow samples

---
 samples/gpu/brox_optical_flow.cpp | 270 ------------------------------
 samples/gpu/performance/tests.cpp |  81 ---------
 2 files changed, 351 deletions(-)
 delete mode 100644 samples/gpu/brox_optical_flow.cpp

diff --git a/samples/gpu/brox_optical_flow.cpp b/samples/gpu/brox_optical_flow.cpp
deleted file mode 100644
index 638aade45a..0000000000
--- a/samples/gpu/brox_optical_flow.cpp
+++ /dev/null
@@ -1,270 +0,0 @@
-#include <iostream>
-#include <iomanip>
-#include <string>
-#include <ctype.h>
-
-#include "opencv2/core.hpp"
-#include "opencv2/core/utility.hpp"
-#include "opencv2/highgui.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/cudaoptflow.hpp"
-#include "opencv2/cudaarithm.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::cuda;
-
-void getFlowField(const Mat& u, const Mat& v, Mat& flowField);
-
-int main(int argc, const char* argv[])
-{
-    try
-    {
-        const char* keys =
-           "{ h   help      |       | print help message }"
-           "{ l   left      |       | specify left image }"
-           "{ r   right     |       | specify right image }"
-           "{ s   scale     | 0.8   | set pyramid scale factor }"
-           "{ a   alpha     | 0.197 | set alpha }"
-           "{ g   gamma     | 50.0  | set gamma }"
-           "{ i   inner     | 10    | set number of inner iterations }"
-           "{ o   outer     | 77    | set number of outer iterations }"
-           "{ si  solver    | 10    | set number of basic solver iterations }"
-           "{ t   time_step | 0.1   | set frame interpolation time step }";
-
-        CommandLineParser cmd(argc, argv, keys);
-
-        if (cmd.has("help") || !cmd.check())
-        {
-            cmd.printMessage();
-            cmd.printErrors();
-            return 0;
-        }
-
-        string frame0Name = cmd.get<string>("left");
-        string frame1Name = cmd.get<string>("right");
-        float scale = cmd.get<float>("scale");
-        float alpha = cmd.get<float>("alpha");
-        float gamma = cmd.get<float>("gamma");
-        int inner_iterations = cmd.get<int>("inner");
-        int outer_iterations = cmd.get<int>("outer");
-        int solver_iterations = cmd.get<int>("solver");
-        float timeStep = cmd.get<float>("time_step");
-
-        if (frame0Name.empty() || frame1Name.empty())
-        {
-            cerr << "Missing input file names" << endl;
-            return -1;
-        }
-
-        Mat frame0Color = imread(frame0Name);
-        Mat frame1Color = imread(frame1Name);
-
-        if (frame0Color.empty() || frame1Color.empty())
-        {
-            cout << "Can't load input images" << endl;
-            return -1;
-        }
-
-        cv::cuda::printShortCudaDeviceInfo(cv::cuda::getDevice());
-
-        cout << "OpenCV / NVIDIA Computer Vision" << endl;
-        cout << "Optical Flow Demo: Frame Interpolation" << endl;
-        cout << "=========================================" << endl;
-
-        namedWindow("Forward flow");
-        namedWindow("Backward flow");
-
-        namedWindow("Interpolated frame");
-
-        cout << "Press:" << endl;
-        cout << "\tESC to quit" << endl;
-        cout << "\t'a' to move to the previous frame" << endl;
-        cout << "\t's' to move to the next frame\n" << endl;
-
-        frame0Color.convertTo(frame0Color, CV_32F, 1.0 / 255.0);
-        frame1Color.convertTo(frame1Color, CV_32F, 1.0 / 255.0);
-
-        Mat frame0Gray, frame1Gray;
-
-        cv::cvtColor(frame0Color, frame0Gray, COLOR_BGR2GRAY);
-        cv::cvtColor(frame1Color, frame1Gray, COLOR_BGR2GRAY);
-
-        GpuMat d_frame0(frame0Gray);
-        GpuMat d_frame1(frame1Gray);
-
-        cout << "Estimating optical flow" << endl;
-
-        BroxOpticalFlow d_flow(alpha, gamma, scale, inner_iterations, outer_iterations, solver_iterations);
-
-        cout << "\tForward..." << endl;
-
-        GpuMat d_fu, d_fv;
-
-        d_flow(d_frame0, d_frame1, d_fu, d_fv);
-
-        Mat flowFieldForward;
-        getFlowField(Mat(d_fu), Mat(d_fv), flowFieldForward);
-
-        cout << "\tBackward..." << endl;
-
-        GpuMat d_bu, d_bv;
-
-        d_flow(d_frame1, d_frame0, d_bu, d_bv);
-
-        Mat flowFieldBackward;
-        getFlowField(Mat(d_bu), Mat(d_bv), flowFieldBackward);
-
-        cout << "Interpolating..." << endl;
-
-        // first frame color components
-        GpuMat d_b, d_g, d_r;
-
-        // second frame color components
-        GpuMat d_bt, d_gt, d_rt;
-
-        // prepare color components on host and copy them to device memory
-        Mat channels[3];
-        cv::split(frame0Color, channels);
-
-        d_b.upload(channels[0]);
-        d_g.upload(channels[1]);
-        d_r.upload(channels[2]);
-
-        cv::split(frame1Color, channels);
-
-        d_bt.upload(channels[0]);
-        d_gt.upload(channels[1]);
-        d_rt.upload(channels[2]);
-
-        // temporary buffer
-        GpuMat d_buf;
-
-        // intermediate frame color components (GPU memory)
-        GpuMat d_rNew, d_gNew, d_bNew;
-
-        GpuMat d_newFrame;
-
-        vector<Mat> frames;
-        frames.reserve(static_cast<int>(1.0f / timeStep) + 2);
-
-        frames.push_back(frame0Color);
-
-        // compute interpolated frames
-        for (float timePos = timeStep; timePos < 1.0f; timePos += timeStep)
-        {
-            // interpolate blue channel
-            interpolateFrames(d_b, d_bt, d_fu, d_fv, d_bu, d_bv, timePos, d_bNew, d_buf);
-
-            // interpolate green channel
-            interpolateFrames(d_g, d_gt, d_fu, d_fv, d_bu, d_bv, timePos, d_gNew, d_buf);
-
-            // interpolate red channel
-            interpolateFrames(d_r, d_rt, d_fu, d_fv, d_bu, d_bv, timePos, d_rNew, d_buf);
-
-            GpuMat channels3[] = {d_bNew, d_gNew, d_rNew};
-            cuda::merge(channels3, 3, d_newFrame);
-
-            frames.push_back(Mat(d_newFrame));
-
-            cout << setprecision(4) << timePos * 100.0f << "%\r";
-        }
-
-        frames.push_back(frame1Color);
-
-        cout << setw(5) << "100%" << endl;
-
-        cout << "Done" << endl;
-
-        imshow("Forward flow", flowFieldForward);
-        imshow("Backward flow", flowFieldBackward);
-
-        int currentFrame = 0;
-
-        imshow("Interpolated frame", frames[currentFrame]);
-
-        for(;;)
-        {
-            int key = toupper(waitKey(10) & 0xff);
-
-            switch (key)
-            {
-            case 27:
-                return 0;
-
-            case 'A':
-                if (currentFrame > 0)
-                    --currentFrame;
-
-                imshow("Interpolated frame", frames[currentFrame]);
-                break;
-
-            case 'S':
-                if (currentFrame < static_cast<int>(frames.size()) - 1)
-                    ++currentFrame;
-
-                imshow("Interpolated frame", frames[currentFrame]);
-                break;
-            }
-        }
-    }
-    catch (const exception& ex)
-    {
-        cerr << ex.what() << endl;
-        return -1;
-    }
-    catch (...)
-    {
-        cerr << "Unknow error" << endl;
-        return -1;
-    }
-}
-
-template <typename T> inline T clamp (T x, T a, T b)
-{
-    return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
-}
-
-template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
-{
-    x = clamp(x, a, b);
-    return c + (d - c) * (x - a) / (b - a);
-}
-
-void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
-{
-    float maxDisplacement = 1.0f;
-
-    for (int i = 0; i < u.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-        for (int j = 0; j < u.cols; ++j)
-        {
-            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
-
-            if (d > maxDisplacement)
-                maxDisplacement = d;
-        }
-    }
-
-    flowField.create(u.size(), CV_8UC4);
-
-    for (int i = 0; i < flowField.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-
-        Vec4b* row = flowField.ptr<Vec4b>(i);
-
-        for (int j = 0; j < flowField.cols; ++j)
-        {
-            row[j][0] = 0;
-            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][3] = 255;
-        }
-    }
-}
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index 14910f9a38..9e8bfd28cd 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -1187,87 +1187,6 @@ TEST(GoodFeaturesToTrack)
     CUDA_OFF;
 }
 
-TEST(PyrLKOpticalFlow)
-{
-    Mat frame0 = imread(abspath("../data/rubberwhale1.png"));
-    if (frame0.empty()) throw runtime_error("can't open ../data/rubberwhale1.png");
-
-    Mat frame1 = imread(abspath("../data/rubberwhale2.png"));
-    if (frame1.empty()) throw runtime_error("can't open ../data/rubberwhale2.png");
-
-    Mat gray_frame;
-    cvtColor(frame0, gray_frame, COLOR_BGR2GRAY);
-
-    for (int points = 1000; points <= 8000; points *= 2)
-    {
-        SUBTEST << points;
-
-        vector<Point2f> pts;
-        goodFeaturesToTrack(gray_frame, pts, points, 0.01, 0.0);
-
-        vector<Point2f> nextPts;
-        vector<unsigned char> status;
-
-        vector<float> err;
-
-        calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-
-        CPU_ON;
-        calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-        CPU_OFF;
-
-        cuda::PyrLKOpticalFlow d_pyrLK;
-
-        cuda::GpuMat d_frame0(frame0);
-        cuda::GpuMat d_frame1(frame1);
-
-        cuda::GpuMat d_pts;
-        Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void*)&pts[0]);
-        d_pts.upload(pts_mat);
-
-        cuda::GpuMat d_nextPts;
-        cuda::GpuMat d_status;
-        cuda::GpuMat d_err;
-
-        d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-
-        CUDA_ON;
-        d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-        CUDA_OFF;
-    }
-}
-
-
-TEST(FarnebackOpticalFlow)
-{
-    const string datasets[] = {"../data/rubberwhale", "../data/basketball"};
-    for (size_t i = 0; i < sizeof(datasets)/sizeof(*datasets); ++i) {
-    for (int fastPyramids = 0; fastPyramids < 2; ++fastPyramids) {
-    for (int useGaussianBlur = 0; useGaussianBlur < 2; ++useGaussianBlur) {
-
-    SUBTEST << "dataset=" << datasets[i] << ", fastPyramids=" << fastPyramids << ", useGaussianBlur=" << useGaussianBlur;
-    Mat frame0 = imread(abspath(datasets[i] + "1.png"), IMREAD_GRAYSCALE);
-    Mat frame1 = imread(abspath(datasets[i] + "2.png"), IMREAD_GRAYSCALE);
-    if (frame0.empty()) throw runtime_error("can't open " + datasets[i] + "1.png");
-    if (frame1.empty()) throw runtime_error("can't open " + datasets[i] + "2.png");
-
-    cuda::FarnebackOpticalFlow calc;
-    calc.fastPyramids = fastPyramids != 0;
-    calc.flags |= useGaussianBlur ? OPTFLOW_FARNEBACK_GAUSSIAN : 0;
-
-    cuda::GpuMat d_frame0(frame0), d_frame1(frame1), d_flowx, d_flowy;
-    CUDA_ON;
-    calc(d_frame0, d_frame1, d_flowx, d_flowy);
-    CUDA_OFF;
-
-    Mat flow;
-    CPU_ON;
-    calcOpticalFlowFarneback(frame0, frame1, flow, calc.pyrScale, calc.numLevels, calc.winSize, calc.numIters, calc.polyN, calc.polySigma, calc.flags);
-    CPU_OFF;
-
-    }}}
-}
-
 #ifdef HAVE_OPENCV_BGSEGM
 
 TEST(MOG)

From 710617034ba4840d3d007e3444a03d84b6f5e423 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 12 Jan 2015 11:26:07 +0300
Subject: [PATCH 10/10] remove unused function from pyrlk_optical_flow sample

---
 samples/gpu/pyrlk_optical_flow.cpp | 38 ------------------------------
 1 file changed, 38 deletions(-)

diff --git a/samples/gpu/pyrlk_optical_flow.cpp b/samples/gpu/pyrlk_optical_flow.cpp
index 9074c47b68..f13487b622 100644
--- a/samples/gpu/pyrlk_optical_flow.cpp
+++ b/samples/gpu/pyrlk_optical_flow.cpp
@@ -77,44 +77,6 @@ template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
     return c + (d - c) * (x - a) / (b - a);
 }
 
-static void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
-{
-    float maxDisplacement = 1.0f;
-
-    for (int i = 0; i < u.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-        for (int j = 0; j < u.cols; ++j)
-        {
-            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
-
-            if (d > maxDisplacement)
-                maxDisplacement = d;
-        }
-    }
-
-    flowField.create(u.size(), CV_8UC4);
-
-    for (int i = 0; i < flowField.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-
-        Vec4b* row = flowField.ptr<Vec4b>(i);
-
-        for (int j = 0; j < flowField.cols; ++j)
-        {
-            row[j][0] = 0;
-            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][3] = 255;
-        }
-    }
-}
-
 int main(int argc, const char* argv[])
 {
     const char* keys =