From 10f2a038c2f30e78e14e530a6c9f9c17d618a1c5 Mon Sep 17 00:00:00 2001
From: mlyashko <maria.lyashko@itseez.com>
Date: Tue, 18 Feb 2014 14:24:30 +0400
Subject: [PATCH 1/6] Optical flow dualt tvl1 ocl-based implementation and
 tests

---
 .../perf/opencl/perf_optflow_dualTVL1.cpp     | 113 +++
 modules/video/src/opencl/optical_flow_tvl1.cl | 378 ++++++++++
 modules/video/src/tvl1flow.cpp                | 672 +++++++++++++++---
 .../video/test/ocl/test_optflow_tvl1flow.cpp  | 117 +++
 4 files changed, 1162 insertions(+), 118 deletions(-)
 create mode 100644 modules/video/perf/opencl/perf_optflow_dualTVL1.cpp
 create mode 100644 modules/video/src/opencl/optical_flow_tvl1.cl
 create mode 100644 modules/video/test/ocl/test_optflow_tvl1flow.cpp

diff --git a/modules/video/perf/opencl/perf_optflow_dualTVL1.cpp b/modules/video/perf/opencl/perf_optflow_dualTVL1.cpp
new file mode 100644
index 0000000000..8b71b9ad8f
--- /dev/null
+++ b/modules/video/perf/opencl/perf_optflow_dualTVL1.cpp
@@ -0,0 +1,113 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+using std::tr1::make_tuple;
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// OpticalFlow Dual TVL1 ////////////////////////
+typedef tuple< tuple<int, double>, bool> OpticalFlowDualTVL1Params;
+typedef TestBaseWithParam<OpticalFlowDualTVL1Params> OpticalFlowDualTVL1Fixture;
+
+OCL_PERF_TEST_P(OpticalFlowDualTVL1Fixture, OpticalFlowDualTVL1,
+            ::testing::Combine(
+                        ::testing::Values(make_tuple<int, double>(-1, 0.3),
+                                          make_tuple<int, double>(3, 0.5)),
+                        ::testing::Bool()
+                                )
+            )
+    {
+        Mat frame0 = imread(getDataPath("gpu/opticalflow/rubberwhale1.png"), cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame0.empty()) << "can't load rubberwhale1.png";
+
+        Mat frame1 = imread(getDataPath("gpu/opticalflow/rubberwhale2.png"), cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame1.empty()) << "can't load rubberwhale2.png";
+
+        const Size srcSize = frame0.size();
+
+        const OpticalFlowDualTVL1Params params = GetParam();
+        const tuple<int, double> filteringScale = get<0>(params);
+            const int medianFiltering = get<0>(filteringScale);
+            const double scaleStep = get<1>(filteringScale);
+        const bool useInitFlow = get<1>(params);
+        const double eps = 0.001;
+
+        UMat uFrame0; frame0.copyTo(uFrame0);
+        UMat uFrame1; frame1.copyTo(uFrame1);
+        UMat uFlow(srcSize, CV_32FC2);
+        declare.in(uFrame0, uFrame1, WARMUP_READ).out(uFlow, WARMUP_READ);
+
+        //create algorithm
+        cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
+
+        //set parameters
+        alg->set("scaleStep", scaleStep);
+        alg->setInt("medianFiltering", medianFiltering);
+
+        if (useInitFlow)
+        {
+            //calculate initial flow as result of optical flow
+            OCL_ON(alg->calc(uFrame0, uFrame1, uFlow));
+        }
+
+        //set flag to use initial flow
+        alg->setBool("useInitialFlow", useInitFlow);
+        OCL_TEST_CYCLE()
+            alg->calc(uFrame0, uFrame1, uFlow);
+
+        SANITY_CHECK(uFlow, eps, ERROR_RELATIVE);
+    }
+
+    }
+} // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
\ No newline at end of file
diff --git a/modules/video/src/opencl/optical_flow_tvl1.cl b/modules/video/src/opencl/optical_flow_tvl1.cl
new file mode 100644
index 0000000000..472c4fa1a4
--- /dev/null
+++ b/modules/video/src/opencl/optical_flow_tvl1.cl
@@ -0,0 +1,378 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jin Ma jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void centeredGradientKernel(__global const float* src_ptr, int src_col, int src_row, int src_step,
+                                     __global float* dx, __global float* dy, int d_step)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if((x < src_col)&&(y < src_row))
+    {
+        int src_x1 = (x + 1) < (src_col -1)? (x + 1) : (src_col - 1);
+        int src_x2 = (x - 1) > 0 ? (x -1) : 0;
+        dx[y * d_step+ x] = 0.5f * (src_ptr[y * src_step + src_x1] - src_ptr[y * src_step+ src_x2]);
+
+        int src_y1 = (y+1) < (src_row - 1) ? (y + 1) : (src_row - 1);
+        int src_y2 = (y - 1) > 0 ? (y - 1) : 0;
+        dy[y * d_step+ x] = 0.5f * (src_ptr[src_y1 * src_step + x] - src_ptr[src_y2 * src_step+ x]);
+    }
+
+}
+
+inline float bicubicCoeff(float x_)
+{
+
+    float x = fabs(x_);
+    if (x <= 1.0f)
+        return x * x * (1.5f * x - 2.5f) + 1.0f;
+    else if (x < 2.0f)
+        return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
+    else
+        return 0.0f;
+}
+
+__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,
+    image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,
+    __global const float* u1, int u1_step,
+    __global const float* u2,
+    __global float* I1w,
+    __global float* I1wx, /*int I1wx_step,*/
+    __global float* I1wy, /*int I1wy_step,*/
+    __global float* grad, /*int grad_step,*/
+    __global float* rho,
+    int I1w_step,
+    int u2_step,
+    int u1_offset_x,
+    int u1_offset_y,
+    int u2_offset_x,
+    int u2_offset_y)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x < I0_col&&y < I0_row)
+    {
+        //float u1Val = u1(y, x);
+        float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+        //float u2Val = u2(y, x);
+        float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+
+        float wx = x + u1Val;
+        float wy = y + u2Val;
+
+        int xmin = ceil(wx - 2.0f);
+        int xmax = floor(wx + 2.0f);
+
+        int ymin = ceil(wy - 2.0f);
+        int ymax = floor(wy + 2.0f);
+
+        float sum  = 0.0f;
+        float sumx = 0.0f;
+        float sumy = 0.0f;
+        float wsum = 0.0f;
+        sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+        for (int cy = ymin; cy <= ymax; ++cy)
+        {
+            for (int cx = xmin; cx <= xmax; ++cx)
+            {
+                float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
+                //sum  += w * tex2D(tex_I1 , cx, cy);
+                int2 cood = (int2)(cx, cy);
+                sum += w * read_imagef(tex_I1, sampleri, cood).x;
+                //sumx += w * tex2D(tex_I1x, cx, cy);
+                sumx += w * read_imagef(tex_I1x, sampleri, cood).x;
+                //sumy += w * tex2D(tex_I1y, cx, cy);
+                sumy += w * read_imagef(tex_I1y, sampleri, cood).x;
+                wsum += w;
+            }
+        }
+        float coeff = 1.0f / wsum;
+        float I1wVal  = sum  * coeff;
+        float I1wxVal = sumx * coeff;
+        float I1wyVal = sumy * coeff;
+        I1w[y * I1w_step + x]  = I1wVal;
+        I1wx[y * I1w_step + x] = I1wxVal;
+        I1wy[y * I1w_step + x] = I1wyVal;
+        float Ix2 = I1wxVal * I1wxVal;
+        float Iy2 = I1wyVal * I1wyVal;
+
+        // store the |Grad(I1)|^2
+        grad[y * I1w_step + x] = Ix2 + Iy2;
+
+        // compute the constant part of the rho function
+        float I0Val = I0[y * I0_step + x];
+        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
+    }
+}
+
+inline float readImage(__global float *image,  int x,  int y,  int rows,  int cols, int elemCntPerRow)
+{
+    int i0 = clamp(x, 0, cols - 1);
+    int j0 = clamp(y, 0, rows - 1);
+
+    return image[j0 * elemCntPerRow + i0];
+}
+
+__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,
+    __global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,
+    __global const float* u1, int u1_step,
+    __global const float* u2,
+    __global float* I1w,
+    __global float* I1wx, /*int I1wx_step,*/
+    __global float* I1wy, /*int I1wy_step,*/
+    __global float* grad, /*int grad_step,*/
+    __global float* rho,
+    int I1w_step,
+    int u2_step,
+    int I1_step,
+    int I1x_step)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x < I0_col&&y < I0_row)
+    {
+        //float u1Val = u1(y, x);
+        float u1Val = u1[y * u1_step + x];
+        //float u2Val = u2(y, x);
+        float u2Val = u2[y * u2_step + x];
+
+        float wx = x + u1Val;
+        float wy = y + u2Val;
+
+        int xmin = ceil(wx - 2.0f);
+        int xmax = floor(wx + 2.0f);
+
+        int ymin = ceil(wy - 2.0f);
+        int ymax = floor(wy + 2.0f);
+
+        float sum  = 0.0f;
+        float sumx = 0.0f;
+        float sumy = 0.0f;
+        float wsum = 0.0f;
+
+        for (int cy = ymin; cy <= ymax; ++cy)
+        {
+            for (int cx = xmin; cx <= xmax; ++cx)
+            {
+                float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
+
+                int2 cood = (int2)(cx, cy);
+                sum += w * readImage(tex_I1, cood.x, cood.y, I0_col, I0_row, I1_step);
+                sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_col, I0_row, I1x_step);
+                sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_col, I0_row, I1x_step);
+                wsum += w;
+            }
+        }
+
+        float coeff = 1.0f / wsum;
+
+        float I1wVal  = sum  * coeff;
+        float I1wxVal = sumx * coeff;
+        float I1wyVal = sumy * coeff;
+
+        I1w[y * I1w_step + x]  = I1wVal;
+        I1wx[y * I1w_step + x] = I1wxVal;
+        I1wy[y * I1w_step + x] = I1wyVal;
+
+        float Ix2 = I1wxVal * I1wxVal;
+        float Iy2 = I1wyVal * I1wyVal;
+
+        // store the |Grad(I1)|^2
+        grad[y * I1w_step + x] = Ix2 + Iy2;
+
+        // compute the constant part of the rho function
+        float I0Val = I0[y * I0_step + x];
+        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
+    }
+
+}
+
+
+__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step,
+    __global const float* u2,
+    __global float* p11, int p11_step,
+    __global float* p12,
+    __global float* p21,
+    __global float* p22,
+    float taut,
+    int u2_step,
+    int u1_offset_x,
+    int u1_offset_y,
+    int u2_offset_x,
+    int u2_offset_y)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x < u1_col && y < u1_row)
+    {
+        int src_x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
+        float u1x = u1[(y + u1_offset_y) * u1_step + src_x1 + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+
+        int src_y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);
+        float u1y = u1[(src_y1 + u1_offset_y) * u1_step + x + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+
+        int src_x2 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
+        float u2x = u2[(y + u2_offset_y) * u2_step + src_x2 + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+
+        int src_y2 = (y + 1) <  (u1_row - 1) ? (y + 1) : (u1_row - 1);
+        float u2y = u2[(src_y2 + u2_offset_y) * u2_step + x + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+
+        float g1 = hypot(u1x, u1y);
+        float g2 = hypot(u2x, u2y);
+
+        float ng1 = 1.0f + taut * g1;
+        float ng2 = 1.0f + taut * g2;
+
+        p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1;
+        p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1;
+        p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2;
+        p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2;
+    }
+
+}
+
+inline float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)
+{
+
+    if (x > 0 && y > 0)
+    {
+        float v1x = v1[y * v1_step + x] - v1[y * v1_step + x - 1];
+        float v2y = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];
+        return v1x + v2y;
+    }
+    else
+    {
+        if (y > 0)
+            return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0];
+        else
+        {
+            if (x > 0)
+                return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x];
+            else
+                return v1[0 * v1_step + 0] + v2[0 * v2_step + 0];
+        }
+    }
+
+}
+
+__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,
+    __global const float* I1wy, /*int I1wy_step,*/
+    __global const float* grad, /*int grad_step,*/
+    __global const float* rho_c, /*int rho_c_step,*/
+    __global const float* p11, /*int p11_step,*/
+    __global const float* p12, /*int p12_step,*/
+    __global const float* p21, /*int p21_step,*/
+    __global const float* p22, /*int p22_step,*/
+    __global float* u1, int u1_step,
+    __global float* u2,
+    __global float* error, float l_t, float theta, int u2_step,
+    int u1_offset_x,
+    int u1_offset_y,
+    int u2_offset_x,
+    int u2_offset_y,
+    char calc_error)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x < I1wx_col && y < I1wx_row)
+    {
+        float I1wxVal = I1wx[y * I1wx_step + x];
+        float I1wyVal = I1wy[y * I1wx_step + x];
+        float gradVal = grad[y * I1wx_step + x];
+        float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+        float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+
+        float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
+
+        // estimate the values of the variable (v1, v2) (thresholding operator TH)
+
+        float d1 = 0.0f;
+        float d2 = 0.0f;
+
+        if (rho < -l_t * gradVal)
+        {
+            d1 = l_t * I1wxVal;
+            d2 = l_t * I1wyVal;
+        }
+        else if (rho > l_t * gradVal)
+        {
+            d1 = -l_t * I1wxVal;
+            d2 = -l_t * I1wyVal;
+        }
+        else if (gradVal > 1.192092896e-07f)
+        {
+            float fi = -rho / gradVal;
+            d1 = fi * I1wxVal;
+            d2 = fi * I1wyVal;
+        }
+
+        float v1 = u1OldVal + d1;
+        float v2 = u2OldVal + d2;
+
+        // compute the divergence of the dual variable (p1, p2)
+
+        float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);
+        float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);
+
+        // estimate the values of the optical flow (u1, u2)
+
+        float u1NewVal = v1 + theta * div_p1;
+        float u2NewVal = v2 + theta * div_p2;
+
+        u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal;
+        u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal;
+
+        if(calc_error)
+        {
+            float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
+            float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
+            error[y * I1wx_step + x] = n1 + n2;
+        }
+    }
+}
diff --git a/modules/video/src/tvl1flow.cpp b/modules/video/src/tvl1flow.cpp
index ddcc32b405..f57aa8082d 100644
--- a/modules/video/src/tvl1flow.cpp
+++ b/modules/video/src/tvl1flow.cpp
@@ -73,7 +73,14 @@
 */
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
 #include <limits>
+#include <iomanip>
+#include <iostream>
+#include "opencv2/core/opencl/ocl_defs.hpp"
+
+
 
 using namespace cv;
 
@@ -105,41 +112,254 @@ protected:
 private:
     void procOneScale(const Mat_<float>& I0, const Mat_<float>& I1, Mat_<float>& u1, Mat_<float>& u2);
 
-    std::vector<Mat_<float> > I0s;
-    std::vector<Mat_<float> > I1s;
-    std::vector<Mat_<float> > u1s;
-    std::vector<Mat_<float> > u2s;
+    bool procOneScale_ocl(const UMat& I0, const UMat& I1, UMat& u1, UMat& u2);
+
+    bool calc_ocl(InputArray I0, InputArray I1, InputOutputArray flow);
+    struct dataMat
+    {
+        std::vector<Mat_<float> > I0s;
+        std::vector<Mat_<float> > I1s;
+        std::vector<Mat_<float> > u1s;
+        std::vector<Mat_<float> > u2s;
+
+        Mat_<float> I1x_buf;
+        Mat_<float> I1y_buf;
+
+        Mat_<float> flowMap1_buf;
+        Mat_<float> flowMap2_buf;
+
+        Mat_<float> I1w_buf;
+        Mat_<float> I1wx_buf;
+        Mat_<float> I1wy_buf;
 
-    Mat_<float> I1x_buf;
-    Mat_<float> I1y_buf;
+        Mat_<float> grad_buf;
+        Mat_<float> rho_c_buf;
 
-    Mat_<float> flowMap1_buf;
-    Mat_<float> flowMap2_buf;
+        Mat_<float> v1_buf;
+        Mat_<float> v2_buf;
 
-    Mat_<float> I1w_buf;
-    Mat_<float> I1wx_buf;
-    Mat_<float> I1wy_buf;
+        Mat_<float> p11_buf;
+        Mat_<float> p12_buf;
+        Mat_<float> p21_buf;
+        Mat_<float> p22_buf;
 
-    Mat_<float> grad_buf;
-    Mat_<float> rho_c_buf;
+        Mat_<float> div_p1_buf;
+        Mat_<float> div_p2_buf;
 
-    Mat_<float> v1_buf;
-    Mat_<float> v2_buf;
+        Mat_<float> u1x_buf;
+        Mat_<float> u1y_buf;
+        Mat_<float> u2x_buf;
+        Mat_<float> u2y_buf;
+    } dm;
+    struct dataUMat
+    {
+        std::vector<UMat> I0s;
+        std::vector<UMat> I1s;
+        std::vector<UMat> u1s;
+        std::vector<UMat> u2s;
+
+        UMat I1x_buf;
+        UMat I1y_buf;
 
-    Mat_<float> p11_buf;
-    Mat_<float> p12_buf;
-    Mat_<float> p21_buf;
-    Mat_<float> p22_buf;
+        UMat I1w_buf;
+        UMat I1wx_buf;
+        UMat I1wy_buf;
 
-    Mat_<float> div_p1_buf;
-    Mat_<float> div_p2_buf;
+        UMat grad_buf;
+        UMat rho_c_buf;
 
-    Mat_<float> u1x_buf;
-    Mat_<float> u1y_buf;
-    Mat_<float> u2x_buf;
-    Mat_<float> u2y_buf;
+        UMat p11_buf;
+        UMat p12_buf;
+        UMat p21_buf;
+        UMat p22_buf;
+
+        UMat diff_buf;
+        UMat norm_buf;
+    } dum;
 };
 
+namespace cv_ocl_tvl1flow
+{
+    bool centeredGradient(const UMat &src, UMat &dx, UMat &dy);
+
+    bool warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UMat &I1y,
+        UMat &u1, UMat &u2, UMat &I1w, UMat &I1wx, UMat &I1wy,
+        UMat &grad, UMat &rho);
+
+    bool estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
+        UMat &rho_c, UMat &p11, UMat &p12,
+        UMat &p21, UMat &p22, UMat &u1,
+        UMat &u2, UMat &error, float l_t, float theta, char calc_error);
+
+    bool estimateDualVariables(UMat &u1, UMat &u2,
+        UMat &p11, UMat &p12, UMat &p21, UMat &p22, float taut);
+}
+
+bool cv_ocl_tvl1flow::centeredGradient(const UMat &src, UMat &dx, UMat &dy)
+{
+#ifdef ANDROID
+    size_t localsize[2] = { 32, 4 };
+#else
+    size_t localsize[2] = { 32, 8 };
+#endif
+    size_t globalsize[2] = { src.cols, src.rows };
+
+    ocl::Kernel kernel;
+    if (!kernel.create("centeredGradientKernel", cv::ocl::video::optical_flow_tvl1_oclsrc, ""))
+        return false;
+
+    int idxArg = 0;
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));//src mat
+    idxArg = kernel.set(idxArg, (int)(src.cols));//src mat col
+    idxArg = kernel.set(idxArg, (int)(src.rows));//src mat rows
+    idxArg = kernel.set(idxArg, (int)(src.step / src.elemSize()));//src mat step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dx));//res mat dx
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dy));//res mat dy
+    idxArg = kernel.set(idxArg, (int)(dx.step/dx.elemSize()));//res mat step
+    return kernel.run(2, globalsize, localsize, false);
+
+}
+
+bool cv_ocl_tvl1flow::warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UMat &I1y,
+    UMat &u1, UMat &u2, UMat &I1w, UMat &I1wx, UMat &I1wy,
+    UMat &grad, UMat &rho)
+{
+#ifdef ANDROID
+    size_t localsize[2] = { 32, 4 };
+#else
+    size_t localsize[2] = { 32, 8 };
+#endif
+    size_t globalsize[2] = { I0.cols, I0.rows };
+
+    ocl::Kernel kernel;
+    if (!kernel.create("warpBackwardKernel", cv::ocl::video::optical_flow_tvl1_oclsrc, ""))
+        return false;
+
+    int idxArg = 0;
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I0));//I0 mat
+    int I0_step = (int)(I0.step / I0.elemSize());
+    idxArg = kernel.set(idxArg, I0_step);//I0_step
+    idxArg = kernel.set(idxArg, (int)(I0.cols));//I0_col
+    idxArg = kernel.set(idxArg, (int)(I0.rows));//I0_row
+    ocl::Image2D imageI1(I1);
+    ocl::Image2D imageI1x(I1x);
+    ocl::Image2D imageI1y(I1y);
+    idxArg = kernel.set(idxArg, imageI1);//image2d_t tex_I1
+    idxArg = kernel.set(idxArg, imageI1x);//image2d_t tex_I1x
+    idxArg = kernel.set(idxArg, imageI1y);//image2d_t tex_I1y
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u1));//const float* u1
+    idxArg = kernel.set(idxArg, (int)(u1.step / u1.elemSize()));//int u1_step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u2));//const float* u2
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(I1w));///float* I1w
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(I1wx));//float* I1wx
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(I1wy));//float* I1wy
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(grad));//float* grad
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(rho));//float* rho
+    idxArg = kernel.set(idxArg, (int)(I1w.step / I1w.elemSize()));//I1w_step
+    idxArg = kernel.set(idxArg, (int)(u2.step / u2.elemSize()));//u2_step
+    int u1_offset_x = (int)((u1.offset) % (u1.step));
+    u1_offset_x = (int)(u1_offset_x / u1.elemSize());
+    idxArg = kernel.set(idxArg, (int)u1_offset_x );//u1_offset_x
+    idxArg = kernel.set(idxArg, (int)(u1.offset/u1.step));//u1_offset_y
+    int u2_offset_x = (int)((u2.offset) % (u2.step));
+    u2_offset_x = (int) (u2_offset_x / u2.elemSize());
+    idxArg = kernel.set(idxArg, (int)u2_offset_x);//u2_offset_x
+    idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step));//u2_offset_y
+
+    return kernel.run(2, globalsize, localsize, false);
+
+}
+
+bool cv_ocl_tvl1flow::estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
+    UMat &rho_c, UMat &p11, UMat &p12,
+    UMat &p21, UMat &p22, UMat &u1,
+    UMat &u2, UMat &error, float l_t, float theta, char calc_error)
+{
+#ifdef ANDROID
+    size_t localsize[2] = { 32, 4 };
+#else
+    size_t localsize[2] = { 32, 8 };
+#endif
+    size_t globalsize[2] = { I1wx.cols, I1wx.rows };
+
+    ocl::Kernel kernel;
+    if (!kernel.create("estimateUKernel", cv::ocl::video::optical_flow_tvl1_oclsrc, ""))
+        return false;
+
+    int idxArg = 0;
+
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I1wx)); //const float* I1wx
+    idxArg = kernel.set(idxArg, (int)(I1wx.cols)); //int I1wx_col
+    idxArg = kernel.set(idxArg, (int)(I1wx.rows)); //int I1wx_row
+    idxArg = kernel.set(idxArg, (int)(I1wx.step/I1wx.elemSize())); //int I1wx_step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I1wy)); //const float* I1wy
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(grad)); //const float* grad
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(rho_c)); //const float* rho_c
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p11)); //const float* p11
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p12)); //const float* p12
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p21)); //const float* p21
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p22)); //const float* p22
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(u1)); //float* u1
+    idxArg = kernel.set(idxArg, (int)(u1.step / u1.elemSize())); //int u1_step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(u2)); //float* u2
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(error)); //float* error
+    idxArg = kernel.set(idxArg, (float)l_t); //float l_t
+    idxArg = kernel.set(idxArg, (float)theta); //float theta
+    idxArg = kernel.set(idxArg, (int)(u2.step / u2.elemSize()));//int u2_step
+    int u1_offset_x = (int)(u1.offset % u1.step);
+    u1_offset_x = (int) (u1_offset_x  / u1.elemSize());
+    idxArg = kernel.set(idxArg, (int)u1_offset_x); //int u1_offset_x
+    idxArg = kernel.set(idxArg, (int)(u1.offset/u1.step)); //int u1_offset_y
+    int u2_offset_x = (int)(u2.offset % u2.step);
+    u2_offset_x = (int)(u2_offset_x / u2.elemSize());
+    idxArg = kernel.set(idxArg, (int)u2_offset_x ); //int u2_offset_x
+    idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
+    idxArg = kernel.set(idxArg, (char)calc_error);    //char calc_error
+
+    return kernel.run(2, globalsize, localsize, false);
+}
+
+bool cv_ocl_tvl1flow::estimateDualVariables(UMat &u1, UMat &u2,
+    UMat &p11, UMat &p12, UMat &p21, UMat &p22, float taut)
+{
+#ifdef ANDROID
+    size_t localsize[2] = { 32, 4 };
+#else
+    size_t localsize[2] = { 32, 8 };
+#endif
+    size_t globalsize[2] = { u1.cols, u1.rows };
+
+    ocl::Kernel kernel;
+    if (!kernel.create("estimateDualVariablesKernel", cv::ocl::video::optical_flow_tvl1_oclsrc, ""))
+        return false;
+
+    int idxArg = 0;
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u1));// const float* u1
+    idxArg = kernel.set(idxArg, (int)(u1.cols)); //int u1_col
+    idxArg = kernel.set(idxArg, (int)(u1.rows)); //int u1_row
+    idxArg = kernel.set(idxArg, (int)(u1.step/u1.elemSize())); //int u1_step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u2)); // const float* u2
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(p11)); // float* p11
+    idxArg = kernel.set(idxArg, (int)(p11.step/p11.elemSize())); //int p11_step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(p12)); // float* p12
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(p21)); // float* p21
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(p22)); // float* p22
+    idxArg = kernel.set(idxArg, (float)(taut));    //float taut
+    idxArg = kernel.set(idxArg, (int)(u2.step/u2.elemSize())); //int u2_step
+    int u1_offset_x = (int)(u1.offset % u1.step);
+    u1_offset_x = (int)(u1_offset_x / u1.elemSize());
+    idxArg = kernel.set(idxArg, u1_offset_x); //int u1_offset_x
+    idxArg = kernel.set(idxArg, (int)(u1.offset / u1.step)); //int u1_offset_y
+    int u2_offset_x = (int)(u2.offset % u2.step);
+    u2_offset_x = (int)(u2_offset_x / u2.elemSize());
+    idxArg = kernel.set(idxArg, u2_offset_x); //int u2_offset_x
+    idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
+
+    return kernel.run(2, globalsize, localsize, false);
+
+}
+
+
 OpticalFlowDual_TVL1::OpticalFlowDual_TVL1()
 {
     tau            = 0.25;
@@ -157,6 +377,8 @@ OpticalFlowDual_TVL1::OpticalFlowDual_TVL1()
 
 void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow)
 {
+    CV_OCL_RUN(_flow.isUMat(), calc_ocl(_I0, _I1, _flow))
+
     Mat I0 = _I0.getMat();
     Mat I1 = _I1.getMat();
 
@@ -167,59 +389,59 @@ void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray
     CV_Assert( nscales > 0 );
 
     // allocate memory for the pyramid structure
-    I0s.resize(nscales);
-    I1s.resize(nscales);
-    u1s.resize(nscales);
-    u2s.resize(nscales);
+    dm.I0s.resize(nscales);
+    dm.I1s.resize(nscales);
+    dm.u1s.resize(nscales);
+    dm.u2s.resize(nscales);
 
-    I0.convertTo(I0s[0], I0s[0].depth(), I0.depth() == CV_8U ? 1.0 : 255.0);
-    I1.convertTo(I1s[0], I1s[0].depth(), I1.depth() == CV_8U ? 1.0 : 255.0);
+    I0.convertTo(dm.I0s[0], dm.I0s[0].depth(), I0.depth() == CV_8U ? 1.0 : 255.0);
+    I1.convertTo(dm.I1s[0], dm.I1s[0].depth(), I1.depth() == CV_8U ? 1.0 : 255.0);
 
-    u1s[0].create(I0.size());
-    u2s[0].create(I0.size());
+    dm.u1s[0].create(I0.size());
+    dm.u2s[0].create(I0.size());
 
     if (useInitialFlow)
     {
-        Mat_<float> mv[] = {u1s[0], u2s[0]};
+        Mat_<float> mv[] = { dm.u1s[0], dm.u2s[0] };
         split(_flow.getMat(), mv);
     }
 
-    I1x_buf.create(I0.size());
-    I1y_buf.create(I0.size());
+    dm.I1x_buf.create(I0.size());
+    dm.I1y_buf.create(I0.size());
 
-    flowMap1_buf.create(I0.size());
-    flowMap2_buf.create(I0.size());
+    dm.flowMap1_buf.create(I0.size());
+    dm.flowMap2_buf.create(I0.size());
 
-    I1w_buf.create(I0.size());
-    I1wx_buf.create(I0.size());
-    I1wy_buf.create(I0.size());
+    dm.I1w_buf.create(I0.size());
+    dm.I1wx_buf.create(I0.size());
+    dm.I1wy_buf.create(I0.size());
 
-    grad_buf.create(I0.size());
-    rho_c_buf.create(I0.size());
+    dm.grad_buf.create(I0.size());
+    dm.rho_c_buf.create(I0.size());
 
-    v1_buf.create(I0.size());
-    v2_buf.create(I0.size());
+    dm.v1_buf.create(I0.size());
+    dm.v2_buf.create(I0.size());
 
-    p11_buf.create(I0.size());
-    p12_buf.create(I0.size());
-    p21_buf.create(I0.size());
-    p22_buf.create(I0.size());
+    dm.p11_buf.create(I0.size());
+    dm.p12_buf.create(I0.size());
+    dm.p21_buf.create(I0.size());
+    dm.p22_buf.create(I0.size());
 
-    div_p1_buf.create(I0.size());
-    div_p2_buf.create(I0.size());
+    dm.div_p1_buf.create(I0.size());
+    dm.div_p2_buf.create(I0.size());
 
-    u1x_buf.create(I0.size());
-    u1y_buf.create(I0.size());
-    u2x_buf.create(I0.size());
-    u2y_buf.create(I0.size());
+    dm.u1x_buf.create(I0.size());
+    dm.u1y_buf.create(I0.size());
+    dm.u2x_buf.create(I0.size());
+    dm.u2y_buf.create(I0.size());
 
     // create the scales
     for (int s = 1; s < nscales; ++s)
     {
-        resize(I0s[s-1], I0s[s], Size(), scaleStep, scaleStep);
-        resize(I1s[s-1], I1s[s], Size(), scaleStep, scaleStep);
+        resize(dm.I0s[s - 1], dm.I0s[s], Size(), scaleStep, scaleStep);
+        resize(dm.I1s[s - 1], dm.I1s[s], Size(), scaleStep, scaleStep);
 
-        if (I0s[s].cols < 16 || I0s[s].rows < 16)
+        if (dm.I0s[s].cols < 16 || dm.I0s[s].rows < 16)
         {
             nscales = s;
             break;
@@ -227,30 +449,30 @@ void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray
 
         if (useInitialFlow)
         {
-            resize(u1s[s-1], u1s[s], Size(), scaleStep, scaleStep);
-            resize(u2s[s-1], u2s[s], Size(), scaleStep, scaleStep);
+            resize(dm.u1s[s - 1], dm.u1s[s], Size(), scaleStep, scaleStep);
+            resize(dm.u2s[s - 1], dm.u2s[s], Size(), scaleStep, scaleStep);
 
-            multiply(u1s[s], Scalar::all(scaleStep), u1s[s]);
-            multiply(u2s[s], Scalar::all(scaleStep), u2s[s]);
+            multiply(dm.u1s[s], Scalar::all(scaleStep), dm.u1s[s]);
+            multiply(dm.u2s[s], Scalar::all(scaleStep), dm.u2s[s]);
         }
         else
         {
-            u1s[s].create(I0s[s].size());
-            u2s[s].create(I0s[s].size());
+            dm.u1s[s].create(dm.I0s[s].size());
+            dm.u2s[s].create(dm.I0s[s].size());
         }
     }
 
     if (!useInitialFlow)
     {
-        u1s[nscales-1].setTo(Scalar::all(0));
-        u2s[nscales-1].setTo(Scalar::all(0));
+        dm.u1s[nscales - 1].setTo(Scalar::all(0));
+        dm.u2s[nscales - 1].setTo(Scalar::all(0));
     }
 
     // pyramidal structure for computing the optical flow
     for (int s = nscales - 1; s >= 0; --s)
     {
         // compute the optical flow at the current scale
-        procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]);
+        procOneScale(dm.I0s[s], dm.I1s[s], dm.u1s[s], dm.u2s[s]);
 
         // if this was the last scale, finish now
         if (s == 0)
@@ -259,18 +481,118 @@ void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray
         // otherwise, upsample the optical flow
 
         // zoom the optical flow for the next finer scale
-        resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
-        resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
+        resize(dm.u1s[s], dm.u1s[s - 1], dm.I0s[s - 1].size());
+        resize(dm.u2s[s], dm.u2s[s - 1], dm.I0s[s - 1].size());
 
         // scale the optical flow with the appropriate zoom factor
-        multiply(u1s[s - 1], Scalar::all(1/scaleStep), u1s[s - 1]);
-        multiply(u2s[s - 1], Scalar::all(1/scaleStep), u2s[s - 1]);
+        multiply(dm.u1s[s - 1], Scalar::all(1 / scaleStep), dm.u1s[s - 1]);
+        multiply(dm.u2s[s - 1], Scalar::all(1 / scaleStep), dm.u2s[s - 1]);
     }
 
-    Mat uxy[] = {u1s[0], u2s[0]};
+    Mat uxy[] = { dm.u1s[0], dm.u2s[0] };
     merge(uxy, 2, _flow);
 }
 
+bool OpticalFlowDual_TVL1::calc_ocl(InputArray _I0, InputArray _I1, InputOutputArray _flow)
+{
+    UMat I0 = _I0.getUMat();
+    UMat I1 = _I1.getUMat();
+
+    CV_Assert(I0.type() == CV_8UC1 || I0.type() == CV_32FC1);
+    CV_Assert(I0.size() == I1.size());
+    CV_Assert(I0.type() == I1.type());
+    CV_Assert(!useInitialFlow || (_flow.size() == I0.size() && _flow.type() == CV_32FC2));
+    CV_Assert(nscales > 0);
+
+    // allocate memory for the pyramid structure
+    dum.I0s.resize(nscales);
+    dum.I1s.resize(nscales);
+    dum.u1s.resize(nscales);
+    dum.u2s.resize(nscales);
+    //I0s_step == I1s_step
+    double alpha = I0.depth() == CV_8U ? 1.0 : 255.0;
+
+    I0.convertTo(dum.I0s[0], CV_32F, alpha);
+    I1.convertTo(dum.I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
+
+    dum.u1s[0].create(I0.size(), CV_32FC1);
+    dum.u2s[0].create(I0.size(), CV_32FC1);
+
+    if (useInitialFlow)
+    {
+        std::vector<UMat> umv;
+        umv.push_back(dum.u1s[0]);
+        umv.push_back(dum.u2s[0]);
+        cv::split(_flow,umv);
+    }
+
+    dum.I1x_buf.create(I0.size(), CV_32FC1);
+    dum.I1y_buf.create(I0.size(), CV_32FC1);
+
+    dum.I1w_buf.create(I0.size(), CV_32FC1);
+    dum.I1wx_buf.create(I0.size(), CV_32FC1);
+    dum.I1wy_buf.create(I0.size(), CV_32FC1);
+
+    dum.grad_buf.create(I0.size(), CV_32FC1);
+    dum.rho_c_buf.create(I0.size(), CV_32FC1);
+
+    dum.p11_buf.create(I0.size(), CV_32FC1);
+    dum.p12_buf.create(I0.size(), CV_32FC1);
+    dum.p21_buf.create(I0.size(), CV_32FC1);
+    dum.p22_buf.create(I0.size(), CV_32FC1);
+
+    dum.diff_buf.create(I0.size(), CV_32FC1);
+
+    // create the scales
+    for (int s = 1; s < nscales; ++s)
+    {
+        resize(dum.I0s[s - 1], dum.I0s[s], Size(), scaleStep, scaleStep);
+        resize(dum.I1s[s - 1], dum.I1s[s], Size(), scaleStep, scaleStep);
+
+        if (dum.I0s[s].cols < 16 || dum.I0s[s].rows < 16)
+        {
+            nscales = s;
+            break;
+        }
+
+        if (useInitialFlow)
+        {
+            resize(dum.u1s[s - 1], dum.u1s[s], Size(), scaleStep, scaleStep);
+            resize(dum.u2s[s - 1], dum.u2s[s], Size(), scaleStep, scaleStep);
+
+            //scale by scale factor
+            multiply(dum.u1s[s], Scalar::all(scaleStep), dum.u1s[s]);
+            multiply(dum.u2s[s], Scalar::all(scaleStep), dum.u2s[s]);
+        }
+    }
+
+    // pyramidal structure for computing the optical flow
+    for (int s = nscales - 1; s >= 0; --s)
+    {
+        // compute the optical flow at the current scale
+        if (!OpticalFlowDual_TVL1::procOneScale_ocl(dum.I0s[s], dum.I1s[s], dum.u1s[s], dum.u2s[s]))
+            return false;
+
+        // if this was the last scale, finish now
+        if (s == 0)
+            break;
+
+        // zoom the optical flow for the next finer scale
+        resize(dum.u1s[s], dum.u1s[s - 1], dum.I0s[s - 1].size());
+        resize(dum.u2s[s], dum.u2s[s - 1], dum.I0s[s - 1].size());
+
+        // scale the optical flow with the appropriate zoom factor
+        multiply(dum.u1s[s - 1], Scalar::all(1 / scaleStep), dum.u1s[s - 1]);
+        multiply(dum.u2s[s - 1], Scalar::all(1 / scaleStep), dum.u2s[s - 1]);
+    }
+
+    std::vector<UMat> uxy;
+    uxy.push_back(dum.u1s[0]);
+    uxy.push_back(dum.u2s[0]);
+    merge(uxy, _flow);
+    return true;
+}
+
 ////////////////////////////////////////////////////////////
 // buildFlowMap
 
@@ -803,6 +1125,94 @@ void estimateDualVariables(const Mat_<float>& u1x, const Mat_<float>& u1y, const
     parallel_for_(Range(0, u1x.rows), body);
 }
 
+bool OpticalFlowDual_TVL1::procOneScale_ocl(const UMat& I0, const UMat& I1, UMat& u1, UMat& u2)
+{
+    using namespace cv_ocl_tvl1flow;
+
+    const double scaledEpsilon = epsilon * epsilon * I0.size().area();
+
+    CV_DbgAssert(I1.size() == I0.size());
+    CV_DbgAssert(I1.type() == I0.type());
+    CV_DbgAssert(u1.empty() || u1.size() == I0.size());
+    CV_DbgAssert(u2.size() == u1.size());
+
+    if (u1.empty())
+    {
+        u1.create(I0.size(), CV_32FC1);
+        u1.setTo(Scalar::all(0));
+
+        u2.create(I0.size(), CV_32FC1);
+        u2.setTo(Scalar::all(0));
+    }
+
+    UMat I1x = dum.I1x_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat I1y = dum.I1y_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    if (!centeredGradient(I1, I1x, I1y))
+        return false;
+
+    UMat I1w = dum.I1w_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat I1wx = dum.I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat I1wy = dum.I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    UMat grad = dum.grad_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat rho_c = dum.rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    UMat p11 = dum.p11_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat p12 = dum.p12_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat p21 = dum.p21_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat p22 = dum.p22_buf(Rect(0, 0, I0.cols, I0.rows));
+    p11.setTo(Scalar::all(0));
+    p12.setTo(Scalar::all(0));
+    p21.setTo(Scalar::all(0));
+    p22.setTo(Scalar::all(0));
+
+    UMat diff = dum.diff_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    const float l_t = static_cast<float>(lambda * theta);
+    const float taut = static_cast<float>(tau / theta);
+    int n;
+
+    for (int warpings = 0; warpings < warps; ++warpings)
+    {
+        if (!warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c))
+            return false;
+
+        double error = std::numeric_limits<double>::max();
+        double prev_error = 0;
+
+        for (int n_outer = 0; error > scaledEpsilon && n_outer < outerIterations; ++n_outer)
+        {
+            if (medianFiltering > 1) {
+                cv::medianBlur(u1, u1, medianFiltering);
+                cv::medianBlur(u2, u2, medianFiltering);
+            }
+            for (int n_inner = 0; error > scaledEpsilon && n_inner < innerIterations; ++n_inner)
+            {
+                // some tweaks to make sum operation less frequently
+                n = n_inner + n_outer*innerIterations;
+                char calc_error = (n & 0x1) && (prev_error < scaledEpsilon);
+                if (!estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22,
+                    u1, u2, diff, l_t, static_cast<float>(theta), calc_error))
+                    return false;
+                if (calc_error)
+                {
+                    error = cv::sum(diff)[0];
+                    prev_error = error;
+                }
+                else
+                {
+                    error = std::numeric_limits<double>::max();
+                    prev_error -= scaledEpsilon;
+                }
+                if (!estimateDualVariables(u1, u2, p11, p12, p21, p22, taut))
+                    return false;
+            }
+        }
+    }
+    return true;
+}
+
 void OpticalFlowDual_TVL1::procOneScale(const Mat_<float>& I0, const Mat_<float>& I1, Mat_<float>& u1, Mat_<float>& u2)
 {
     const float scaledEpsilon = static_cast<float>(epsilon * epsilon * I0.size().area());
@@ -812,39 +1222,39 @@ void OpticalFlowDual_TVL1::procOneScale(const Mat_<float>& I0, const Mat_<float>
     CV_DbgAssert( u1.size() == I0.size() );
     CV_DbgAssert( u2.size() == u1.size() );
 
-    Mat_<float> I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> I1x = dm.I1x_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> I1y = dm.I1y_buf(Rect(0, 0, I0.cols, I0.rows));
     centeredGradient(I1, I1x, I1y);
 
-    Mat_<float> flowMap1 = flowMap1_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> flowMap2 = flowMap2_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> flowMap1 = dm.flowMap1_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> flowMap2 = dm.flowMap2_buf(Rect(0, 0, I0.cols, I0.rows));
 
-    Mat_<float> I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> I1w = dm.I1w_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> I1wx = dm.I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> I1wy = dm.I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
 
-    Mat_<float> grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> grad = dm.grad_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> rho_c = dm.rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
 
-    Mat_<float> v1 = v1_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> v2 = v2_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> v1 = dm.v1_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> v2 = dm.v2_buf(Rect(0, 0, I0.cols, I0.rows));
 
-    Mat_<float> p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> p11 = dm.p11_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> p12 = dm.p12_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> p21 = dm.p21_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> p22 = dm.p22_buf(Rect(0, 0, I0.cols, I0.rows));
     p11.setTo(Scalar::all(0));
     p12.setTo(Scalar::all(0));
     p21.setTo(Scalar::all(0));
     p22.setTo(Scalar::all(0));
 
-    Mat_<float> div_p1 = div_p1_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> div_p2 = div_p2_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> div_p1 = dm.div_p1_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> div_p2 = dm.div_p2_buf(Rect(0, 0, I0.cols, I0.rows));
 
-    Mat_<float> u1x = u1x_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> u1y = u1y_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> u2x = u2x_buf(Rect(0, 0, I0.cols, I0.rows));
-    Mat_<float> u2y = u2y_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> u1x = dm.u1x_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> u1y = dm.u1y_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> u2x = dm.u2x_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> u2y = dm.u2y_buf(Rect(0, 0, I0.cols, I0.rows));
 
     const float l_t = static_cast<float>(lambda * theta);
     const float taut = static_cast<float>(tau / theta);
@@ -891,41 +1301,67 @@ void OpticalFlowDual_TVL1::procOneScale(const Mat_<float>& I0, const Mat_<float>
 
 void OpticalFlowDual_TVL1::collectGarbage()
 {
-    I0s.clear();
-    I1s.clear();
-    u1s.clear();
-    u2s.clear();
+    //dataMat structure dm
+    dm.I0s.clear();
+    dm.I1s.clear();
+    dm.u1s.clear();
+    dm.u2s.clear();
 
-    I1x_buf.release();
-    I1y_buf.release();
+    dm.I1x_buf.release();
+    dm.I1y_buf.release();
 
-    flowMap1_buf.release();
-    flowMap2_buf.release();
+    dm.flowMap1_buf.release();
+    dm.flowMap2_buf.release();
 
-    I1w_buf.release();
-    I1wx_buf.release();
-    I1wy_buf.release();
+    dm.I1w_buf.release();
+    dm.I1wx_buf.release();
+    dm.I1wy_buf.release();
 
-    grad_buf.release();
-    rho_c_buf.release();
+    dm.grad_buf.release();
+    dm.rho_c_buf.release();
 
-    v1_buf.release();
-    v2_buf.release();
+    dm.v1_buf.release();
+    dm.v2_buf.release();
 
-    p11_buf.release();
-    p12_buf.release();
-    p21_buf.release();
-    p22_buf.release();
+    dm.p11_buf.release();
+    dm.p12_buf.release();
+    dm.p21_buf.release();
+    dm.p22_buf.release();
 
-    div_p1_buf.release();
-    div_p2_buf.release();
+    dm.div_p1_buf.release();
+    dm.div_p2_buf.release();
 
-    u1x_buf.release();
-    u1y_buf.release();
-    u2x_buf.release();
-    u2y_buf.release();
+    dm.u1x_buf.release();
+    dm.u1y_buf.release();
+    dm.u2x_buf.release();
+    dm.u2y_buf.release();
+
+    //dataUMat structure dum
+    dum.I0s.clear();
+    dum.I1s.clear();
+    dum.u1s.clear();
+    dum.u2s.clear();
+
+    dum.I1x_buf.release();
+    dum.I1y_buf.release();
+
+    dum.I1w_buf.release();
+    dum.I1wx_buf.release();
+    dum.I1wy_buf.release();
+
+    dum.grad_buf.release();
+    dum.rho_c_buf.release();
+
+    dum.p11_buf.release();
+    dum.p12_buf.release();
+    dum.p21_buf.release();
+    dum.p22_buf.release();
+
+    dum.diff_buf.release();
+    dum.norm_buf.release();
 }
 
+
 CV_INIT_ALGORITHM(OpticalFlowDual_TVL1, "DenseOpticalFlow.DualTVL1",
                   obj.info()->addParam(obj, "tau", obj.tau, false, 0, 0,
                                        "Time step of the numerical scheme");
diff --git a/modules/video/test/ocl/test_optflow_tvl1flow.cpp b/modules/video/test/ocl/test_optflow_tvl1flow.cpp
new file mode 100644
index 0000000000..6e7150718b
--- /dev/null
+++ b/modules/video/test/ocl/test_optflow_tvl1flow.cpp
@@ -0,0 +1,117 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Optical_flow_tvl1
+namespace
+{
+    IMPLEMENT_PARAM_CLASS(UseInitFlow, bool)
+    IMPLEMENT_PARAM_CLASS(MedianFiltering, int)
+    IMPLEMENT_PARAM_CLASS(ScaleStep, double)
+}
+
+PARAM_TEST_CASE(OpticalFlowTVL1, UseInitFlow, MedianFiltering, ScaleStep)
+{
+    bool useInitFlow;
+    int medianFiltering;
+    double scaleStep;
+    virtual void SetUp()
+    {
+        useInitFlow = GET_PARAM(0);
+        medianFiltering = GET_PARAM(1);
+        scaleStep = GET_PARAM(2);
+    }
+};
+
+OCL_TEST_P(OpticalFlowTVL1, Mat)
+{
+    cv::Mat frame0 = readImage("optflow/RubberWhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage("optflow/RubberWhale2.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::Mat flow; cv::UMat uflow;
+
+    //create algorithm
+    cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
+
+    //set parameters
+    alg->set("scaleStep", scaleStep);
+    alg->setInt("medianFiltering", medianFiltering);
+
+    //create initial flow as result of algorithm calculation
+    if (useInitFlow)
+    {
+        OCL_ON(alg->calc(frame0, frame1, uflow));
+        uflow.copyTo(flow);
+    }
+
+    //set flag to use initial flow as it is ready to use
+    alg->setBool("useInitialFlow", useInitFlow);
+
+    OCL_OFF(alg->calc(frame0, frame1, flow));
+    OCL_ON(alg->calc(frame0, frame1, uflow));
+
+    EXPECT_MAT_SIMILAR(flow, uflow, 1e-2);
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(Video, OpticalFlowTVL1,
+    Combine(
+    Values(UseInitFlow(false), UseInitFlow(true)),
+    Values(MedianFiltering(3), MedianFiltering(-1)),
+    Values(ScaleStep(0.3),ScaleStep(0.5))
+    )
+    );
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
\ No newline at end of file

From e75a257f929079c9e21a0821092c7a00d137d2c2 Mon Sep 17 00:00:00 2001
From: mlyashko <maria.lyashko@itseez.com>
Date: Mon, 24 Feb 2014 17:47:56 +0400
Subject: [PATCH 2/6] changed epsilon for test passing

---
 modules/video/perf/opencl/perf_optflow_dualTVL1.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/modules/video/perf/opencl/perf_optflow_dualTVL1.cpp b/modules/video/perf/opencl/perf_optflow_dualTVL1.cpp
index 8b71b9ad8f..adf5e35832 100644
--- a/modules/video/perf/opencl/perf_optflow_dualTVL1.cpp
+++ b/modules/video/perf/opencl/perf_optflow_dualTVL1.cpp
@@ -66,10 +66,10 @@ OCL_PERF_TEST_P(OpticalFlowDualTVL1Fixture, OpticalFlowDualTVL1,
                                 )
             )
     {
-        Mat frame0 = imread(getDataPath("gpu/opticalflow/rubberwhale1.png"), cv::IMREAD_GRAYSCALE);
+        Mat frame0 = imread(getDataPath("cv/optflow/rubberwhale1.png"), cv::IMREAD_GRAYSCALE);
         ASSERT_FALSE(frame0.empty()) << "can't load rubberwhale1.png";
 
-        Mat frame1 = imread(getDataPath("gpu/opticalflow/rubberwhale2.png"), cv::IMREAD_GRAYSCALE);
+        Mat frame1 = imread(getDataPath("cv/optflow/rubberwhale2.png"), cv::IMREAD_GRAYSCALE);
         ASSERT_FALSE(frame1.empty()) << "can't load rubberwhale2.png";
 
         const Size srcSize = frame0.size();
@@ -79,7 +79,7 @@ OCL_PERF_TEST_P(OpticalFlowDualTVL1Fixture, OpticalFlowDualTVL1,
             const int medianFiltering = get<0>(filteringScale);
             const double scaleStep = get<1>(filteringScale);
         const bool useInitFlow = get<1>(params);
-        const double eps = 0.001;
+        double eps = 0.9;
 
         UMat uFrame0; frame0.copyTo(uFrame0);
         UMat uFrame1; frame1.copyTo(uFrame1);
@@ -96,7 +96,7 @@ OCL_PERF_TEST_P(OpticalFlowDualTVL1Fixture, OpticalFlowDualTVL1,
         if (useInitFlow)
         {
             //calculate initial flow as result of optical flow
-            OCL_ON(alg->calc(uFrame0, uFrame1, uFlow));
+            alg->calc(uFrame0, uFrame1, uFlow);
         }
 
         //set flag to use initial flow
@@ -106,8 +106,7 @@ OCL_PERF_TEST_P(OpticalFlowDualTVL1Fixture, OpticalFlowDualTVL1,
 
         SANITY_CHECK(uFlow, eps, ERROR_RELATIVE);
     }
-
-    }
+}
 } // namespace cvtest::ocl
 
 #endif // HAVE_OPENCL
\ No newline at end of file

From ba5ebaa7588e1442f76f89e0214f0dbecb2f9f49 Mon Sep 17 00:00:00 2001
From: mlyashko <maria.lyashko@itseez.com>
Date: Wed, 26 Feb 2014 10:55:18 +0400
Subject: [PATCH 3/6] changed file name to case-sensitve to support linux

---
 modules/video/perf/opencl/perf_optflow_dualTVL1.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/video/perf/opencl/perf_optflow_dualTVL1.cpp b/modules/video/perf/opencl/perf_optflow_dualTVL1.cpp
index adf5e35832..72b1b0cbb2 100644
--- a/modules/video/perf/opencl/perf_optflow_dualTVL1.cpp
+++ b/modules/video/perf/opencl/perf_optflow_dualTVL1.cpp
@@ -66,11 +66,11 @@ OCL_PERF_TEST_P(OpticalFlowDualTVL1Fixture, OpticalFlowDualTVL1,
                                 )
             )
     {
-        Mat frame0 = imread(getDataPath("cv/optflow/rubberwhale1.png"), cv::IMREAD_GRAYSCALE);
-        ASSERT_FALSE(frame0.empty()) << "can't load rubberwhale1.png";
+        Mat frame0 = imread(getDataPath("cv/optflow/RubberWhale1.png"), cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame0.empty()) << "can't load RubberWhale1.png";
 
-        Mat frame1 = imread(getDataPath("cv/optflow/rubberwhale2.png"), cv::IMREAD_GRAYSCALE);
-        ASSERT_FALSE(frame1.empty()) << "can't load rubberwhale2.png";
+        Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale2.png"), cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame1.empty()) << "can't load RubberWhale2.png";
 
         const Size srcSize = frame0.size();
 

From 5c867463d2ff7b2b8b6cdb876d674cecbcc484ef Mon Sep 17 00:00:00 2001
From: mlyashko <maria.lyashko@itseez.com>
Date: Tue, 4 Mar 2014 11:36:47 +0400
Subject: [PATCH 4/6] changed localsize to NULL (reviewers comment)

---
 modules/video/src/tvl1flow.cpp | 30 ++++--------------------------
 1 file changed, 4 insertions(+), 26 deletions(-)

diff --git a/modules/video/src/tvl1flow.cpp b/modules/video/src/tvl1flow.cpp
index f57aa8082d..097014577e 100644
--- a/modules/video/src/tvl1flow.cpp
+++ b/modules/video/src/tvl1flow.cpp
@@ -197,11 +197,6 @@ namespace cv_ocl_tvl1flow
 
 bool cv_ocl_tvl1flow::centeredGradient(const UMat &src, UMat &dx, UMat &dy)
 {
-#ifdef ANDROID
-    size_t localsize[2] = { 32, 4 };
-#else
-    size_t localsize[2] = { 32, 8 };
-#endif
     size_t globalsize[2] = { src.cols, src.rows };
 
     ocl::Kernel kernel;
@@ -216,19 +211,13 @@ bool cv_ocl_tvl1flow::centeredGradient(const UMat &src, UMat &dx, UMat &dy)
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dx));//res mat dx
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dy));//res mat dy
     idxArg = kernel.set(idxArg, (int)(dx.step/dx.elemSize()));//res mat step
-    return kernel.run(2, globalsize, localsize, false);
-
+    return kernel.run(2, globalsize, NULL, false);
 }
 
 bool cv_ocl_tvl1flow::warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UMat &I1y,
     UMat &u1, UMat &u2, UMat &I1w, UMat &I1wx, UMat &I1wy,
     UMat &grad, UMat &rho)
 {
-#ifdef ANDROID
-    size_t localsize[2] = { 32, 4 };
-#else
-    size_t localsize[2] = { 32, 8 };
-#endif
     size_t globalsize[2] = { I0.cols, I0.rows };
 
     ocl::Kernel kernel;
@@ -266,8 +255,7 @@ bool cv_ocl_tvl1flow::warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UM
     idxArg = kernel.set(idxArg, (int)u2_offset_x);//u2_offset_x
     idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step));//u2_offset_y
 
-    return kernel.run(2, globalsize, localsize, false);
-
+    return kernel.run(2, globalsize, NULL, false);
 }
 
 bool cv_ocl_tvl1flow::estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
@@ -275,11 +263,6 @@ bool cv_ocl_tvl1flow::estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
     UMat &p21, UMat &p22, UMat &u1,
     UMat &u2, UMat &error, float l_t, float theta, char calc_error)
 {
-#ifdef ANDROID
-    size_t localsize[2] = { 32, 4 };
-#else
-    size_t localsize[2] = { 32, 8 };
-#endif
     size_t globalsize[2] = { I1wx.cols, I1wx.rows };
 
     ocl::Kernel kernel;
@@ -316,17 +299,12 @@ bool cv_ocl_tvl1flow::estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
     idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
     idxArg = kernel.set(idxArg, (char)calc_error);    //char calc_error
 
-    return kernel.run(2, globalsize, localsize, false);
+    return kernel.run(2, globalsize, NULL, false);
 }
 
 bool cv_ocl_tvl1flow::estimateDualVariables(UMat &u1, UMat &u2,
     UMat &p11, UMat &p12, UMat &p21, UMat &p22, float taut)
 {
-#ifdef ANDROID
-    size_t localsize[2] = { 32, 4 };
-#else
-    size_t localsize[2] = { 32, 8 };
-#endif
     size_t globalsize[2] = { u1.cols, u1.rows };
 
     ocl::Kernel kernel;
@@ -355,7 +333,7 @@ bool cv_ocl_tvl1flow::estimateDualVariables(UMat &u1, UMat &u2,
     idxArg = kernel.set(idxArg, u2_offset_x); //int u2_offset_x
     idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
 
-    return kernel.run(2, globalsize, localsize, false);
+    return kernel.run(2, globalsize, NULL, false);
 
 }
 

From 66ed6a8a50e15c628e56a748b6c9defa5d660152 Mon Sep 17 00:00:00 2001
From: mlyashko <maria.lyashko@itseez.com>
Date: Thu, 6 Mar 2014 11:43:01 +0400
Subject: [PATCH 5/6] macro for normal return from thriveless kernel.set

---
 modules/core/include/opencv2/core/ocl.hpp | 11 +++
 modules/video/src/lkpyramid.cpp           | 10 ---
 modules/video/src/tvl1flow.cpp            | 99 ++++++++++++++++++++++-
 3 files changed, 106 insertions(+), 14 deletions(-)

diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index c6b0cf2d0f..abdc9c5d5b 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -599,6 +599,17 @@ protected:
 
 CV_EXPORTS MatAllocator* getOpenCLAllocator();
 
+#define SAFE_KERNEL_SET_ARG(idx, arg) \
+{\
+    int idxNew = kernel.set(idx, arg); \
+if (-1 == idxNew)\
+{\
+    printf("algorithm can't setup argument index = %d to kernel\n", idx); \
+    return false; \
+}\
+    idx = idxNew; \
+}
+
 }}
 
 #endif
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 598e69c888..71fc9590b3 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -706,16 +706,6 @@ namespace cv
             block.z = patch.z = 1;
         }
 
-        #define SAFE_KERNEL_SET_ARG(idx, arg) \
-        {\
-            int idxNew = kernel.set(idx, arg);\
-            if (-1 == idxNew)\
-            {\
-                printf("lkSparse_run can't setup argument index = %d to kernel\n", idx);\
-                return false;\
-            }\
-            idx = idxNew;\
-        }
         bool lkSparse_run(UMat &I, UMat &J, const UMat &prevPts, UMat &nextPts, UMat &status, UMat& err,
             int ptcount, int level)
         {
diff --git a/modules/video/src/tvl1flow.cpp b/modules/video/src/tvl1flow.cpp
index 097014577e..e1a5b7856e 100644
--- a/modules/video/src/tvl1flow.cpp
+++ b/modules/video/src/tvl1flow.cpp
@@ -204,6 +204,7 @@ bool cv_ocl_tvl1flow::centeredGradient(const UMat &src, UMat &dx, UMat &dy)
         return false;
 
     int idxArg = 0;
+#if 0
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));//src mat
     idxArg = kernel.set(idxArg, (int)(src.cols));//src mat col
     idxArg = kernel.set(idxArg, (int)(src.rows));//src mat rows
@@ -211,6 +212,15 @@ bool cv_ocl_tvl1flow::centeredGradient(const UMat &src, UMat &dx, UMat &dy)
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dx));//res mat dx
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dy));//res mat dy
     idxArg = kernel.set(idxArg, (int)(dx.step/dx.elemSize()));//res mat step
+#else
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(src));
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(src.cols));
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(src.rows));
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(src.step / src.elemSize()));
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(dx));
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(dy));
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(dx.step / dx.elemSize()));
+#endif
     return kernel.run(2, globalsize, NULL, false);
 }
 
@@ -225,6 +235,7 @@ bool cv_ocl_tvl1flow::warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UM
         return false;
 
     int idxArg = 0;
+#if 0
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I0));//I0 mat
     int I0_step = (int)(I0.step / I0.elemSize());
     idxArg = kernel.set(idxArg, I0_step);//I0_step
@@ -254,7 +265,37 @@ bool cv_ocl_tvl1flow::warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UM
     u2_offset_x = (int) (u2_offset_x / u2.elemSize());
     idxArg = kernel.set(idxArg, (int)u2_offset_x);//u2_offset_x
     idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step));//u2_offset_y
-
+#else
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(I0));//I0 mat
+    int I0_step = (int)(I0.step / I0.elemSize());
+    SAFE_KERNEL_SET_ARG(idxArg, I0_step);//I0_step
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(I0.cols));//I0_col
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(I0.rows));//I0_row
+    ocl::Image2D imageI1(I1);
+    ocl::Image2D imageI1x(I1x);
+    ocl::Image2D imageI1y(I1y);
+    SAFE_KERNEL_SET_ARG(idxArg, imageI1);//image2d_t tex_I1
+    SAFE_KERNEL_SET_ARG(idxArg, imageI1x);//image2d_t tex_I1x
+    SAFE_KERNEL_SET_ARG(idxArg, imageI1y);//image2d_t tex_I1y
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(u1));//const float* u1
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.step / u1.elemSize()));//int u1_step
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(u2));//const float* u2
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(I1w));///float* I1w
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(I1wx));//float* I1wx
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(I1wy));//float* I1wy
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(grad));//float* grad
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(rho));//float* rho
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(I1w.step / I1w.elemSize()));//I1w_step
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u2.step / u2.elemSize()));//u2_step
+    int u1_offset_x = (int)((u1.offset) % (u1.step));
+    u1_offset_x = (int)(u1_offset_x / u1.elemSize());
+    SAFE_KERNEL_SET_ARG(idxArg, (int)u1_offset_x);//u1_offset_x
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.offset / u1.step));//u1_offset_y
+    int u2_offset_x = (int)((u2.offset) % (u2.step));
+    u2_offset_x = (int)(u2_offset_x / u2.elemSize());
+    SAFE_KERNEL_SET_ARG(idxArg, (int)u2_offset_x);//u2_offset_x
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u2.offset / u2.step));//u2_offset_y
+#endif
     return kernel.run(2, globalsize, NULL, false);
 }
 
@@ -270,7 +311,7 @@ bool cv_ocl_tvl1flow::estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
         return false;
 
     int idxArg = 0;
-
+#if 0
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I1wx)); //const float* I1wx
     idxArg = kernel.set(idxArg, (int)(I1wx.cols)); //int I1wx_col
     idxArg = kernel.set(idxArg, (int)(I1wx.rows)); //int I1wx_row
@@ -298,7 +339,35 @@ bool cv_ocl_tvl1flow::estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
     idxArg = kernel.set(idxArg, (int)u2_offset_x ); //int u2_offset_x
     idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
     idxArg = kernel.set(idxArg, (char)calc_error);    //char calc_error
-
+#else
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(I1wx)); //const float* I1wx
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(I1wx.cols)); //int I1wx_col
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(I1wx.rows)); //int I1wx_row
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(I1wx.step / I1wx.elemSize())); //int I1wx_step
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(I1wy)); //const float* I1wy
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(grad)); //const float* grad
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(rho_c)); //const float* rho_c
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(p11)); //const float* p11
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(p12)); //const float* p12
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(p21)); //const float* p21
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(p22)); //const float* p22
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(u1)); //float* u1
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.step / u1.elemSize())); //int u1_step
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(u2)); //float* u2
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(error)); //float* error
+    SAFE_KERNEL_SET_ARG(idxArg, (float)l_t); //float l_t
+    SAFE_KERNEL_SET_ARG(idxArg, (float)theta); //float theta
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u2.step / u2.elemSize()));//int u2_step
+    int u1_offset_x = (int)(u1.offset % u1.step);
+    u1_offset_x = (int)(u1_offset_x / u1.elemSize());
+    SAFE_KERNEL_SET_ARG(idxArg, (int)u1_offset_x); //int u1_offset_x
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.offset / u1.step)); //int u1_offset_y
+    int u2_offset_x = (int)(u2.offset % u2.step);
+    u2_offset_x = (int)(u2_offset_x / u2.elemSize());
+    SAFE_KERNEL_SET_ARG(idxArg, (int)u2_offset_x); //int u2_offset_x
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
+    SAFE_KERNEL_SET_ARG(idxArg, (char)calc_error);    //char calc_error
+#endif
     return kernel.run(2, globalsize, NULL, false);
 }
 
@@ -312,6 +381,7 @@ bool cv_ocl_tvl1flow::estimateDualVariables(UMat &u1, UMat &u2,
         return false;
 
     int idxArg = 0;
+#if 0
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u1));// const float* u1
     idxArg = kernel.set(idxArg, (int)(u1.cols)); //int u1_col
     idxArg = kernel.set(idxArg, (int)(u1.rows)); //int u1_row
@@ -332,7 +402,28 @@ bool cv_ocl_tvl1flow::estimateDualVariables(UMat &u1, UMat &u2,
     u2_offset_x = (int)(u2_offset_x / u2.elemSize());
     idxArg = kernel.set(idxArg, u2_offset_x); //int u2_offset_x
     idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
-
+#else
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(u1));// const float* u1
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.cols)); //int u1_col
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.rows)); //int u1_row
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.step / u1.elemSize())); //int u1_step
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(u2)); // const float* u2
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(p11)); // float* p11
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(p11.step / p11.elemSize())); //int p11_step
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(p12)); // float* p12
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(p21)); // float* p21
+    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(p22)); // float* p22
+    SAFE_KERNEL_SET_ARG(idxArg, (float)(taut));    //float taut
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u2.step / u2.elemSize())); //int u2_step
+    int u1_offset_x = (int)(u1.offset % u1.step);
+    u1_offset_x = (int)(u1_offset_x / u1.elemSize());
+    SAFE_KERNEL_SET_ARG(idxArg, u1_offset_x); //int u1_offset_x
+    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.offset / u1.step)); //int u1_offset_y
+    int u2_offset_x = (int)(u2.offset % u2.step);
+    u2_offset_x = (int)(u2_offset_x / u2.elemSize());
+    SAFE_KERNEL_SET_ARG(idxArg, u2_offset_x); //int u2_offset_x
+    idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
+#endif
     return kernel.run(2, globalsize, NULL, false);
 
 }

From eb2e8a11faa496871b3c8d3737a5497db926ddb1 Mon Sep 17 00:00:00 2001
From: mlyashko <maria.lyashko@itseez.com>
Date: Fri, 7 Mar 2014 11:15:54 +0400
Subject: [PATCH 6/6] changed kernel.set to support unsuccssful set

---
 modules/core/include/opencv2/core/ocl.hpp | 11 ---
 modules/core/src/ocl.cpp                  |  6 +-
 modules/video/src/lkpyramid.cpp           | 21 -----
 modules/video/src/tvl1flow.cpp            | 98 +----------------------
 4 files changed, 6 insertions(+), 130 deletions(-)

diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index abdc9c5d5b..c6b0cf2d0f 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -599,17 +599,6 @@ protected:
 
 CV_EXPORTS MatAllocator* getOpenCLAllocator();
 
-#define SAFE_KERNEL_SET_ARG(idx, arg) \
-{\
-    int idxNew = kernel.set(idx, arg); \
-if (-1 == idxNew)\
-{\
-    printf("algorithm can't setup argument index = %d to kernel\n", idx); \
-    return false; \
-}\
-    idx = idxNew; \
-}
-
 }}
 
 #endif
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index c7f18dcb17..d34ea94fc0 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -2796,7 +2796,8 @@ int Kernel::set(int i, const void* value, size_t sz)
 {
     if (!p || !p->handle)
         return -1;
-    CV_Assert(i >= 0);
+    if (i < 0)
+        return i;
     if( i == 0 )
         p->cleanupUMats();
 
@@ -2822,7 +2823,8 @@ int Kernel::set(int i, const KernelArg& arg)
 {
     if( !p || !p->handle )
         return -1;
-    CV_Assert( i >= 0 );
+    if (i < 0)
+        return i;
     if( i == 0 )
         p->cleanupUMats();
     if( arg.m )
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 71fc9590b3..158ec279bc 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -726,7 +726,6 @@ namespace cv
             ocl::Image2D imageI(I);
             ocl::Image2D imageJ(J);
             int idxArg = 0;
-#if 0
             idxArg = kernel.set(idxArg, imageI); //image2d_t I
             idxArg = kernel.set(idxArg, imageJ); //image2d_t J
             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts
@@ -744,26 +743,6 @@ namespace cv
             idxArg = kernel.set(idxArg, (int)winSize.height); // int c_winSize_y
             idxArg = kernel.set(idxArg, (int)iters); // int c_iters
             idxArg = kernel.set(idxArg, (char)calcErr); //char calcErr
-#else
-            SAFE_KERNEL_SET_ARG(idxArg, imageI); //image2d_t I
-            SAFE_KERNEL_SET_ARG(idxArg, imageJ); //image2d_t J
-            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts
-            SAFE_KERNEL_SET_ARG(idxArg, (int)prevPts.step); // int prevPtsStep
-            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global const float2* nextPts
-            SAFE_KERNEL_SET_ARG(idxArg, (int)nextPts.step); //  int nextPtsStep
-            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(status)); // __global uchar* status
-            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(err)); // __global float* err
-            SAFE_KERNEL_SET_ARG(idxArg, (int)level); // const int level
-            SAFE_KERNEL_SET_ARG(idxArg, (int)I.rows); // const int rows
-            SAFE_KERNEL_SET_ARG(idxArg, (int)I.cols); // const int cols
-            SAFE_KERNEL_SET_ARG(idxArg, (int)patch.x); // int PATCH_X
-            SAFE_KERNEL_SET_ARG(idxArg, (int)patch.y); // int PATCH_Y
-            SAFE_KERNEL_SET_ARG(idxArg, (int)winSize.width); // int c_winSize_x
-            SAFE_KERNEL_SET_ARG(idxArg, (int)winSize.height); // int c_winSize_y
-            SAFE_KERNEL_SET_ARG(idxArg, (int)iters); // int c_iters
-            SAFE_KERNEL_SET_ARG(idxArg, (char)calcErr); //char calcErr
-#endif
-
             return kernel.run(2, globalThreads, localThreads, true);
         }
     private:
diff --git a/modules/video/src/tvl1flow.cpp b/modules/video/src/tvl1flow.cpp
index e1a5b7856e..fad73ef65d 100644
--- a/modules/video/src/tvl1flow.cpp
+++ b/modules/video/src/tvl1flow.cpp
@@ -204,7 +204,6 @@ bool cv_ocl_tvl1flow::centeredGradient(const UMat &src, UMat &dx, UMat &dy)
         return false;
 
     int idxArg = 0;
-#if 0
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));//src mat
     idxArg = kernel.set(idxArg, (int)(src.cols));//src mat col
     idxArg = kernel.set(idxArg, (int)(src.rows));//src mat rows
@@ -212,15 +211,6 @@ bool cv_ocl_tvl1flow::centeredGradient(const UMat &src, UMat &dx, UMat &dy)
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dx));//res mat dx
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dy));//res mat dy
     idxArg = kernel.set(idxArg, (int)(dx.step/dx.elemSize()));//res mat step
-#else
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(src));
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(src.cols));
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(src.rows));
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(src.step / src.elemSize()));
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(dx));
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(dy));
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(dx.step / dx.elemSize()));
-#endif
     return kernel.run(2, globalsize, NULL, false);
 }
 
@@ -235,7 +225,6 @@ bool cv_ocl_tvl1flow::warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UM
         return false;
 
     int idxArg = 0;
-#if 0
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I0));//I0 mat
     int I0_step = (int)(I0.step / I0.elemSize());
     idxArg = kernel.set(idxArg, I0_step);//I0_step
@@ -265,37 +254,6 @@ bool cv_ocl_tvl1flow::warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UM
     u2_offset_x = (int) (u2_offset_x / u2.elemSize());
     idxArg = kernel.set(idxArg, (int)u2_offset_x);//u2_offset_x
     idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step));//u2_offset_y
-#else
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(I0));//I0 mat
-    int I0_step = (int)(I0.step / I0.elemSize());
-    SAFE_KERNEL_SET_ARG(idxArg, I0_step);//I0_step
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(I0.cols));//I0_col
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(I0.rows));//I0_row
-    ocl::Image2D imageI1(I1);
-    ocl::Image2D imageI1x(I1x);
-    ocl::Image2D imageI1y(I1y);
-    SAFE_KERNEL_SET_ARG(idxArg, imageI1);//image2d_t tex_I1
-    SAFE_KERNEL_SET_ARG(idxArg, imageI1x);//image2d_t tex_I1x
-    SAFE_KERNEL_SET_ARG(idxArg, imageI1y);//image2d_t tex_I1y
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(u1));//const float* u1
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.step / u1.elemSize()));//int u1_step
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(u2));//const float* u2
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(I1w));///float* I1w
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(I1wx));//float* I1wx
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(I1wy));//float* I1wy
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(grad));//float* grad
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(rho));//float* rho
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(I1w.step / I1w.elemSize()));//I1w_step
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u2.step / u2.elemSize()));//u2_step
-    int u1_offset_x = (int)((u1.offset) % (u1.step));
-    u1_offset_x = (int)(u1_offset_x / u1.elemSize());
-    SAFE_KERNEL_SET_ARG(idxArg, (int)u1_offset_x);//u1_offset_x
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.offset / u1.step));//u1_offset_y
-    int u2_offset_x = (int)((u2.offset) % (u2.step));
-    u2_offset_x = (int)(u2_offset_x / u2.elemSize());
-    SAFE_KERNEL_SET_ARG(idxArg, (int)u2_offset_x);//u2_offset_x
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u2.offset / u2.step));//u2_offset_y
-#endif
     return kernel.run(2, globalsize, NULL, false);
 }
 
@@ -311,7 +269,6 @@ bool cv_ocl_tvl1flow::estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
         return false;
 
     int idxArg = 0;
-#if 0
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I1wx)); //const float* I1wx
     idxArg = kernel.set(idxArg, (int)(I1wx.cols)); //int I1wx_col
     idxArg = kernel.set(idxArg, (int)(I1wx.rows)); //int I1wx_row
@@ -339,35 +296,7 @@ bool cv_ocl_tvl1flow::estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
     idxArg = kernel.set(idxArg, (int)u2_offset_x ); //int u2_offset_x
     idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
     idxArg = kernel.set(idxArg, (char)calc_error);    //char calc_error
-#else
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(I1wx)); //const float* I1wx
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(I1wx.cols)); //int I1wx_col
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(I1wx.rows)); //int I1wx_row
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(I1wx.step / I1wx.elemSize())); //int I1wx_step
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(I1wy)); //const float* I1wy
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(grad)); //const float* grad
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(rho_c)); //const float* rho_c
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(p11)); //const float* p11
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(p12)); //const float* p12
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(p21)); //const float* p21
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(p22)); //const float* p22
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(u1)); //float* u1
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.step / u1.elemSize())); //int u1_step
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(u2)); //float* u2
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrWriteOnly(error)); //float* error
-    SAFE_KERNEL_SET_ARG(idxArg, (float)l_t); //float l_t
-    SAFE_KERNEL_SET_ARG(idxArg, (float)theta); //float theta
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u2.step / u2.elemSize()));//int u2_step
-    int u1_offset_x = (int)(u1.offset % u1.step);
-    u1_offset_x = (int)(u1_offset_x / u1.elemSize());
-    SAFE_KERNEL_SET_ARG(idxArg, (int)u1_offset_x); //int u1_offset_x
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.offset / u1.step)); //int u1_offset_y
-    int u2_offset_x = (int)(u2.offset % u2.step);
-    u2_offset_x = (int)(u2_offset_x / u2.elemSize());
-    SAFE_KERNEL_SET_ARG(idxArg, (int)u2_offset_x); //int u2_offset_x
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
-    SAFE_KERNEL_SET_ARG(idxArg, (char)calc_error);    //char calc_error
-#endif
+
     return kernel.run(2, globalsize, NULL, false);
 }
 
@@ -381,7 +310,6 @@ bool cv_ocl_tvl1flow::estimateDualVariables(UMat &u1, UMat &u2,
         return false;
 
     int idxArg = 0;
-#if 0
     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u1));// const float* u1
     idxArg = kernel.set(idxArg, (int)(u1.cols)); //int u1_col
     idxArg = kernel.set(idxArg, (int)(u1.rows)); //int u1_row
@@ -402,33 +330,11 @@ bool cv_ocl_tvl1flow::estimateDualVariables(UMat &u1, UMat &u2,
     u2_offset_x = (int)(u2_offset_x / u2.elemSize());
     idxArg = kernel.set(idxArg, u2_offset_x); //int u2_offset_x
     idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
-#else
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(u1));// const float* u1
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.cols)); //int u1_col
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.rows)); //int u1_row
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.step / u1.elemSize())); //int u1_step
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(u2)); // const float* u2
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(p11)); // float* p11
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(p11.step / p11.elemSize())); //int p11_step
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(p12)); // float* p12
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(p21)); // float* p21
-    SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(p22)); // float* p22
-    SAFE_KERNEL_SET_ARG(idxArg, (float)(taut));    //float taut
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u2.step / u2.elemSize())); //int u2_step
-    int u1_offset_x = (int)(u1.offset % u1.step);
-    u1_offset_x = (int)(u1_offset_x / u1.elemSize());
-    SAFE_KERNEL_SET_ARG(idxArg, u1_offset_x); //int u1_offset_x
-    SAFE_KERNEL_SET_ARG(idxArg, (int)(u1.offset / u1.step)); //int u1_offset_y
-    int u2_offset_x = (int)(u2.offset % u2.step);
-    u2_offset_x = (int)(u2_offset_x / u2.elemSize());
-    SAFE_KERNEL_SET_ARG(idxArg, u2_offset_x); //int u2_offset_x
-    idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
-#endif
+
     return kernel.run(2, globalsize, NULL, false);
 
 }
 
-
 OpticalFlowDual_TVL1::OpticalFlowDual_TVL1()
 {
     tau            = 0.25;