Merge pull request #1062 from pengli:dis_optflow

8 years ago · 05eb9f35c1
parent 1ecbb8f0a2 7ed6f77840
commit 05eb9f35c1
5 changed files with 1120 additions and 1 deletions
--- a/modules/optflow/perf/opencl/perf_dis_optflow.cpp
+++ b/modules/optflow/perf/opencl/perf_dis_optflow.cpp
@ -0,0 +1,113 @@
+/*
+ *  By downloading, copying, installing or using the software you agree to this license.
+ *  If you do not agree to this license, do not download, install,
+ *  copy or use the software.
+ *
+ *
+ *  License Agreement
+ *  For Open Source Computer Vision Library
+ *  (3 - clause BSD License)
+ *
+ *  Redistribution and use in source and binary forms, with or without modification,
+ *  are permitted provided that the following conditions are met :
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *  this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *  this list of conditions and the following disclaimer in the documentation
+ *  and / or other materials provided with the distribution.
+ *
+ *  * Neither the names of the copyright holders nor the names of the contributors
+ *  may be used to endorse or promote products derived from this software
+ *  without specific prior written permission.
+ *
+ *  This software is provided by the copyright holders and contributors "as is" and
+ *  any express or implied warranties, including, but not limited to, the implied
+ *  warranties of merchantability and fitness for a particular purpose are disclaimed.
+ *  In no event shall copyright holders or contributors be liable for any direct,
+ *  indirect, incidental, special, exemplary, or consequential damages
+ *  (including, but not limited to, procurement of substitute goods or services;
+ *  loss of use, data, or profits; or business interruption) however caused
+ *  and on any theory of liability, whether in contract, strict liability,
+ *  or tort(including negligence or otherwise) arising in any way out of
+ *  the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "../perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+using std::tr1::tuple;
+using std::tr1::get;
+using namespace perf;
+using namespace testing;
+using namespace cv;
+using namespace cv::optflow;
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+void MakeArtificialExample(UMat &dst_frame1, UMat &dst_frame2);
+
+typedef tuple<String, Size> DISParams;
+typedef TestBaseWithParam<DISParams> DenseOpticalFlow_DIS;
+
+OCL_PERF_TEST_P(DenseOpticalFlow_DIS, perf,
+                Combine(Values("PRESET_ULTRAFAST", "PRESET_FAST", "PRESET_MEDIUM"), Values(szVGA, sz720p, sz1080p)))
+{
+    DISParams params = GetParam();
+
+    // use strings to print preset names in the perf test results:
+    String preset_string = get<0>(params);
+    int preset = DISOpticalFlow::PRESET_FAST;
+    if (preset_string == "PRESET_ULTRAFAST")
+        preset = DISOpticalFlow::PRESET_ULTRAFAST;
+    else if (preset_string == "PRESET_FAST")
+        preset = DISOpticalFlow::PRESET_FAST;
+    else if (preset_string == "PRESET_MEDIUM")
+        preset = DISOpticalFlow::PRESET_MEDIUM;
+    Size sz = get<1>(params);
+
+    UMat frame1(sz, CV_8U);
+    UMat frame2(sz, CV_8U);
+    UMat flow;
+
+    MakeArtificialExample(frame1, frame2);
+
+    Ptr<DenseOpticalFlow> algo = createOptFlow_DIS(preset);
+
+    OCL_TEST_CYCLE_N(10)
+    {
+        algo->calc(frame1, frame2, flow);
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+void MakeArtificialExample(UMat &dst_frame1, UMat &dst_frame2)
+{
+    int src_scale = 2;
+    int OF_scale = 6;
+    double sigma = dst_frame1.cols / 300;
+
+    UMat tmp(Size(dst_frame1.cols / (int)pow(2, src_scale), dst_frame1.rows / (int)pow(2, src_scale)), CV_8U);
+    randu(tmp, 0, 255);
+    resize(tmp, dst_frame1, dst_frame1.size(), 0.0, 0.0, INTER_LINEAR);
+    resize(tmp, dst_frame2, dst_frame2.size(), 0.0, 0.0, INTER_LINEAR);
+
+    Mat displacement_field(Size(dst_frame1.cols / (int)pow(2, OF_scale), dst_frame1.rows / (int)pow(2, OF_scale)),
+                           CV_32FC2);
+    randn(displacement_field, 0.0, sigma);
+    resize(displacement_field, displacement_field, dst_frame2.size(), 0.0, 0.0, INTER_CUBIC);
+    for (int i = 0; i < displacement_field.rows; i++)
+        for (int j = 0; j < displacement_field.cols; j++)
+            displacement_field.at<Vec2f>(i, j) += Vec2f((float)j, (float)i);
+
+    remap(dst_frame2, dst_frame2, displacement_field, Mat(), INTER_LINEAR, BORDER_REPLICATE);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
--- a/modules/optflow/src/dis_flow.cpp
+++ b/modules/optflow/src/dis_flow.cpp
@ -42,6 +42,8 @@

 #include "opencv2/core/hal/intrin.hpp"
 #include "precomp.hpp"
+#include "opencl_kernels_optflow.hpp"
+
 using namespace std;
 #define EPS 0.001F
 #define INF 1E+10F
@ -166,6 +168,50 @@ class DISOpticalFlowImpl : public DISOpticalFlow
                              Mat &src_Sy, Mat &_I0, Mat &_I1);
        void operator()(const Range &range) const;
    };
+
+#ifdef HAVE_OPENCL
+    vector<UMat> u_I0s;     //!< Gaussian pyramid for the current frame
+    vector<UMat> u_I1s;     //!< Gaussian pyramid for the next frame
+    vector<UMat> u_I1s_ext; //!< I1s with borders
+
+    vector<UMat> u_I0xs; //!< Gaussian pyramid for the x gradient of the current frame
+    vector<UMat> u_I0ys; //!< Gaussian pyramid for the y gradient of the current frame
+
+    vector<UMat> u_Ux; //!< x component of the flow vectors
+    vector<UMat> u_Uy; //!< y component of the flow vectors
+
+    vector<UMat> u_initial_Ux; //!< x component of the initial flow field, if one was passed as an input
+    vector<UMat> u_initial_Uy; //!< y component of the initial flow field, if one was passed as an input
+
+    UMat u_U; //!< a buffer for the merged flow
+
+    UMat u_Sx; //!< intermediate sparse flow representation (x component)
+    UMat u_Sy; //!< intermediate sparse flow representation (y component)
+
+    /* Structure tensor components: */
+    UMat u_I0xx_buf; //!< sum of squares of x gradient values
+    UMat u_I0yy_buf; //!< sum of squares of y gradient values
+    UMat u_I0xy_buf; //!< sum of x and y gradient products
+
+    /* Extra buffers that are useful if patch mean-normalization is used: */
+    UMat u_I0x_buf; //!< sum of x gradient values
+    UMat u_I0y_buf; //!< sum of y gradient values
+
+    /* Auxiliary buffers used in structure tensor computation: */
+    UMat u_I0xx_buf_aux;
+    UMat u_I0yy_buf_aux;
+    UMat u_I0xy_buf_aux;
+    UMat u_I0x_buf_aux;
+    UMat u_I0y_buf_aux;
+
+    bool ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
+                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y);
+    void ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool use_flow);
+    bool ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow);
+    bool ocl_Densification(UMat &dst_Ux, UMat &dst_Uy, UMat &src_Sx, UMat &src_Sy, UMat &_I0, UMat &_I1);
+    bool ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
+                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level);
+#endif
 };

 DISOpticalFlowImpl::DISOpticalFlowImpl()
@ -1004,6 +1050,319 @@ void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) c
 #undef UPDATE_SPARSE_J_COORDINATES
 }

+#ifdef HAVE_OPENCL
+bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
+                                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level)
+{
+    size_t globalSize[] = {(size_t)ws, (size_t)hs};
+    size_t localSize[]  = {16, 16};
+    int idx;
+    int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter);
+
+    for (int iter = 0; iter < num_iter; iter++)
+    {
+        if (iter == 0)
+        {
+            ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::optflow::dis_flow_oclsrc);
+            size_t global_sz[] = {(size_t)hs * 8};
+            size_t local_sz[]  = {8};
+            idx = 0;
+
+            idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(src_Ux));
+            idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(src_Uy));
+            idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(I0));
+            idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(I1));
+            idx = k1.set(idx, (int)border_size);
+            idx = k1.set(idx, (int)patch_size);
+            idx = k1.set(idx, (int)patch_stride);
+            idx = k1.set(idx, (int)w);
+            idx = k1.set(idx, (int)h);
+            idx = k1.set(idx, (int)ws);
+            idx = k1.set(idx, (int)hs);
+            idx = k1.set(idx, (int)pyr_level);
+            idx = k1.set(idx, ocl::KernelArg::PtrWriteOnly(u_Sx));
+            idx = k1.set(idx, ocl::KernelArg::PtrWriteOnly(u_Sy));
+            if (!k1.run(1, global_sz, local_sz, false))
+                return false;
+
+            ocl::Kernel k2("dis_patch_inverse_search_fwd_2", ocl::optflow::dis_flow_oclsrc);
+            idx = 0;
+
+            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(src_Ux));
+            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(src_Uy));
+            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I0));
+            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I1));
+            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I0x));
+            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I0y));
+            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xx_buf));
+            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0yy_buf));
+            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xy_buf));
+            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0x_buf));
+            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0y_buf));
+            idx = k2.set(idx, (int)border_size);
+            idx = k2.set(idx, (int)patch_size);
+            idx = k2.set(idx, (int)patch_stride);
+            idx = k2.set(idx, (int)w);
+            idx = k2.set(idx, (int)h);
+            idx = k2.set(idx, (int)ws);
+            idx = k2.set(idx, (int)hs);
+            idx = k2.set(idx, (int)num_inner_iter);
+            idx = k2.set(idx, (int)pyr_level);
+            idx = k2.set(idx, ocl::KernelArg::PtrReadWrite(u_Sx));
+            idx = k2.set(idx, ocl::KernelArg::PtrReadWrite(u_Sy));
+            if (!k2.run(2, globalSize, localSize, false))
+                return false;
+        }
+        else
+        {
+            ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::optflow::dis_flow_oclsrc);
+            size_t global_sz[] = {(size_t)hs * 8};
+            size_t local_sz[]  = {8};
+            idx = 0;
+
+            idx = k3.set(idx, ocl::KernelArg::PtrReadOnly(I0));
+            idx = k3.set(idx, ocl::KernelArg::PtrReadOnly(I1));
+            idx = k3.set(idx, (int)border_size);
+            idx = k3.set(idx, (int)patch_size);
+            idx = k3.set(idx, (int)patch_stride);
+            idx = k3.set(idx, (int)w);
+            idx = k3.set(idx, (int)h);
+            idx = k3.set(idx, (int)ws);
+            idx = k3.set(idx, (int)hs);
+            idx = k3.set(idx, (int)pyr_level);
+            idx = k3.set(idx, ocl::KernelArg::PtrReadWrite(u_Sx));
+            idx = k3.set(idx, ocl::KernelArg::PtrReadWrite(u_Sy));
+            if (!k3.run(1, global_sz, local_sz, false))
+                return false;
+
+            ocl::Kernel k4("dis_patch_inverse_search_bwd_2", ocl::optflow::dis_flow_oclsrc);
+            idx = 0;
+
+            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I0));
+            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I1));
+            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I0x));
+            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I0y));
+            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xx_buf));
+            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0yy_buf));
+            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xy_buf));
+            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0x_buf));
+            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0y_buf));
+            idx = k4.set(idx, (int)border_size);
+            idx = k4.set(idx, (int)patch_size);
+            idx = k4.set(idx, (int)patch_stride);
+            idx = k4.set(idx, (int)w);
+            idx = k4.set(idx, (int)h);
+            idx = k4.set(idx, (int)ws);
+            idx = k4.set(idx, (int)hs);
+            idx = k4.set(idx, (int)num_inner_iter);
+            idx = k4.set(idx, ocl::KernelArg::PtrReadWrite(u_Sx));
+            idx = k4.set(idx, ocl::KernelArg::PtrReadWrite(u_Sy));
+            if (!k4.run(2, globalSize, localSize, false))
+                return false;
+        }
+    }
+    return true;
+}
+
+bool DISOpticalFlowImpl::ocl_Densification(UMat &dst_Ux, UMat &dst_Uy, UMat &src_Sx, UMat &src_Sy, UMat &_I0, UMat &_I1)
+{
+    size_t globalSize[] = {(size_t)w, (size_t)h};
+    size_t localSize[]  = {16, 16};
+
+    ocl::Kernel kernel("dis_densification", ocl::optflow::dis_flow_oclsrc);
+    kernel.args(ocl::KernelArg::PtrReadOnly(src_Sx),
+                ocl::KernelArg::PtrReadOnly(src_Sy),
+                ocl::KernelArg::PtrReadOnly(_I0),
+                ocl::KernelArg::PtrReadOnly(_I1),
+                (int)patch_size, (int)patch_stride,
+                (int)w, (int)h, (int)ws,
+                ocl::KernelArg::PtrWriteOnly(dst_Ux),
+                ocl::KernelArg::PtrWriteOnly(dst_Uy));
+    return kernel.run(2, globalSize, localSize, false);
+}
+
+void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool use_flow)
+{
+    u_I0s.resize(coarsest_scale + 1);
+    u_I1s.resize(coarsest_scale + 1);
+    u_I1s_ext.resize(coarsest_scale + 1);
+    u_I0xs.resize(coarsest_scale + 1);
+    u_I0ys.resize(coarsest_scale + 1);
+    u_Ux.resize(coarsest_scale + 1);
+    u_Uy.resize(coarsest_scale + 1);
+
+    vector<UMat> flow_uv(2);
+    if (use_flow)
+    {
+        split(flow, flow_uv);
+        u_initial_Ux.resize(coarsest_scale + 1);
+        u_initial_Uy.resize(coarsest_scale + 1);
+    }
+
+    int fraction = 1;
+    int cur_rows = 0, cur_cols = 0;
+
+    for (int i = 0; i <= coarsest_scale; i++)
+    {
+        /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
+        if (i == finest_scale)
+        {
+            cur_rows = I0.rows / fraction;
+            cur_cols = I0.cols / fraction;
+            u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
+            resize(I0, u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
+            u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
+            resize(I1, u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
+
+            /* These buffers are reused in each scale so we initialize them once on the finest scale: */
+            u_Sx.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
+            u_Sy.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
+            u_I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
+            u_I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
+            u_I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
+            u_I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
+            u_I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
+
+            u_I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
+            u_I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
+            u_I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
+            u_I0x_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
+            u_I0y_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
+
+            u_U.create(cur_rows, cur_cols, CV_32FC2);
+        }
+        else if (i > finest_scale)
+        {
+            cur_rows = u_I0s[i - 1].rows / 2;
+            cur_cols = u_I0s[i - 1].cols / 2;
+            u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
+            resize(u_I0s[i - 1], u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
+            u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
+            resize(u_I1s[i - 1], u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
+        }
+
+        if (i >= finest_scale)
+        {
+            u_I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size, CV_8UC1);
+            copyMakeBorder(u_I1s[i], u_I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
+            u_I0xs[i].create(cur_rows, cur_cols, CV_16SC1);
+            u_I0ys[i].create(cur_rows, cur_cols, CV_16SC1);
+            spatialGradient(u_I0s[i], u_I0xs[i], u_I0ys[i]);
+            u_Ux[i].create(cur_rows, cur_cols, CV_32FC1);
+            u_Uy[i].create(cur_rows, cur_cols, CV_32FC1);
+            variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
+            variational_refinement_processors[i]->setDelta(variational_refinement_delta);
+            variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
+            variational_refinement_processors[i]->setSorIterations(5);
+            variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);
+
+            if (use_flow)
+            {
+                resize(flow_uv[0], u_initial_Ux[i], Size(cur_cols, cur_rows));
+                divide(u_initial_Ux[i], static_cast<float>(fraction), u_initial_Ux[i]);
+                resize(flow_uv[1], u_initial_Uy[i], Size(cur_cols, cur_rows));
+                divide(u_initial_Uy[i], static_cast<float>(fraction), u_initial_Uy[i]);
+            }
+        }
+
+        fraction *= 2;
+    }
+}
+
+bool DISOpticalFlowImpl::ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
+                                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y)
+{
+    size_t globalSizeX[] = {(size_t)h};
+    size_t localSizeX[]  = {16};
+
+    ocl::Kernel kernelX("dis_precomputeStructureTensor_hor", ocl::optflow::dis_flow_oclsrc);
+    kernelX.args(ocl::KernelArg::PtrReadOnly(I0x),
+                 ocl::KernelArg::PtrReadOnly(I0y),
+                 (int)patch_size, (int)patch_stride,
+                 (int)w, (int)h, (int)ws,
+                 ocl::KernelArg::PtrWriteOnly(u_I0xx_buf_aux),
+                 ocl::KernelArg::PtrWriteOnly(u_I0yy_buf_aux),
+                 ocl::KernelArg::PtrWriteOnly(u_I0xy_buf_aux),
+                 ocl::KernelArg::PtrWriteOnly(u_I0x_buf_aux),
+                 ocl::KernelArg::PtrWriteOnly(u_I0y_buf_aux));
+    if (!kernelX.run(1, globalSizeX, localSizeX, false))
+        return false;
+
+    size_t globalSizeY[] = {(size_t)ws};
+    size_t localSizeY[]  = {16};
+
+    ocl::Kernel kernelY("dis_precomputeStructureTensor_ver", ocl::optflow::dis_flow_oclsrc);
+    kernelY.args(ocl::KernelArg::PtrReadOnly(u_I0xx_buf_aux),
+                 ocl::KernelArg::PtrReadOnly(u_I0yy_buf_aux),
+                 ocl::KernelArg::PtrReadOnly(u_I0xy_buf_aux),
+                 ocl::KernelArg::PtrReadOnly(u_I0x_buf_aux),
+                 ocl::KernelArg::PtrReadOnly(u_I0y_buf_aux),
+                 (int)patch_size, (int)patch_stride,
+                 (int)w, (int)h, (int)ws,
+                 ocl::KernelArg::PtrWriteOnly(dst_I0xx),
+                 ocl::KernelArg::PtrWriteOnly(dst_I0yy),
+                 ocl::KernelArg::PtrWriteOnly(dst_I0xy),
+                 ocl::KernelArg::PtrWriteOnly(dst_I0x),
+                 ocl::KernelArg::PtrWriteOnly(dst_I0y));
+    return kernelY.run(1, globalSizeY, localSizeY, false);
+}
+
+bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow)
+{
+    UMat I0Mat = I0.getUMat();
+    UMat I1Mat = I1.getUMat();
+    bool use_input_flow = false;
+    if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
+        use_input_flow = true;
+    else
+        flow.create(I1Mat.size(), CV_32FC2);
+    UMat &u_flowMat = flow.getUMatRef();
+    coarsest_scale = (int)(log((2 * I0Mat.cols) / (4.0 * patch_size)) / log(2.0) + 0.5) - 1;
+
+    ocl_prepareBuffers(I0Mat, I1Mat, u_flowMat, use_input_flow);
+    u_Ux[coarsest_scale].setTo(0.0f);
+    u_Uy[coarsest_scale].setTo(0.0f);
+
+    for (int i = coarsest_scale; i >= finest_scale; i--)
+    {
+        w = u_I0s[i].cols;
+        h = u_I0s[i].rows;
+        ws = 1 + (w - patch_size) / patch_stride;
+        hs = 1 + (h - patch_size) / patch_stride;
+
+        if (!ocl_precomputeStructureTensor(u_I0xx_buf, u_I0yy_buf, u_I0xy_buf,
+                                           u_I0x_buf, u_I0y_buf, u_I0xs[i], u_I0ys[i]))
+            return false;
+
+        if (!ocl_PatchInverseSearch(u_Ux[i], u_Uy[i], u_I0s[i], u_I1s_ext[i], u_I0xs[i], u_I0ys[i], 2, i))
+            return false;
+
+        if (!ocl_Densification(u_Ux[i], u_Uy[i], u_Sx, u_Sy, u_I0s[i], u_I1s[i]))
+            return false;
+
+        if (variational_refinement_iter > 0)
+            variational_refinement_processors[i]->calcUV(u_I0s[i], u_I1s[i],
+                                                         u_Ux[i].getMat(ACCESS_WRITE), u_Uy[i].getMat(ACCESS_WRITE));
+
+        if (i > finest_scale)
+        {
+            resize(u_Ux[i], u_Ux[i - 1], u_Ux[i - 1].size());
+            resize(u_Uy[i], u_Uy[i - 1], u_Uy[i - 1].size());
+            multiply(u_Ux[i - 1], 2, u_Ux[i - 1]);
+            multiply(u_Uy[i - 1], 2, u_Uy[i - 1]);
+        }
+    }
+    vector<UMat> uxy(2);
+    uxy[0] = u_Ux[finest_scale];
+    uxy[1] = u_Uy[finest_scale];
+    merge(uxy, u_U);
+    resize(u_U, u_flowMat, u_flowMat.size());
+    multiply(u_flowMat, 1 << finest_scale, u_flowMat);
+
+    return true;
+}
+#endif
+
 void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
 {
    CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
@ -1012,6 +1371,10 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo
    CV_Assert(I0.isContinuous());
    CV_Assert(I1.isContinuous());

+    CV_OCL_RUN(ocl::Device::getDefault().isIntel() && flow.isUMat() &&
+               (patch_size == 8) && (use_spatial_propagation == true),
+               ocl_calc(I0, I1, flow));
+
    Mat I0Mat = I0.getMat();
    Mat I1Mat = I1.getMat();
    bool use_input_flow = false;
@ -1019,7 +1382,7 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo
        use_input_flow = true;
    else
        flow.create(I1Mat.size(), CV_32FC2);
-    Mat &flowMat = flow.getMatRef();
+    Mat flowMat = flow.getMat();
    coarsest_scale = (int)(log((2 * I0Mat.cols) / (4.0 * patch_size)) / log(2.0) + 0.5) - 1;
    int num_stripes = getNumThreads();

@ -1088,6 +1451,25 @@ void DISOpticalFlowImpl::collectGarbage()
    I0yy_buf_aux.release();
    I0xy_buf_aux.release();

+#ifdef HAVE_OPENCL
+    u_I0s.clear();
+    u_I1s.clear();
+    u_I1s_ext.clear();
+    u_I0xs.clear();
+    u_I0ys.clear();
+    u_Ux.clear();
+    u_Uy.clear();
+    u_U.release();
+    u_Sx.release();
+    u_Sy.release();
+    u_I0xx_buf.release();
+    u_I0yy_buf.release();
+    u_I0xy_buf.release();
+    u_I0xx_buf_aux.release();
+    u_I0yy_buf_aux.release();
+    u_I0xy_buf_aux.release();
+#endif
+
    for (int i = finest_scale; i <= coarsest_scale; i++)
        variational_refinement_processors[i]->collectGarbage();
    variational_refinement_processors.clear();
--- a/modules/optflow/src/opencl/dis_flow.cl
+++ b/modules/optflow/src/opencl/dis_flow.cl
@ -0,0 +1,522 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#define EPS 0.001f
+#define INF 1E+10F
+
+__kernel void dis_precomputeStructureTensor_hor(__global const short *I0x,
+                                                __global const short *I0y,
+                                                int patch_size, int patch_stride,
+                                                int w, int h, int ws,
+                                                __global float *I0xx_aux_ptr,
+                                                __global float *I0yy_aux_ptr,
+                                                __global float *I0xy_aux_ptr,
+                                                __global float *I0x_aux_ptr,
+                                                __global float *I0y_aux_ptr)
+{
+
+    int i = get_global_id(0);
+
+    if (i >= h) return;
+
+    const __global short *x_row = I0x + i * w;
+    const __global short *y_row = I0y + i * w;
+
+    float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
+    float8 x_vec = convert_float8(vload8(0, x_row));
+    float8 y_vec = convert_float8(vload8(0, y_row));
+    sum_xx = dot(x_vec.lo, x_vec.lo) + dot(x_vec.hi, x_vec.hi);
+    sum_yy = dot(y_vec.lo, y_vec.lo) + dot(y_vec.hi, y_vec.hi);
+    sum_xy = dot(x_vec.lo, y_vec.lo) + dot(x_vec.hi, y_vec.hi);
+    sum_x = dot(x_vec.lo, 1.0f) + dot(x_vec.hi, 1.0f);
+    sum_y = dot(y_vec.lo, 1.0f) + dot(y_vec.hi, 1.0f);
+
+    I0xx_aux_ptr[i * ws] = sum_xx;
+    I0yy_aux_ptr[i * ws] = sum_yy;
+    I0xy_aux_ptr[i * ws] = sum_xy;
+    I0x_aux_ptr[i * ws] = sum_x;
+    I0y_aux_ptr[i * ws] = sum_y;
+
+    int js = 1;
+    for (int j = patch_size; j < w; j++)
+    {
+        short x_val1 = x_row[j];
+        short x_val2 = x_row[j - patch_size];
+        short y_val1 = y_row[j];
+        short y_val2 = y_row[j - patch_size];
+        sum_xx += (x_val1 * x_val1 - x_val2 * x_val2);
+        sum_yy += (y_val1 * y_val1 - y_val2 * y_val2);
+        sum_xy += (x_val1 * y_val1 - x_val2 * y_val2);
+        sum_x += (x_val1 - x_val2);
+        sum_y += (y_val1 - y_val2);
+        if ((j - patch_size + 1) % patch_stride == 0)
+        {
+            int index = i * ws + js;
+            I0xx_aux_ptr[index] = sum_xx;
+            I0yy_aux_ptr[index] = sum_yy;
+            I0xy_aux_ptr[index] = sum_xy;
+            I0x_aux_ptr[index] = sum_x;
+            I0y_aux_ptr[index] = sum_y;
+            js++;
+        }
+    }
+}
+
+__kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_ptr,
+                                                __global const float *I0yy_aux_ptr,
+                                                __global const float *I0xy_aux_ptr,
+                                                __global const float *I0x_aux_ptr,
+                                                __global const float *I0y_aux_ptr,
+                                                int patch_size, int patch_stride,
+                                                int w, int h, int ws,
+                                                __global float *I0xx_ptr,
+                                                __global float *I0yy_ptr,
+                                                __global float *I0xy_ptr,
+                                                __global float *I0x_ptr,
+                                                __global float *I0y_ptr)
+{
+    int j = get_global_id(0);
+
+    if (j >= ws) return;
+
+    float sum_xx, sum_yy, sum_xy, sum_x, sum_y;
+    sum_xx = sum_yy = sum_xy = sum_x = sum_y = 0.0f;
+
+    for (int i = 0; i < patch_size; i++)
+    {
+        sum_xx += I0xx_aux_ptr[i * ws + j];
+        sum_yy += I0yy_aux_ptr[i * ws + j];
+        sum_xy += I0xy_aux_ptr[i * ws + j];
+        sum_x  += I0x_aux_ptr[i * ws + j];
+        sum_y  += I0y_aux_ptr[i * ws + j];
+    }
+    I0xx_ptr[j] = sum_xx;
+    I0yy_ptr[j] = sum_yy;
+    I0xy_ptr[j] = sum_xy;
+    I0x_ptr[j] = sum_x;
+    I0y_ptr[j] = sum_y;
+
+    int is = 1;
+    for (int i = patch_size; i < h; i++)
+    {
+        sum_xx += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - patch_size) * ws + j]);
+        sum_yy += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - patch_size) * ws + j]);
+        sum_xy += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - patch_size) * ws + j]);
+        sum_x  += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - patch_size) * ws + j]);
+        sum_y  += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - patch_size) * ws + j]);
+
+        if ((i - patch_size + 1) % patch_stride == 0)
+        {
+            I0xx_ptr[is * ws + j] = sum_xx;
+            I0yy_ptr[is * ws + j] = sum_yy;
+            I0xy_ptr[is * ws + j] = sum_xy;
+            I0x_ptr[is * ws + j] = sum_x;
+            I0y_ptr[is * ws + j] = sum_y;
+            is++;
+        }
+    }
+}
+
+__kernel void dis_densification(__global const float *sx, __global const float *sy,
+                                __global const uchar *i0, __global const uchar *i1,
+                                int psz, int pstr,
+                                int w, int h, int ws,
+                                __global float *ux, __global float *uy)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int i, j;
+
+    if (x >= w || y >= h) return;
+
+    int start_is, end_is;
+    int start_js, end_js;
+
+    end_is = min(y / pstr, (h - psz) / pstr);
+    start_is = max(0, y - psz + pstr) / pstr;
+    start_is = min(start_is, end_is);
+
+    end_js = min(x / pstr, (w - psz) / pstr);
+    start_js = max(0, x - psz + pstr) / pstr;
+    start_js = min(start_js, end_js);
+
+    float coef, sum_coef = 0.0f;
+    float sum_Ux = 0.0f;
+    float sum_Uy = 0.0f;
+
+    int i_l, i_u;
+    int j_l, j_u;
+    float i_m, j_m, diff;
+
+    i = y;
+    j = x;
+
+    /* Iterate through all the patches that overlap the current location (i,j) */
+    for (int is = start_is; is <= end_is; is++)
+        for (int js = start_js; js <= end_js; js++)
+        {
+            float sx_val = sx[is * ws + js];
+            float sy_val = sy[is * ws + js];
+            uchar2 i1_vec1, i1_vec2;
+
+            j_m = min(max(j + sx_val, 0.0f), w - 1.0f - EPS);
+            i_m = min(max(i + sy_val, 0.0f), h - 1.0f - EPS);
+            j_l = (int)j_m;
+            j_u = j_l + 1;
+            i_l = (int)i_m;
+            i_u = i_l + 1;
+            i1_vec1 = vload2(0, i1 + i_u * w + j_l);
+            i1_vec2 = vload2(0, i1 + i_l * w + j_l);
+            diff = (j_m - j_l) * (i_m - i_l) * i1_vec1.y +
+                   (j_u - j_m) * (i_m - i_l) * i1_vec1.x +
+                   (j_m - j_l) * (i_u - i_m) * i1_vec2.y +
+                   (j_u - j_m) * (i_u - i_m) * i1_vec2.x - i0[i * w + j];
+            coef = 1 / max(1.0f, fabs(diff));
+            sum_Ux += coef * sx_val;
+            sum_Uy += coef * sy_val;
+            sum_coef += coef;
+        }
+
+    ux[i * w + j] = sum_Ux / sum_coef;
+    uy[i * w + j] = sum_Uy / sum_coef;
+}
+
+#define INIT_BILINEAR_WEIGHTS(Ux, Uy)                                                                                  \
+    i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit);                                                       \
+    j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit);                                                       \
+                                                                                                                       \
+    w11 = (i_I1 - floor(i_I1)) * (j_I1 - floor(j_I1));                                                                 \
+    w10 = (i_I1 - floor(i_I1)) * (floor(j_I1) + 1 - j_I1);                                                             \
+    w01 = (floor(i_I1) + 1 - i_I1) * (j_I1 - floor(j_I1));                                                             \
+    w00 = (floor(i_I1) + 1 - i_I1) * (floor(j_I1) + 1 - j_I1);
+
+float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
+                         int I0_stride, int I1_stride,
+                         float w00, float w01, float w10, float w11, int patch_sz, int i)
+{
+    float sum_diff = 0.0f, sum_diff_sq = 0.0f;
+    int n = patch_sz * patch_sz;
+
+    uchar8 I1_vec1, I1_vec2, I0_vec;
+    uchar I1_val1, I1_val2;
+
+    I0_vec  = vload8(0, I0_ptr + i * I0_stride);
+    I1_vec1 = vload8(0, I1_ptr + i * I1_stride);
+    I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride);
+    I1_val1 = I1_ptr[i * I1_stride + 8];
+    I1_val2 = I1_ptr[(i + 1) * I1_stride + 8];
+
+    float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) +
+                 w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) -
+                 convert_float8(I0_vec);
+
+    sum_diff = (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
+    sum_diff_sq = (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));
+
+    sum_diff = sub_group_reduce_add(sum_diff);
+    sum_diff_sq = sub_group_reduce_add(sum_diff_sq);
+
+    return sum_diff_sq - sum_diff * sum_diff / n;
+}
+
+__kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __global const float *Uy_ptr,
+                                             __global const uchar *I0_ptr, __global const uchar *I1_ptr,
+                                             int border_size, int patch_size, int patch_stride,
+                                             int w, int h, int ws, int hs, int pyr_level,
+                                             __global float *Sx_ptr, __global float *Sy_ptr)
+{
+    int id = get_global_id(0);
+    int is = id / 8;
+    if (id >= (hs * 8)) return;
+
+    int i = is * patch_stride;
+    int j = 0;
+    int psz = patch_size;
+    int psz2 = psz / 2;
+    int w_ext = w + 2 * border_size;
+    int bsz = border_size;
+
+    float i_lower_limit = bsz - psz + 1.0f;
+    float i_upper_limit = bsz + h - 1.0f;
+    float j_lower_limit = bsz - psz + 1.0f;
+    float j_upper_limit = bsz + w - 1.0f;
+    float i_I1, j_I1, w00, w01, w10, w11;
+
+    float prev_Ux = Ux_ptr[(i + psz2) * w + j + psz2];
+    float prev_Uy = Uy_ptr[(i + psz2) * w + j + psz2];
+    Sx_ptr[is * ws] = prev_Ux;
+    Sy_ptr[is * ws] = prev_Uy;
+    j += patch_stride;
+
+    int sid = get_sub_group_local_id();
+    for (int js = 1; js < ws; js++, j += patch_stride)
+    {
+        float min_SSD, cur_SSD;
+        float Ux = Ux_ptr[(i + psz2) * w + j + psz2];
+        float Uy = Uy_ptr[(i + psz2) * w + j + psz2];
+
+        INIT_BILINEAR_WEIGHTS(Ux, Uy);
+        min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
+                                     w, w_ext, w00, w01, w10, w11, psz, sid);
+
+        INIT_BILINEAR_WEIGHTS(prev_Ux, prev_Uy);
+        cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
+                                     w, w_ext, w00, w01, w10, w11, psz, sid);
+        if (cur_SSD < min_SSD)
+        {
+            Ux = prev_Ux;
+            Uy = prev_Uy;
+        }
+
+        prev_Ux = Ux;
+        prev_Uy = Uy;
+        Sx_ptr[is * ws + js] = Ux;
+        Sy_ptr[is * ws + js] = Uy;
+    }
+}
+
+float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
+                            const __global short *I0x_ptr, const __global short *I0y_ptr,
+                            int I0_stride, int I1_stride, float w00, float w01, float w10,
+                            float w11, int patch_sz, float x_grad_sum, float y_grad_sum)
+{
+    float sum_diff = 0.0, sum_diff_sq = 0.0;
+    float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;
+    int n = patch_sz * patch_sz;
+    uchar8 I1_vec1, I1_vec2;
+    uchar I1_val1, I1_val2;
+
+    for (int i = 0; i < 8; i++)
+    {
+        uchar8 I0_vec = vload8(0, I0_ptr + i * I0_stride);
+
+        I1_vec1 = (i == 0) ? vload8(0, I1_ptr + i * I1_stride) : I1_vec2;
+        I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride);
+        I1_val1 = (i == 0) ? I1_ptr[i * I1_stride + patch_sz] : I1_val2;
+        I1_val2 = I1_ptr[(i + 1) * I1_stride + patch_sz];
+
+        float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) +
+                     w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) -
+                     convert_float8(I0_vec);
+
+        sum_diff += (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
+        sum_diff_sq += (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));
+
+        short8 I0x_vec = vload8(0, I0x_ptr + i * I0_stride);
+        short8 I0y_vec = vload8(0, I0y_ptr + i * I0_stride);
+
+        sum_I0x_mul += dot(vec.lo, convert_float4(I0x_vec.lo));
+        sum_I0x_mul += dot(vec.hi, convert_float4(I0x_vec.hi));
+        sum_I0y_mul += dot(vec.lo, convert_float4(I0y_vec.lo));
+        sum_I0y_mul += dot(vec.hi, convert_float4(I0y_vec.hi));
+    }
+
+    float dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
+    float dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
+    float SSD = sum_diff_sq - sum_diff * sum_diff / n;
+
+    return (float3)(SSD, dst_dUx, dst_dUy);
+}
+
+__kernel void dis_patch_inverse_search_fwd_2(__global const float *Ux_ptr, __global const float *Uy_ptr,
+                                             __global const uchar *I0_ptr, __global const uchar *I1_ptr,
+                                             __global const short *I0x_ptr, __global const short *I0y_ptr,
+                                             __global const float *xx_ptr, __global const float *yy_ptr,
+                                             __global const float *xy_ptr,
+                                             __global const float *x_ptr, __global const float *y_ptr,
+                                             int border_size, int patch_size, int patch_stride,
+                                             int w, int h, int ws, int hs, int num_inner_iter, int pyr_level,
+                                             __global float *Sx_ptr, __global float *Sy_ptr)
+{
+    int js = get_global_id(0);
+    int is = get_global_id(1);
+    int i = is * patch_stride;
+    int j = js * patch_stride;
+    int psz = patch_size;
+    int psz2 = psz / 2;
+    int w_ext = w + 2 * border_size;
+    int bsz = border_size;
+    int index = is * ws + js;
+
+    if (js >= ws || is >= hs) return;
+
+    float Ux = Sx_ptr[index];
+    float Uy = Sy_ptr[index];
+    float cur_Ux = Ux;
+    float cur_Uy = Uy;
+    float cur_xx = xx_ptr[index];
+    float cur_yy = yy_ptr[index];
+    float cur_xy = xy_ptr[index];
+    float detH = cur_xx * cur_yy - cur_xy * cur_xy;
+
+    if (fabs(detH) < EPS) detH = EPS;
+
+    float invH11 = cur_yy / detH;
+    float invH12 = -cur_xy / detH;
+    float invH22 = cur_xx / detH;
+    float prev_SSD = INF, SSD;
+    float x_grad_sum = x_ptr[index];
+    float y_grad_sum = y_ptr[index];
+
+    float i_lower_limit = bsz - psz + 1.0f;
+    float i_upper_limit = bsz + h - 1.0f;
+    float j_lower_limit = bsz - psz + 1.0f;
+    float j_upper_limit = bsz + w - 1.0f;
+    float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
+    float3 res;
+
+    for (int t = 0; t < num_inner_iter; t++)
+    {
+        INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
+        res = processPatchMeanNorm(I0_ptr + i * w + j,
+                                   I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * w + j,
+                                   I0y_ptr + i * w + j, w, w_ext, w00, w01, w10, w11, psz,
+                                   x_grad_sum, y_grad_sum);
+
+        SSD = res.x;
+        dUx = res.y;
+        dUy = res.z;
+        dx = invH11 * dUx + invH12 * dUy;
+        dy = invH12 * dUx + invH22 * dUy;
+
+        cur_Ux -= dx;
+        cur_Uy -= dy;
+
+        if (SSD >= prev_SSD)
+            break;
+        prev_SSD = SSD;
+    }
+
+    float2 vec = (float2)(cur_Ux - Ux, cur_Uy - Uy);
+    if (dot(vec, vec) <= (float)(psz * psz))
+    {
+        Sx_ptr[index] = cur_Ux;
+        Sy_ptr[index] = cur_Uy;
+    }
+}
+
+__kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
+                                             int border_size, int patch_size, int patch_stride,
+                                             int w, int h, int ws, int hs, int pyr_level,
+                                             __global float *Sx_ptr, __global float *Sy_ptr)
+{
+    int id = get_global_id(0);
+    int is = id / 8;
+    if (id >= (hs * 8)) return;
+
+    is = (hs - 1 - is);
+    int i = is * patch_stride;
+    int j = (ws - 2) * patch_stride;
+    int psz = patch_size;
+    int psz2 = psz / 2;
+    int w_ext = w + 2 * border_size;
+    int bsz = border_size;
+
+    float i_lower_limit = bsz - psz + 1.0f;
+    float i_upper_limit = bsz + h - 1.0f;
+    float j_lower_limit = bsz - psz + 1.0f;
+    float j_upper_limit = bsz + w - 1.0f;
+    float i_I1, j_I1, w00, w01, w10, w11;
+
+    int sid = get_sub_group_local_id();
+    for (int js = (ws - 2); js > -1; js--, j -= patch_stride)
+    {
+        float min_SSD, cur_SSD;
+        float2 Ux = vload2(0, Sx_ptr + is * ws + js);
+        float2 Uy = vload2(0, Sy_ptr + is * ws + js);
+
+        INIT_BILINEAR_WEIGHTS(Ux.x, Uy.x);
+        min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
+                                     w, w_ext, w00, w01, w10, w11, psz, sid);
+
+        INIT_BILINEAR_WEIGHTS(Ux.y, Uy.y);
+        cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
+                                     w, w_ext, w00, w01, w10, w11, psz, sid);
+        if (cur_SSD < min_SSD)
+        {
+            Sx_ptr[is * ws + js] = Ux.y;
+            Sy_ptr[is * ws + js] = Uy.y;
+        }
+    }
+}
+
+__kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
+                                             __global const short *I0x_ptr, __global const short *I0y_ptr,
+                                             __global const float *xx_ptr, __global const float *yy_ptr,
+                                             __global const float *xy_ptr,
+                                             __global const float *x_ptr, __global const float *y_ptr,
+                                             int border_size, int patch_size, int patch_stride,
+                                             int w, int h, int ws, int hs, int num_inner_iter,
+                                             __global float *Sx_ptr, __global float *Sy_ptr)
+{
+    int js = get_global_id(0);
+    int is = get_global_id(1);
+    if (js >= ws || is >= hs) return;
+
+    js = (ws - 1 - js);
+    is = (hs - 1 - is);
+
+    int j = js * patch_stride;
+    int i = is * patch_stride;
+    int psz = patch_size;
+    int psz2 = psz / 2;
+    int w_ext = w + 2 * border_size;
+    int bsz = border_size;
+    int index = is * ws + js;
+
+    float Ux = Sx_ptr[index];
+    float Uy = Sy_ptr[index];
+    float cur_Ux = Ux;
+    float cur_Uy = Uy;
+    float cur_xx = xx_ptr[index];
+    float cur_yy = yy_ptr[index];
+    float cur_xy = xy_ptr[index];
+    float detH = cur_xx * cur_yy - cur_xy * cur_xy;
+
+    if (fabs(detH) < EPS) detH = EPS;
+
+    float invH11 = cur_yy / detH;
+    float invH12 = -cur_xy / detH;
+    float invH22 = cur_xx / detH;
+    float prev_SSD = INF, SSD;
+    float x_grad_sum = x_ptr[index];
+    float y_grad_sum = y_ptr[index];
+
+    float i_lower_limit = bsz - psz + 1.0f;
+    float i_upper_limit = bsz + h - 1.0f;
+    float j_lower_limit = bsz - psz + 1.0f;
+    float j_upper_limit = bsz + w - 1.0f;
+    float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
+    float3 res;
+
+    for (int t = 0; t < num_inner_iter; t++)
+    {
+        INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
+        res = processPatchMeanNorm(I0_ptr + i * w + j,
+                                   I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * w + j,
+                                   I0y_ptr + i * w + j, w, w_ext, w00, w01, w10, w11, psz,
+                                   x_grad_sum, y_grad_sum);
+
+        SSD = res.x;
+        dUx = res.y;
+        dUy = res.z;
+        dx = invH11 * dUx + invH12 * dUy;
+        dy = invH12 * dUx + invH22 * dUy;
+
+        cur_Ux -= dx;
+        cur_Uy -= dy;
+
+        if (SSD >= prev_SSD)
+            break;
+        prev_SSD = SSD;
+    }
+
+    float2 vec = (float2)(cur_Ux - Ux, cur_Uy - Uy);
+    if ((dot(vec, vec)) <= (float)(psz * psz))
+    {
+        Sx_ptr[index] = cur_Ux;
+        Sy_ptr[index] = cur_Uy;
+    }
+}
+
--- a/modules/optflow/src/precomp.hpp
+++ b/modules/optflow/src/precomp.hpp
@ -45,6 +45,8 @@ the use of this software, even if advised of the possibility of such damage.
 #include <opencv2/optflow.hpp>
 #include <opencv2/video.hpp>
 #include <opencv2/imgproc.hpp>
+#include "opencv2/core/utility.hpp"
+#include "opencv2/core/private.hpp"
 #include "opencv2/core/ocl.hpp"

 #include <algorithm>
--- a/modules/optflow/test/ocl/test_dis.cpp
+++ b/modules/optflow/test/ocl/test_dis.cpp
@ -0,0 +1,100 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+using namespace cv;
+using namespace optflow;
+
+namespace cvtest {
+namespace ocl {
+
+PARAM_TEST_CASE(OCL_DenseOpticalFlow_DIS, int)
+{
+    int preset;
+
+    virtual void SetUp()
+    {
+        preset = GET_PARAM(0);
+    }
+};
+
+OCL_TEST_P(OCL_DenseOpticalFlow_DIS, Mat)
+{
+    Mat frame1, frame2, GT;
+
+    frame1 = imread(TS::ptr()->get_data_path() + "optflow/RubberWhale1.png");
+    frame2 = imread(TS::ptr()->get_data_path() + "optflow/RubberWhale2.png");
+
+    CV_Assert(!frame1.empty() && !frame2.empty());
+
+    cvtColor(frame1, frame1, COLOR_BGR2GRAY);
+    cvtColor(frame2, frame2, COLOR_BGR2GRAY);
+
+    Ptr<DenseOpticalFlow> algo;
+
+    // iterate over presets:
+    for (int i = 0; i < test_loop_times; i++)
+    {
+        Mat flow;
+        UMat ocl_flow;
+
+        algo = createOptFlow_DIS(preset);
+        OCL_OFF(algo->calc(frame1, frame2, flow));
+        OCL_ON(algo->calc(frame1, frame2, ocl_flow));
+        ASSERT_EQ(flow.rows, ocl_flow.rows);
+        ASSERT_EQ(flow.cols, ocl_flow.cols);
+
+        EXPECT_MAT_SIMILAR(flow, ocl_flow, 6e-3);
+    }
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(Contrib, OCL_DenseOpticalFlow_DIS,
+                            Values(DISOpticalFlow::PRESET_ULTRAFAST,
+                                   DISOpticalFlow::PRESET_FAST,
+                                   DISOpticalFlow::PRESET_MEDIUM));
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL