From ac62d70f9765facb5cd114b77909f57f24dc54c3 Mon Sep 17 00:00:00 2001
From: Vladislav Samsonov <vvladxx@gmail.com>
Date: Mon, 17 Oct 2016 18:15:22 +0300
Subject: [PATCH] [GSoC] Implementation of the Global Patch Collider and demo
 for PCAFlow (#752)

* Minor fixes

* Start adding correspondence finding

* Added finding of correspondences using GPC

* New evaluation tool for GPC

* Changed default parameters

* Display ground truth in the evaluation tool

* Added training tool for MPI Sintel dataset

* Added the training tool for Middlebury dataset

* Added some OpenCL optimization

* Added explanatory notes

* Minor improvements: time measurements + little ocl optimization

* Added demos

* Fixed warnings

* Make parameter struct assignable

* Fix warning

* Proper command line argument usage

* Prettified training tool, added parameters

* Fixed VS warning

* Fixed VS warning

* Use compressed forest.yml.gz files by default to save space

* Added OpenCL flag to the evaluation tool

* Updated documentation

* Major speed and memory improvements:

1) Added new (optional) type of patch descriptors which are much faster.
Retraining with option --descriptor-type=1 is required.
2) Got rid of hash table for descriptors, less memory usage.

* Fixed various floating point errors related to precision.
SIMD for dot product, forest traversing is a little bit faster now.

* Tolerant floating point comparison

* Triplets

* Added comment

* Choosing negative sample among nearest neighbors

* Fix warning

* Usage of parallel_for_() in critical places. Performance improvements.

* Simulated annealing heuristic

* Moved OpenCL kernel to separate file

* Moved implementation to source file

* Added basic accuracy tests for GPC and PCAFlow

* Fixing warnings

* Test accuracy constraints were too strict

* Test accuracy constraints were too strict

* Make tests more lightweight
---
 modules/optflow/doc/optflow.bib               |  16 +
 modules/optflow/include/opencv2/optflow.hpp   |   6 +-
 .../include/opencv2/optflow/pcaflow.hpp       |  51 +-
 .../opencv2/optflow/sparse_matching_gpc.hpp   | 250 +++++++-
 modules/optflow/samples/gpc_evaluate.cpp      | 164 +++++
 modules/optflow/samples/gpc_train.cpp         |  65 +-
 .../optflow/samples/gpc_train_middlebury.py   |  58 ++
 modules/optflow/samples/gpc_train_sintel.py   |  60 ++
 modules/optflow/samples/pcaflow_demo.cpp      | 172 +++++
 .../optflow/src/opencl/sparse_matching_gpc.cl |  69 ++
 modules/optflow/src/sparse_matching_gpc.cpp   | 603 +++++++++++++++---
 modules/optflow/test/test_OF_accuracy.cpp     | 101 ++-
 12 files changed, 1468 insertions(+), 147 deletions(-)
 create mode 100644 modules/optflow/samples/gpc_evaluate.cpp
 create mode 100644 modules/optflow/samples/gpc_train_middlebury.py
 create mode 100644 modules/optflow/samples/gpc_train_sintel.py
 create mode 100644 modules/optflow/samples/pcaflow_demo.cpp
 create mode 100644 modules/optflow/src/opencl/sparse_matching_gpc.cl

diff --git a/modules/optflow/doc/optflow.bib b/modules/optflow/doc/optflow.bib
index a7b218698..227c7bc61 100644
--- a/modules/optflow/doc/optflow.bib
+++ b/modules/optflow/doc/optflow.bib
@@ -52,3 +52,19 @@
   pages={25--36},
   year={2004}
 }
+
+@inproceedings{Wulff:CVPR:2015,
+  title = {Efficient Sparse-to-Dense Optical Flow Estimation using a Learned Basis and Layers},
+  author = {Wulff, Jonas and Black, Michael J.},
+  booktitle = { IEEE Conf. 
on Computer Vision and Pattern Recognition (CVPR) 2015}, + month = {June}, + year = {2015} +} + +@inproceedings{Wang_2016_CVPR, + author = {Wang, Shenlong and Ryan Fanello, Sean and Rhemann, Christoph and Izadi, Shahram and Kohli, Pushmeet}, + title = {The Global Patch Collider}, + booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2016} +} diff --git a/modules/optflow/include/opencv2/optflow.hpp b/modules/optflow/include/opencv2/optflow.hpp index ccd7f8865..e68d5b78d 100644 --- a/modules/optflow/include/opencv2/optflow.hpp +++ b/modules/optflow/include/opencv2/optflow.hpp @@ -43,9 +43,6 @@ the use of this software, even if advised of the possibility of such damage. #include "opencv2/core.hpp" #include "opencv2/video.hpp" -#include "opencv2/optflow/pcaflow.hpp" -#include "opencv2/optflow/sparse_matching_gpc.hpp" - /** @defgroup optflow Optical Flow Algorithms @@ -69,6 +66,9 @@ Functions reading and writing .flo files in "Middlebury" format, see: + * @brief Implementation of the PCAFlow algorithm from the following paper: + * http://files.is.tue.mpg.de/black/papers/cvpr2015_pcaflow.pdf + * + * @cite Wulff:CVPR:2015 + * + * There are some key differences which distinguish this algorithm from the original PCAFlow (see paper): + * - Discrete Cosine Transform basis is used instead of basis extracted with PCA. + * Reasoning: DCT basis has comparable performance and it doesn't require additional storage space. + * Also, this decision helps to avoid overloading the algorithm with a lot of external input. + * - Usage of built-in OpenCV feature tracking instead of libviso. */ #ifndef __OPENCV_OPTFLOW_PCAFLOW_HPP__ @@ -67,7 +63,10 @@ namespace cv namespace optflow { -/* +//! @addtogroup optflow +//! @{ + +/** @brief * This class can be used for imposing a learned prior on the resulting optical flow. * Solution will be regularized according to this prior. * You need to generate appropriate prior file with "learn_prior.py" script beforehand. @@ -90,6 +89,8 @@ public: void fillConstraints( float *A1, float *A2, float *b1, float *b2 ) const; }; +/** @brief PCAFlow algorithm. + */ class CV_EXPORTS_W OpticalFlowPCAFlow : public DenseOpticalFlow { protected: @@ -103,6 +104,15 @@ protected: bool useOpenCL; public: + /** @brief Creates an instance of PCAFlow algorithm. + * @param _prior Learned prior or no prior (default). @see cv::optflow::PCAPrior + * @param _basisSize Number of basis vectors. + * @param _sparseRate Controls density of sparse matches. + * @param _retainedCornersFraction Retained corners fraction. + * @param _occlusionsThreshold Occlusion threshold. + * @param _dampingFactor Regularization term for solving least-squares. It is not related to the prior regularization. + * @param _claheClip Clip parameter for CLAHE. + */ OpticalFlowPCAFlow( Ptr _prior = Ptr(), const Size _basisSize = Size( 18, 14 ), float _sparseRate = 0.024, float _retainedCornersFraction = 0.2, float _occlusionsThreshold = 0.0003, float _dampingFactor = 0.00002, float _claheClip = 14 ); @@ -127,7 +137,12 @@ private: OpticalFlowPCAFlow& operator=( const OpticalFlowPCAFlow& ); // make it non-assignable }; +/** @brief Creates an instance of PCAFlow +*/ CV_EXPORTS_W Ptr createOptFlow_PCAFlow(); + +//! 
@} + } } diff --git a/modules/optflow/include/opencv2/optflow/sparse_matching_gpc.hpp b/modules/optflow/include/opencv2/optflow/sparse_matching_gpc.hpp index 2c0aee46b..312771030 100644 --- a/modules/optflow/include/opencv2/optflow/sparse_matching_gpc.hpp +++ b/modules/optflow/include/opencv2/optflow/sparse_matching_gpc.hpp @@ -37,68 +37,135 @@ or tort (including negligence or otherwise) arising in any way out of the use of this software, even if advised of the possibility of such damage. */ -/* -Implementation of the Global Patch Collider algorithm from the following paper: -http://research.microsoft.com/en-us/um/people/pkohli/papers/wfrik_cvpr2016.pdf - -@InProceedings{Wang_2016_CVPR, - author = {Wang, Shenlong and Ryan Fanello, Sean and Rhemann, Christoph and Izadi, Shahram and Kohli, Pushmeet}, - title = {The Global Patch Collider}, - booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, - month = {June}, - year = {2016} -} -*/ +/** + * @file sparse_matching_gpc.hpp + * @author Vladislav Samsonov + * @brief Implementation of the Global Patch Collider. + * + * Implementation of the Global Patch Collider algorithm from the following paper: + * http://research.microsoft.com/en-us/um/people/pkohli/papers/wfrik_cvpr2016.pdf + * + * @cite Wang_2016_CVPR + */ #ifndef __OPENCV_OPTFLOW_SPARSE_MATCHING_GPC_HPP__ #define __OPENCV_OPTFLOW_SPARSE_MATCHING_GPC_HPP__ #include "opencv2/core.hpp" +#include "opencv2/core/hal/intrin.hpp" +#include "opencv2/imgproc.hpp" namespace cv { namespace optflow { +//! @addtogroup optflow +//! @{ + struct CV_EXPORTS_W GPCPatchDescriptor { - static const unsigned nFeatures = 18; // number of features in a patch descriptor + static const unsigned nFeatures = 18; //!< number of features in a patch descriptor Vec< double, nFeatures > feature; - GPCPatchDescriptor( const Mat *imgCh, int i, int j ); + double dot( const Vec< double, nFeatures > &coef ) const; + + void markAsSeparated() { feature[0] = std::numeric_limits< double >::quiet_NaN(); } + + bool isSeparated() const { return cvIsNaN( feature[0] ) != 0; } +}; + +struct CV_EXPORTS_W GPCPatchSample +{ + GPCPatchDescriptor ref; + GPCPatchDescriptor pos; + GPCPatchDescriptor neg; + + void getDirections( bool &refdir, bool &posdir, bool &negdir, const Vec< double, GPCPatchDescriptor::nFeatures > &coef, double rhs ) const; }; -typedef std::pair< GPCPatchDescriptor, GPCPatchDescriptor > GPCPatchSample; typedef std::vector< GPCPatchSample > GPCSamplesVector; +/** @brief Descriptor types for the Global Patch Collider. + */ +enum GPCDescType +{ + GPC_DESCRIPTOR_DCT = 0, //!< Better quality but slow + GPC_DESCRIPTOR_WHT //!< Worse quality but much faster +}; + /** @brief Class encapsulating training samples. */ class CV_EXPORTS_W GPCTrainingSamples { private: GPCSamplesVector samples; + int descriptorType; public: /** @brief This function can be used to extract samples from a pair of images and a ground truth flow. * Sizes of all the provided vectors must be equal. 
*/
  static Ptr< GPCTrainingSamples > create( const std::vector< String > &imagesFrom, const std::vector< String > &imagesTo,
-                                           const std::vector< String > &gt );
+                                           const std::vector< String > &gt, int descriptorType );
+
+  static Ptr< GPCTrainingSamples > create( InputArrayOfArrays imagesFrom, InputArrayOfArrays imagesTo, InputArrayOfArrays gt,
+                                           int descriptorType );

   size_t size() const { return samples.size(); }

-  operator GPCSamplesVector() const { return samples; }
+  int type() const { return descriptorType; }

   operator GPCSamplesVector &() { return samples; }
 };

+/** @brief Class encapsulating training parameters.
+ */
+struct GPCTrainingParams
+{
+  unsigned maxTreeDepth;  //!< Maximum tree depth to stop partitioning.
+  int minNumberOfSamples; //!< Minimum number of samples in the node to stop partitioning.
+  int descriptorType;     //!< Type of descriptors to use.
+  bool printProgress;     //!< Print progress to stdout.
+
+  GPCTrainingParams( unsigned _maxTreeDepth = 20, int _minNumberOfSamples = 3, GPCDescType _descriptorType = GPC_DESCRIPTOR_DCT,
+                     bool _printProgress = true )
+      : maxTreeDepth( _maxTreeDepth ), minNumberOfSamples( _minNumberOfSamples ), descriptorType( _descriptorType ),
+        printProgress( _printProgress )
+  {
+    CV_Assert( check() );
+  }
+
+  GPCTrainingParams( const GPCTrainingParams &params )
+      : maxTreeDepth( params.maxTreeDepth ), minNumberOfSamples( params.minNumberOfSamples ), descriptorType( params.descriptorType ),
+        printProgress( params.printProgress )
+  {
+    CV_Assert( check() );
+  }
+
+  bool check() const { return maxTreeDepth > 1 && minNumberOfSamples > 1; }
+};
+
+/** @brief Class encapsulating matching parameters.
+ */
+struct GPCMatchingParams
+{
+  bool useOpenCL; //!< Whether to use OpenCL to speed up the matching.
+
+  GPCMatchingParams( bool _useOpenCL = false ) : useOpenCL( _useOpenCL ) {}
+
+  GPCMatchingParams( const GPCMatchingParams &params ) : useOpenCL( params.useOpenCL ) {}
+};
+
+/** @brief Class for individual tree.
+ */
 class CV_EXPORTS_W GPCTree : public Algorithm
 {
 public:
   struct Node
   {
-    Vec< double, GPCPatchDescriptor::nFeatures > coef; // hyperplane coefficients
-    double rhs;
+    Vec< double, GPCPatchDescriptor::nFeatures > coef; //!< Hyperplane coefficients
+    double rhs;                                        //!< Bias term of the hyperplane

     unsigned left;
     unsigned right;

@@ -109,45 +176,100 @@ private:
   typedef GPCSamplesVector::iterator SIter;

   std::vector< Node > nodes;
+  GPCTrainingParams params;

   bool trainNode( size_t nodeId, SIter begin, SIter end, unsigned depth );

 public:
-  void train( GPCSamplesVector &samples );
+  void train( GPCTrainingSamples &samples, const GPCTrainingParams params = GPCTrainingParams() );

   void write( FileStorage &fs ) const;

   void read( const FileNode &fn );

+  unsigned findLeafForPatch( const GPCPatchDescriptor &descr ) const;
+
   static Ptr< GPCTree > create() { return makePtr< GPCTree >(); }

   bool operator==( const GPCTree &t ) const { return nodes == t.nodes; }
+
+  int getDescriptorType() const { return params.descriptorType; }
 };

 template < int T > class CV_EXPORTS_W GPCForest : public Algorithm
 {
 private:
+  struct Trail
+  {
+    unsigned leaf[T]; //!< Index of the leaf (in each of the T trees) into which the patch fell.
+    Point2i coord;    //!< Patch coordinates.
+
+    bool operator==( const Trail &trail ) const { return memcmp( leaf, trail.leaf, sizeof( leaf ) ) == 0; }
+
+    bool operator<( const Trail &trail ) const
+    {
+      for ( int i = 0; i < T - 1; ++i )
+        if ( leaf[i] != trail.leaf[i] )
+          return leaf[i] < trail.leaf[i];
+      return leaf[T - 1] < trail.leaf[T - 1];
+    }
+  };
+
+  class ParallelTrailsFilling : public ParallelLoopBody
+  {
+  private:
+    const GPCForest *forest;
+    const std::vector< GPCPatchDescriptor > *descr;
+    std::vector< Trail > *trails;
+
+    ParallelTrailsFilling &operator=( const ParallelTrailsFilling & );
+
+  public:
+    ParallelTrailsFilling( const GPCForest *_forest, const std::vector< GPCPatchDescriptor > *_descr, std::vector< Trail > *_trails )
+        : forest( _forest ), descr( _descr ), trails( _trails ){};
+
+    void operator()( const Range &range ) const
+    {
+      for ( int t = range.start; t < range.end; ++t )
+        for ( size_t i = 0; i < descr->size(); ++i )
+          trails->at( i ).leaf[t] = forest->tree[t].findLeafForPatch( descr->at( i ) );
+    }
+  };
+
   GPCTree tree[T];

 public:
   /** @brief Train the forest using one sample set for every tree.
    * Please, consider using the next method instead of this one for better quality.
    */
-  void train( GPCSamplesVector &samples )
+  void train( GPCTrainingSamples &samples, const GPCTrainingParams params = GPCTrainingParams() )
   {
     for ( int i = 0; i < T; ++i )
-      tree[i].train( samples );
+      tree[i].train( samples, params );
   }

   /** @brief Train the forest using individual samples for each tree.
    * It is generally better to use this instead of the first method.
    */
-  void train( const std::vector< String > &imagesFrom, const std::vector< String > &imagesTo, const std::vector< String > &gt )
+  void train( const std::vector< String > &imagesFrom, const std::vector< String > &imagesTo, const std::vector< String > &gt,
+              const GPCTrainingParams params = GPCTrainingParams() )
   {
     for ( int i = 0; i < T; ++i )
     {
-      Ptr< GPCTrainingSamples > samples = GPCTrainingSamples::create( imagesFrom, imagesTo, gt ); // Create training set for the tree
-      tree[i].train( *samples );
+      Ptr< GPCTrainingSamples > samples =
+        GPCTrainingSamples::create( imagesFrom, imagesTo, gt, params.descriptorType ); // Create training set for the tree
+      tree[i].train( *samples, params );
+    }
+  }
+
+  void train( InputArrayOfArrays imagesFrom, InputArrayOfArrays imagesTo, InputArrayOfArrays gt,
+              const GPCTrainingParams params = GPCTrainingParams() )
+  {
+    for ( int i = 0; i < T; ++i )
+    {
+      Ptr< GPCTrainingSamples > samples =
+        GPCTrainingSamples::create( imagesFrom, imagesTo, gt, params.descriptorType ); // Create training set for the tree
+      tree[i].train( *samples, params );
     }
   }
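The train() overloads above are the usual entry point for this header. A minimal usage sketch, assuming a dataset laid out as (from-image, to-image, ground-truth .flo) triples; the file names here are placeholders, not part of the patch:

    #include "opencv2/optflow.hpp"
    #include <vector>

    int main()
    {
      std::vector< cv::String > from( 1, "from.png" ), to( 1, "to.png" ), gt( 1, "flow.flo" );

      cv::optflow::GPCTrainingParams params;
      params.descriptorType = cv::optflow::GPC_DESCRIPTOR_WHT; // trade some quality for speed
      params.printProgress = false;

      cv::Ptr< cv::optflow::GPCForest< 5 > > forest = cv::optflow::GPCForest< 5 >::create();
      forest->train( from, to, gt, params ); // resamples the training set for each of the 5 trees
      forest->save( "forest.yml.gz" );       // save() is inherited from cv::Algorithm
      return 0;
    }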
@@ -166,19 +288,93 @@ public:

   void read( const FileNode &fn )
   {
-    CV_Assert( T == (int)fn["ntrees"] );
+    CV_Assert( T <= (int)fn["ntrees"] );
     FileNodeIterator it = fn["trees"].begin();
     for ( int i = 0; i < T; ++i, ++it )
       tree[i].read( *it );
   }

+  /** @brief Find correspondences between two images.
+   * @param[in] imgFrom First image in a sequence.
+   * @param[in] imgTo Second image in a sequence.
+   * @param[out] corr Output vector with pairs of corresponding points.
+   * @param[in] params Additional matching parameters for fine-tuning. 
+ */ + void findCorrespondences( InputArray imgFrom, InputArray imgTo, std::vector< std::pair< Point2i, Point2i > > &corr, + const GPCMatchingParams params = GPCMatchingParams() ) const; + static Ptr< GPCForest > create() { return makePtr< GPCForest >(); } }; + +class CV_EXPORTS_W GPCDetails +{ +public: + static void dropOutliers( std::vector< std::pair< Point2i, Point2i > > &corr ); + + static void getAllDescriptorsForImage( const Mat *imgCh, std::vector< GPCPatchDescriptor > &descr, const GPCMatchingParams &mp, + int type ); + + static void getCoordinatesFromIndex( size_t index, Size sz, int &x, int &y ); +}; + +template < int T > +void GPCForest< T >::findCorrespondences( InputArray imgFrom, InputArray imgTo, std::vector< std::pair< Point2i, Point2i > > &corr, + const GPCMatchingParams params ) const +{ + CV_Assert( imgFrom.channels() == 3 ); + CV_Assert( imgTo.channels() == 3 ); + + Mat from, to; + imgFrom.getMat().convertTo( from, CV_32FC3 ); + imgTo.getMat().convertTo( to, CV_32FC3 ); + cvtColor( from, from, COLOR_BGR2YCrCb ); + cvtColor( to, to, COLOR_BGR2YCrCb ); + + Mat fromCh[3], toCh[3]; + split( from, fromCh ); + split( to, toCh ); + + std::vector< GPCPatchDescriptor > descr; + GPCDetails::getAllDescriptorsForImage( fromCh, descr, params, tree[0].getDescriptorType() ); + std::vector< Trail > trailsFrom( descr.size() ), trailsTo( descr.size() ); + + for ( size_t i = 0; i < descr.size(); ++i ) + GPCDetails::getCoordinatesFromIndex( i, from.size(), trailsFrom[i].coord.x, trailsFrom[i].coord.y ); + parallel_for_( Range( 0, T ), ParallelTrailsFilling( this, &descr, &trailsFrom ) ); + + descr.clear(); + GPCDetails::getAllDescriptorsForImage( toCh, descr, params, tree[0].getDescriptorType() ); + + for ( size_t i = 0; i < descr.size(); ++i ) + GPCDetails::getCoordinatesFromIndex( i, to.size(), trailsTo[i].coord.x, trailsTo[i].coord.y ); + parallel_for_( Range( 0, T ), ParallelTrailsFilling( this, &descr, &trailsTo ) ); + + std::sort( trailsFrom.begin(), trailsFrom.end() ); + std::sort( trailsTo.begin(), trailsTo.end() ); + + for ( size_t i = 0; i < trailsFrom.size(); ++i ) + { + bool uniq = true; + while ( i + 1 < trailsFrom.size() && trailsFrom[i] == trailsFrom[i + 1] ) + ++i, uniq = false; + if ( uniq ) + { + typename std::vector< Trail >::const_iterator lb = std::lower_bound( trailsTo.begin(), trailsTo.end(), trailsFrom[i] ); + if ( lb != trailsTo.end() && *lb == trailsFrom[i] && ( ( lb + 1 ) == trailsTo.end() || !( *lb == *( lb + 1 ) ) ) ) + corr.push_back( std::make_pair( trailsFrom[i].coord, lb->coord ) ); + } + } + + GPCDetails::dropOutliers( corr ); } +//! @} + +} // namespace optflow + CV_EXPORTS void write( FileStorage &fs, const String &name, const optflow::GPCTree::Node &node ); CV_EXPORTS void read( const FileNode &fn, optflow::GPCTree::Node &node, optflow::GPCTree::Node ); -} +} // namespace cv #endif diff --git a/modules/optflow/samples/gpc_evaluate.cpp b/modules/optflow/samples/gpc_evaluate.cpp new file mode 100644 index 000000000..7a53e8f32 --- /dev/null +++ b/modules/optflow/samples/gpc_evaluate.cpp @@ -0,0 +1,164 @@ +#include "opencv2/core/ocl.hpp" +#include "opencv2/highgui.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/optflow.hpp" +#include +#include +#include + +/* This tool finds correspondences between two images using Global Patch Collider + * and calculates error using provided ground truth flow. + * + * It will look for the file named "forest.yml.gz" with a learned forest. 
+ * You can obtain the "forest.yml.gz" either by manually training it using another tool with *_train suffix + * or by downloading one of the files trained on some publicly available dataset from here: + * + * https://drive.google.com/open?id=0B7Hb8cfuzrIIZDFscXVYd0NBNFU + */ + +using namespace cv; + +const String keys = "{help h ? | | print this message}" + "{@image1 | | image1}" + "{@image2 | | image2}" + "{@groundtruth | | path to the .flo file}" + "{@output | | output to a file instead of displaying, output image path}" + "{g gpu | | use OpenCL}" + "{f forest |forest.yml.gz| path to the forest.yml.gz}"; + +const int nTrees = 5; + +static double normL2( const Point2f &v ) { return sqrt( v.x * v.x + v.y * v.y ); } + +static Vec3d getFlowColor( const Point2f &f, const bool logScale = true, const double scaleDown = 5 ) +{ + if ( f.x == 0 && f.y == 0 ) + return Vec3d( 0, 0, 1 ); + + double radius = normL2( f ); + if ( logScale ) + radius = log( radius + 1 ); + radius /= scaleDown; + radius = std::min( 1.0, radius ); + + double angle = ( atan2( -f.y, -f.x ) + CV_PI ) * 180 / CV_PI; + return Vec3d( angle, radius, 1 ); +} + +static void displayFlow( InputArray _flow, OutputArray _img ) +{ + const Size sz = _flow.size(); + Mat flow = _flow.getMat(); + _img.create( sz, CV_32FC3 ); + Mat img = _img.getMat(); + + for ( int i = 0; i < sz.height; ++i ) + for ( int j = 0; j < sz.width; ++j ) + img.at< Vec3f >( i, j ) = getFlowColor( flow.at< Point2f >( i, j ) ); + + cvtColor( img, img, COLOR_HSV2BGR ); +} + +static bool fileProbe( const char *name ) { return std::ifstream( name ).good(); } + +int main( int argc, const char **argv ) +{ + CommandLineParser parser( argc, argv, keys ); + parser.about( "Global Patch Collider evaluation tool" ); + + if ( parser.has( "help" ) ) + { + parser.printMessage(); + return 0; + } + + String fromPath = parser.get< String >( 0 ); + String toPath = parser.get< String >( 1 ); + String gtPath = parser.get< String >( 2 ); + String outPath = parser.get< String >( 3 ); + const bool useOpenCL = parser.has( "gpu" ); + String forestDumpPath = parser.get< String >( "forest" ); + + if ( !parser.check() ) + { + parser.printErrors(); + return 1; + } + + if ( !fileProbe( forestDumpPath.c_str() ) ) + { + std::cerr << "Can't open the file with a trained model: `" << forestDumpPath + << "`.\nYou can obtain this file either by manually training the model using another tool with *_train suffix or by " + "downloading one of the files trained on some publicly available dataset from " + "here:\nhttps://drive.google.com/open?id=0B7Hb8cfuzrIIZDFscXVYd0NBNFU" + << std::endl; + return 1; + } + + ocl::setUseOpenCL( useOpenCL ); + + Ptr< optflow::GPCForest< nTrees > > forest = Algorithm::load< optflow::GPCForest< nTrees > >( forestDumpPath ); + + Mat from = imread( fromPath ); + Mat to = imread( toPath ); + Mat gt = optflow::readOpticalFlow( gtPath ); + std::vector< std::pair< Point2i, Point2i > > corr; + + TickMeter meter; + meter.start(); + + forest->findCorrespondences( from, to, corr, optflow::GPCMatchingParams( useOpenCL ) ); + + meter.stop(); + + std::cout << "Found " << corr.size() << " matches." << std::endl; + std::cout << "Time: " << meter.getTimeSec() << " sec." 
<< std::endl;
+
+  double error = 0;
+  Mat dispErr = Mat::zeros( from.size(), CV_32FC3 );
+  dispErr = Scalar( 0, 0, 1 );
+  Mat disp = Mat::zeros( from.size(), CV_32FC3 );
+  disp = Scalar( 0, 0, 1 );
+
+  for ( size_t i = 0; i < corr.size(); ++i )
+  {
+    const Point2f a = corr[i].first;
+    const Point2f b = corr[i].second;
+    const Point2f c = a + gt.at< Point2f >( corr[i].first.y, corr[i].first.x );
+    error += normL2( b - c );
+    circle( disp, a, 3, getFlowColor( b - a ), -1 );
+    circle( dispErr, a, 3, getFlowColor( b - c, false, 32 ), -1 );
+  }
+
+  error /= corr.size();
+
+  std::cout << "Average endpoint error: " << error << " px." << std::endl;
+
+  cvtColor( disp, disp, COLOR_HSV2BGR );
+  cvtColor( dispErr, dispErr, COLOR_HSV2BGR );
+
+  Mat dispGroundTruth;
+  displayFlow( gt, dispGroundTruth );
+
+  if ( outPath.length() )
+  {
+    putText( disp, "Sparse matching: Global Patch Collider", Point2i( 24, 40 ), FONT_HERSHEY_DUPLEX, 1, Vec3b( 1, 0, 0 ), 2, LINE_AA );
+    char buf[256];
+    sprintf( buf, "Average EPE: %.2f", error );
+    putText( disp, buf, Point2i( 24, 80 ), FONT_HERSHEY_DUPLEX, 1, Vec3b( 1, 0, 0 ), 2, LINE_AA );
+    sprintf( buf, "Number of matches: %u", (unsigned)corr.size() );
+    putText( disp, buf, Point2i( 24, 120 ), FONT_HERSHEY_DUPLEX, 1, Vec3b( 1, 0, 0 ), 2, LINE_AA );
+    disp *= 255;
+    imwrite( outPath, disp );
+    return 0;
+  }
+
+  namedWindow( "Correspondences", WINDOW_AUTOSIZE );
+  imshow( "Correspondences", disp );
+  namedWindow( "Error", WINDOW_AUTOSIZE );
+  imshow( "Error", dispErr );
+  namedWindow( "Ground truth", WINDOW_AUTOSIZE );
+  imshow( "Ground truth", dispGroundTruth );
+  waitKey( 0 );
+
+  return 0;
+}
diff --git a/modules/optflow/samples/gpc_train.cpp b/modules/optflow/samples/gpc_train.cpp
index d4583f925..6557eb573 100644
--- a/modules/optflow/samples/gpc_train.cpp
+++ b/modules/optflow/samples/gpc_train.cpp
@@ -1,31 +1,66 @@
 #include "opencv2/optflow.hpp"
 #include <iostream>

+/* This tool trains the forest for the Global Patch Collider and stores output to the "forest.yml.gz".
+ */
+
+using namespace cv;
+
+const String keys = "{help h ?       |             | print this message}"
+                    "{max-tree-depth |             | Maximum tree depth to stop partitioning}"
+                    "{min-samples    |             | Minimum number of samples in the node to stop partitioning}"
+                    "{descriptor-type|0            | Descriptor type. Set to 0 for quality, 1 for speed.}"
+                    "{print-progress |             | Set to 0 to enable quiet mode, set to 1 to print progress}"
+                    "{f forest       |forest.yml.gz| Path where to store resulting forest. It is recommended to use .yml.gz extension.}";
+
 const int nTrees = 5;

-int main( int argc, const char **argv )
+static void fillInputImagesFromCommandLine( std::vector< String > &img1, std::vector< String > &img2, std::vector< String > &gt, int argc,
+                                            const char **argv )
 {
-  int nSequences = argc - 1;
-
-  if ( nSequences <= 0 || nSequences % 3 != 0 )
+  for ( int i = 1, j = 0; i < argc; ++i )
   {
-    std::cerr << "Usage: " << argv[0] << " ImageFrom1 ImageTo1 GroundTruth1 ... 
ImageFromN ImageToN GroundTruthN" << std::endl; - return 1; + if ( argv[i][0] == '-' ) + continue; + if ( j % 3 == 0 ) + img1.push_back( argv[i] ); + if ( j % 3 == 1 ) + img2.push_back( argv[i] ); + if ( j % 3 == 2 ) + gt.push_back( argv[i] ); + ++j; } +} - nSequences /= 3; - std::vector< cv::String > img1, img2, gt; +int main( int argc, const char **argv ) +{ + CommandLineParser parser( argc, argv, keys ); + parser.about( "Global Patch Collider training tool" ); + + std::vector< String > img1, img2, gt; + optflow::GPCTrainingParams params; - for ( int i = 0; i < nSequences; ++i ) + if ( parser.has( "max-tree-depth" ) ) + params.maxTreeDepth = parser.get< unsigned >( "max-tree-depth" ); + if ( parser.has( "min-samples" ) ) + params.minNumberOfSamples = parser.get< unsigned >( "min-samples" ); + if ( parser.has( "descriptor-type" ) ) + params.descriptorType = parser.get< int >( "descriptor-type" ); + if ( parser.has( "print-progress" ) ) + params.printProgress = parser.get< unsigned >( "print-progress" ) != 0; + + fillInputImagesFromCommandLine( img1, img2, gt, argc, argv ); + + if ( parser.has( "help" ) || img1.size() != img2.size() || img1.size() != gt.size() || img1.size() == 0 ) { - img1.push_back( argv[1 + i * 3] ); - img2.push_back( argv[1 + i * 3 + 1] ); - gt.push_back( argv[1 + i * 3 + 2] ); + std::cerr << "\nUsage: " << argv[0] << " [params] ImageFrom1 ImageTo1 GroundTruth1 ... ImageFromN ImageToN GroundTruthN\n" << std::endl; + parser.printMessage(); + return 1; } - cv::Ptr< cv::optflow::GPCForest< nTrees > > forest = cv::optflow::GPCForest< nTrees >::create(); - forest->train( img1, img2, gt ); - forest->save( "forest.dump" ); + Ptr< optflow::GPCForest< nTrees > > forest = optflow::GPCForest< nTrees >::create(); + forest->train( img1, img2, gt, params ); + forest->save( parser.get< String >( "forest" ) ); return 0; } diff --git a/modules/optflow/samples/gpc_train_middlebury.py b/modules/optflow/samples/gpc_train_middlebury.py new file mode 100644 index 000000000..e923258ce --- /dev/null +++ b/modules/optflow/samples/gpc_train_middlebury.py @@ -0,0 +1,58 @@ +import argparse +import glob +import os +import subprocess + + +def execute(cmd): + popen = subprocess.Popen(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + for stdout_line in iter(popen.stdout.readline, ''): + print(stdout_line.rstrip()) + for stderr_line in iter(popen.stderr.readline, ''): + print(stderr_line.rstrip()) + popen.stdout.close() + popen.stderr.close() + return_code = popen.wait() + if return_code != 0: + raise subprocess.CalledProcessError(return_code, cmd) + + +def main(): + parser = argparse.ArgumentParser( + description='Train Global Patch Collider using Middlebury dataset') + parser.add_argument( + '--bin_path', + help='Path to the training executable (example_optflow_gpc_train)', + required=True) + parser.add_argument('--dataset_path', + help='Path to the directory with frames', + required=True) + parser.add_argument('--gt_path', + help='Path to the directory with ground truth flow', + required=True) + parser.add_argument('--descriptor_type', + help='Descriptor type', + type=int, + default=0) + args = parser.parse_args() + seq = glob.glob(os.path.join(args.dataset_path, '*')) + seq.sort() + input_files = [] + for s in seq: + if os.path.isdir(s): + seq_name = os.path.basename(s) + frames = glob.glob(os.path.join(s, 'frame*.png')) + frames.sort() + assert (len(frames) == 2) + assert (os.path.basename(frames[0]) == 'frame10.png') + assert (os.path.basename(frames[1]) == 'frame11.png') + gt_flow = 
os.path.join(args.gt_path, seq_name, 'flow10.flo') + if os.path.isfile(gt_flow): + input_files += [frames[0], frames[1], gt_flow] + execute([args.bin_path, '--descriptor-type=%d' % args.descriptor_type] + input_files) + + +if __name__ == '__main__': + main() diff --git a/modules/optflow/samples/gpc_train_sintel.py b/modules/optflow/samples/gpc_train_sintel.py new file mode 100644 index 000000000..1f1c5d8a3 --- /dev/null +++ b/modules/optflow/samples/gpc_train_sintel.py @@ -0,0 +1,60 @@ +import argparse +import glob +import os +import subprocess + +FRAME_DIST = 2 + +assert (FRAME_DIST >= 1) + + +def execute(cmd): + popen = subprocess.Popen(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + for stdout_line in iter(popen.stdout.readline, ''): + print(stdout_line.rstrip()) + for stderr_line in iter(popen.stderr.readline, ''): + print(stderr_line.rstrip()) + popen.stdout.close() + popen.stderr.close() + return_code = popen.wait() + if return_code != 0: + raise subprocess.CalledProcessError(return_code, cmd) + + +def main(): + parser = argparse.ArgumentParser( + description='Train Global Patch Collider using MPI Sintel dataset') + parser.add_argument( + '--bin_path', + help='Path to the training executable (example_optflow_gpc_train)', + required=True) + parser.add_argument('--dataset_path', + help='Path to the directory with frames', + required=True) + parser.add_argument('--gt_path', + help='Path to the directory with ground truth flow', + required=True) + parser.add_argument('--descriptor_type', + help='Descriptor type', + type=int, + default=0) + args = parser.parse_args() + seq = glob.glob(os.path.join(args.dataset_path, '*')) + seq.sort() + input_files = [] + for s in seq: + seq_name = os.path.basename(s) + frames = glob.glob(os.path.join(s, 'frame*.png')) + frames.sort() + for i in range(0, len(frames) - 1, FRAME_DIST): + gt_flow = os.path.join(args.gt_path, seq_name, + os.path.basename(frames[i])[0:-4] + '.flo') + assert (os.path.isfile(gt_flow)) + input_files += [frames[i], frames[i + 1], gt_flow] + execute([args.bin_path, '--descriptor-type=%d' % args.descriptor_type] + input_files) + + +if __name__ == '__main__': + main() diff --git a/modules/optflow/samples/pcaflow_demo.cpp b/modules/optflow/samples/pcaflow_demo.cpp new file mode 100644 index 000000000..ea9a60712 --- /dev/null +++ b/modules/optflow/samples/pcaflow_demo.cpp @@ -0,0 +1,172 @@ +#include "opencv2/core/ocl.hpp" +#include "opencv2/highgui.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/optflow.hpp" +#include +#include +#include + +using namespace cv; +using optflow::OpticalFlowPCAFlow; +using optflow::PCAPrior; + +const String keys = "{help h ? 
| | print this message}"
+                    "{@image1      || image1}"
+                    "{@image2      || image2}"
+                    "{@groundtruth || path to the .flo file}"
+                    "{@prior       || path to a prior file for PCAFlow}"
+                    "{@output      || output image path}"
+                    "{g gpu        | | use OpenCL}";
+
+static double normL2( const Point2f &v ) { return sqrt( v.x * v.x + v.y * v.y ); }
+
+static bool fileProbe( const char *name ) { return std::ifstream( name ).good(); }
+
+static Vec3d getFlowColor( const Point2f &f, const bool logScale = true, const double scaleDown = 5 )
+{
+  if ( f.x == 0 && f.y == 0 )
+    return Vec3d( 0, 0, 1 );
+
+  double radius = normL2( f );
+  if ( logScale )
+    radius = log( radius + 1 );
+  radius /= scaleDown;
+  radius = std::min( 1.0, radius );
+
+  double angle = ( atan2( -f.y, -f.x ) + CV_PI ) * 180 / CV_PI;
+  return Vec3d( angle, radius, 1 );
+}
+
+static void displayFlow( InputArray _flow, OutputArray _img )
+{
+  const Size sz = _flow.size();
+  Mat flow = _flow.getMat();
+  _img.create( sz, CV_32FC3 );
+  Mat img = _img.getMat();
+
+  for ( int i = 0; i < sz.height; ++i )
+    for ( int j = 0; j < sz.width; ++j )
+      img.at< Vec3f >( i, j ) = getFlowColor( flow.at< Point2f >( i, j ) );
+
+  cvtColor( img, img, COLOR_HSV2BGR );
+}
+
+static bool isFlowCorrect( const Point2f &u )
+{
+  return !cvIsNaN( u.x ) && !cvIsNaN( u.y ) && ( fabs( u.x ) < 1e9 ) && ( fabs( u.y ) < 1e9 );
+}
+
+static double calcEPE( const Mat &f1, const Mat &f2 )
+{
+  double sum = 0;
+  Size sz = f1.size();
+  size_t cnt = 0;
+  for ( int i = 0; i < sz.height; ++i )
+    for ( int j = 0; j < sz.width; ++j )
+      if ( isFlowCorrect( f1.at< Point2f >( i, j ) ) && isFlowCorrect( f2.at< Point2f >( i, j ) ) )
+      {
+        sum += normL2( f1.at< Point2f >( i, j ) - f2.at< Point2f >( i, j ) );
+        ++cnt;
+      }
+  return sum / cnt;
+}
+
+static void displayResult( Mat &i1, Mat &i2, Mat &gt, Ptr< DenseOpticalFlow > &algo, OutputArray _img, const char *descr,
+                           const bool useGpu = false )
+{
+  Mat flow( i1.size[0], i1.size[1], CV_32FC2 );
+  TickMeter meter;
+  meter.start();
+
+  if ( useGpu )
+    algo->calc( i1, i2, flow.getUMat( ACCESS_RW ) );
+  else
+    algo->calc( i1, i2, flow );
+
+  meter.stop();
+  displayFlow( flow, _img );
+  Mat img = _img.getMat();
+  putText( img, descr, Point2i( 24, 40 ), FONT_HERSHEY_DUPLEX, 1, Vec3b( 1, 0, 0 ), 2, LINE_AA );
+  char buf[256];
+  sprintf( buf, "Average EPE: %.2f", calcEPE( flow, gt ) );
+  putText( img, buf, Point2i( 24, 80 ), FONT_HERSHEY_DUPLEX, 1, Vec3b( 1, 0, 0 ), 2, LINE_AA );
+  sprintf( buf, "Time: %.2fs", meter.getTimeSec() );
+  putText( img, buf, Point2i( 24, 120 ), FONT_HERSHEY_DUPLEX, 1, Vec3b( 1, 0, 0 ), 2, LINE_AA );
+}
+
+static void displayGT( InputArray _flow, OutputArray _img, const char *descr )
+{
+  displayFlow( _flow, _img );
+  Mat img = _img.getMat();
+  putText( img, descr, Point2i( 24, 40 ), FONT_HERSHEY_DUPLEX, 1, Vec3b( 1, 0, 0 ), 2, LINE_AA );
+}
+
+int main( int argc, const char **argv )
+{
+  CommandLineParser parser( argc, argv, keys );
+  parser.about( "PCAFlow demonstration" );
+
+  if ( parser.has( "help" ) )
+  {
+    parser.printMessage();
+    return 0;
+  }
+
+  String img1 = parser.get< String >( 0 );
+  String img2 = parser.get< String >( 1 );
+  String groundtruth = parser.get< String >( 2 );
+  String prior = parser.get< String >( 3 );
+  String outimg = parser.get< String >( 4 );
+  const bool useGpu = parser.has( "gpu" );
+
+  if ( !parser.check() )
+  {
+    parser.printErrors();
+    return 1;
+  }
+
+  if ( !fileProbe( prior.c_str() ) )
+  {
+    std::cerr << "Can't open the file with prior! 
Check the provided path: " << prior << std::endl; + return 1; + } + + cv::ocl::setUseOpenCL( useGpu ); + + Mat i1 = imread( img1 ); + Mat i2 = imread( img2 ); + Mat gt = optflow::readOpticalFlow( groundtruth ); + + Mat i1g, i2g; + cvtColor( i1, i1g, COLOR_BGR2GRAY ); + cvtColor( i2, i2g, COLOR_BGR2GRAY ); + + Mat pcaflowDisp, pcaflowpriDisp, farnebackDisp, gtDisp; + + { + Ptr< DenseOpticalFlow > pcaflow = makePtr< OpticalFlowPCAFlow >( makePtr< PCAPrior >( prior.c_str() ) ); + displayResult( i1, i2, gt, pcaflow, pcaflowpriDisp, "PCAFlow with prior", useGpu ); + } + + { + Ptr< DenseOpticalFlow > pcaflow = makePtr< OpticalFlowPCAFlow >(); + displayResult( i1, i2, gt, pcaflow, pcaflowDisp, "PCAFlow without prior", useGpu ); + } + + { + Ptr< DenseOpticalFlow > farneback = optflow::createOptFlow_Farneback(); + displayResult( i1g, i2g, gt, farneback, farnebackDisp, "Farneback", useGpu ); + } + + displayGT( gt, gtDisp, "Ground truth" ); + + Mat disp1, disp2; + vconcat( pcaflowpriDisp, farnebackDisp, disp1 ); + vconcat( pcaflowDisp, gtDisp, disp2 ); + hconcat( disp1, disp2, disp1 ); + disp1 *= 255; + + imwrite( outimg, disp1 ); + + return 0; +} diff --git a/modules/optflow/src/opencl/sparse_matching_gpc.cl b/modules/optflow/src/opencl/sparse_matching_gpc.cl new file mode 100644 index 000000000..ba19d7b34 --- /dev/null +++ b/modules/optflow/src/opencl/sparse_matching_gpc.cl @@ -0,0 +1,69 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2016, Itseez, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. + +// @Authors +// Vladislav Samsonov, vvladxx@gmail.com + +__kernel void getPatchDescriptor( + __global const uchar* imgCh0, int ic0step, int ic0off, + __global const uchar* imgCh1, int ic1step, int ic1off, + __global const uchar* imgCh2, int ic2step, int ic2off, + __global uchar* out, int outstep, int outoff, + const int gh, const int gw, const int PR ) +{ + const int i = get_global_id(0); + const int j = get_global_id(1); + + if (i >= gh || j >= gw) + return; + + __global double* desc = (__global double*)(out + (outstep * (i * gw + j) + outoff)); + const int patchRadius = PR * 2; + float patch[PATCH_RADIUS_DOUBLED][PATCH_RADIUS_DOUBLED]; + + for (int i0 = 0; i0 < patchRadius; ++i0) { + __global const float* ch0Row = (__global const float*)(imgCh0 + (ic0step * (i + i0) + ic0off + j * sizeof(float))); + for (int j0 = 0; j0 < patchRadius; ++j0) + patch[i0][j0] = ch0Row[j0]; + } + + #pragma unroll + for (int n0 = 0; n0 < 4; ++n0) { + #pragma unroll + for (int n1 = 0; n1 < 4; ++n1) { + double sum = 0; + for (int i0 = 0; i0 < patchRadius; ++i0) + for (int j0 = 0; j0 < patchRadius; ++j0) + sum += patch[i0][j0] * cos(CV_PI * (i0 + 0.5) * n0 / patchRadius) * cos(CV_PI * (j0 + 0.5) * n1 / patchRadius); + desc[n0 * 4 + n1] = sum / PR; + } + } + + for (int k = 0; k < 4; ++k) { + desc[k] *= SQRT2_INV; + desc[k * 4] *= SQRT2_INV; + } + + double sum = 0; + + for (int i0 = 0; i0 < patchRadius; ++i0) { + __global const float* ch1Row = (__global const float*)(imgCh1 + (ic1step * (i + i0) + ic1off + j * sizeof(float))); + for (int j0 = 0; j0 < patchRadius; ++j0) + sum += ch1Row[j0]; + } + + desc[16] = sum / patchRadius; + sum = 0; + + for (int i0 = 0; i0 < patchRadius; ++i0) { + __global const float* ch2Row = (__global const float*)(imgCh2 + (ic2step * (i + i0) + ic2off + j * sizeof(float))); + for (int j0 = 
0; j0 < patchRadius; ++j0) + sum += ch2Row[j0]; + } + + desc[17] = sum / patchRadius; +} diff --git a/modules/optflow/src/sparse_matching_gpc.cpp b/modules/optflow/src/sparse_matching_gpc.cpp index 11e2abaf3..99ecc1a12 100644 --- a/modules/optflow/src/sparse_matching_gpc.cpp +++ b/modules/optflow/src/sparse_matching_gpc.cpp @@ -41,8 +41,22 @@ //M*/ #include "opencv2/core/core_c.h" +#include "opencv2/core/private.hpp" +#include "opencv2/flann/miniflann.hpp" #include "opencv2/highgui.hpp" #include "precomp.hpp" +#include "opencl_kernels_optflow.hpp" + +/* Disable "from double to float" and "from size_t to int" warnings. + * Fixing these would make the code look ugly by introducing explicit cast all around. + * Here these warning are pointless anyway. + */ +#ifdef _MSC_VER +#pragma warning( disable : 4244 4267 4838 ) +#endif +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wshorten-64-to-32" +#endif namespace cv { @@ -51,12 +65,23 @@ namespace optflow namespace { -const int patchRadius = 10; -const double thresholdMagnitudeFrac = 0.6666666666; +#define PATCH_RADIUS 10 +#define PATCH_RADIUS_DOUBLED 20 +#define SQRT2_INV 0.7071067811865475 + +const int patchRadius = PATCH_RADIUS; const int globalIters = 3; const int localIters = 500; -const int minNumberOfSamples = 2; -//const bool debugOutput = true; +const double thresholdOutliers = 0.98; +const double thresholdMagnitudeFrac = 0.8; +const double epsTolerance = 1e-12; +const unsigned scoreGainPos = 5; +const unsigned scoreGainNeg = 1; +const unsigned negSearchKNN = 5; +const double simulatedAnnealingTemperatureCoef = 200.0; +const double sigmaGrowthRate = 0.2; + +RNG rng; struct Magnitude { @@ -79,9 +104,9 @@ struct PartitionPredicate1 bool operator()( const GPCPatchSample &sample ) const { - const bool direction1 = ( coef.dot( sample.first.feature ) < rhs ); - const bool direction2 = ( coef.dot( sample.second.feature ) < rhs ); - return direction1 == false && direction1 == direction2; + bool refdir, posdir, negdir; + sample.getDirections( refdir, posdir, negdir, coef, rhs ); + return refdir == false && ( posdir == false || negdir == true ); } }; @@ -94,20 +119,276 @@ struct PartitionPredicate2 bool operator()( const GPCPatchSample &sample ) const { - const bool direction1 = ( coef.dot( sample.first.feature ) < rhs ); - const bool direction2 = ( coef.dot( sample.second.feature ) < rhs ); - return direction1 != direction2; + bool refdir, posdir, negdir; + sample.getDirections( refdir, posdir, negdir, coef, rhs ); + return refdir != posdir && refdir == negdir; + } +}; + +struct CompareWithTolerance +{ + double val; + + CompareWithTolerance( double _val ) : val( _val ) {}; + + bool operator()( const double &elem ) const + { + const double diff = ( val + elem == 0 ) ? 
std::abs( val - elem ) : std::abs( ( val - elem ) / ( val + elem ) ); + return diff <= epsTolerance; } }; float normL2Sqr( const Vec2f &v ) { return v[0] * v[0] + v[1] * v[1]; } +int normL2Sqr( const Point2i &v ) { return v.x * v.x + v.y * v.y; } + bool checkBounds( int i, int j, Size sz ) { return i >= patchRadius && j >= patchRadius && i + patchRadius < sz.height && j + patchRadius < sz.width; } -void getTrainingSamples( const Mat &from, const Mat &to, const Mat >, GPCSamplesVector &samples ) +void getDCTPatchDescriptor( GPCPatchDescriptor &patchDescr, const Mat *imgCh, int i, int j ) +{ + Rect roi( j - patchRadius, i - patchRadius, 2 * patchRadius, 2 * patchRadius ); + Mat freqDomain; + dct( imgCh[0]( roi ), freqDomain ); + + double *feature = patchDescr.feature.val; + feature[0] = freqDomain.at< float >( 0, 0 ); + feature[1] = freqDomain.at< float >( 0, 1 ); + feature[2] = freqDomain.at< float >( 0, 2 ); + feature[3] = freqDomain.at< float >( 0, 3 ); + + feature[4] = freqDomain.at< float >( 1, 0 ); + feature[5] = freqDomain.at< float >( 1, 1 ); + feature[6] = freqDomain.at< float >( 1, 2 ); + feature[7] = freqDomain.at< float >( 1, 3 ); + + feature[8] = freqDomain.at< float >( 2, 0 ); + feature[9] = freqDomain.at< float >( 2, 1 ); + feature[10] = freqDomain.at< float >( 2, 2 ); + feature[11] = freqDomain.at< float >( 2, 3 ); + + feature[12] = freqDomain.at< float >( 3, 0 ); + feature[13] = freqDomain.at< float >( 3, 1 ); + feature[14] = freqDomain.at< float >( 3, 2 ); + feature[15] = freqDomain.at< float >( 3, 3 ); + + feature[16] = cv::sum( imgCh[1]( roi ) )[0] / ( 2 * patchRadius ); + feature[17] = cv::sum( imgCh[2]( roi ) )[0] / ( 2 * patchRadius ); +} + +double sumInt( const Mat &integ, int i, int j, int h, int w ) +{ + return integ.at< double >( i + h, j + w ) - integ.at< double >( i + h, j ) - integ.at< double >( i, j + w ) + integ.at< double >( i, j ); +} + +void getWHTPatchDescriptor( GPCPatchDescriptor &patchDescr, const Mat *imgCh, int i, int j ) +{ + i -= patchRadius; + j -= patchRadius; + const int k = 2 * patchRadius; + const double s = sumInt( imgCh[0], i, j, k, k ); + double *feature = patchDescr.feature.val; + + feature[0] = s; + feature[1] = s - 2 * sumInt( imgCh[0], i, j + k / 2, k, k / 2 ); + feature[2] = s - 2 * sumInt( imgCh[0], i, j + k / 4, k, k / 2 ); + feature[3] = s - 2 * sumInt( imgCh[0], i, j + k / 4, k, k / 4 ) - 2 * sumInt( imgCh[0], i, j + 3 * k / 4, k, k / 4 ); + + feature[4] = s - 2 * sumInt( imgCh[0], i + k / 2, j, k / 2, k ); + feature[5] = s - 2 * sumInt( imgCh[0], i, j + k / 2, k / 2, k / 2 ) - 2 * sumInt( imgCh[0], i + k / 2, j, k / 2, k / 2 ); + feature[6] = s - 2 * sumInt( imgCh[0], i, j + k / 4, k / 2, k / 2 ) - 2 * sumInt( imgCh[0], i + k / 2, j, k / 2, k / 4 ) - + 2 * sumInt( imgCh[0], i + k / 2, j + 3 * k / 4, k / 2, k / 4 ); + feature[7] = s - 2 * sumInt( imgCh[0], i, j + k / 4, k / 2, k / 4 ) - 2 * sumInt( imgCh[0], i, j + 3 * k / 4, k / 2, k / 4 ) - + 2 * sumInt( imgCh[0], i + k / 2, j, k / 2, k / 4 ) - 2 * sumInt( imgCh[0], i + k / 2, j + k / 2, k / 2, k / 4 ); + + feature[8] = s - 2 * sumInt( imgCh[0], i + k / 4, j, k / 2, k ); + feature[9] = s - 2 * sumInt( imgCh[0], i + k / 4, j, k / 2, k / 2 ) - 2 * sumInt( imgCh[0], i, j + k / 2, k / 4, k / 2 ) - + 2 * sumInt( imgCh[0], i + 3 * k / 4, j + k / 2, k / 4, k / 2 ); + feature[10] = s - 2 * sumInt( imgCh[0], i + k / 4, j, k / 2, k / 4 ) - 2 * sumInt( imgCh[0], i + k / 4, j + 3 * k / 4, k / 2, k / 4 ) - + 2 * sumInt( imgCh[0], i, j + k / 4, k / 4, k / 2 ) - 2 * sumInt( imgCh[0], i + 3 * k / 
4, j + k / 4, k / 4, k / 2 ); + feature[11] = s - 2 * sumInt( imgCh[0], i, j + k / 4, k / 4, k / 4 ) - 2 * sumInt( imgCh[0], i, j + 3 * k / 4, k / 4, k / 4 ) - + 2 * sumInt( imgCh[0], i + k / 4, j, k / 2, k / 4 ) - 2 * sumInt( imgCh[0], i + k / 4, j + k / 2, k / 2, k / 4 ) - + 2 * sumInt( imgCh[0], i + 3 * k / 4, j + k / 4, k / 4, k / 4 ) - + 2 * sumInt( imgCh[0], i + 3 * k / 4, j + 3 * k / 4, k / 4, k / 4 ); + + feature[12] = s - 2 * sumInt( imgCh[0], i + k / 4, j, k / 4, k ) - 2 * sumInt( imgCh[0], i + 3 * k / 4, j, k / 4, k ); + feature[13] = s - 2 * sumInt( imgCh[0], i + k / 4, j, k / 4, k / 2 ) - 2 * sumInt( imgCh[0], i + 3 * k / 4, j, k / 4, k / 2 ) - + 2 * sumInt( imgCh[0], i, j + k / 2, k / 4, k / 2 ) - 2 * sumInt( imgCh[0], i + k / 2, j + k / 2, k / 4, k / 2 ); + feature[14] = s - 2 * sumInt( imgCh[0], i + k / 4, j, k / 4, k / 4 ) - 2 * sumInt( imgCh[0], i + 3 * k / 4, j, k / 4, k / 4 ) - + 2 * sumInt( imgCh[0], i, j + k / 4, k / 4, k / 2 ) - 2 * sumInt( imgCh[0], i + k / 2, j + k / 4, k / 4, k / 2 ) - + 2 * sumInt( imgCh[0], i + k / 4, j + 3 * k / 4, k / 4, k / 4 ) - + 2 * sumInt( imgCh[0], i + 3 * k / 4, j + 3 * k / 4, k / 4, k / 4 ); + feature[15] = s - 2 * sumInt( imgCh[0], i, j + k / 4, k / 4, k / 4 ) - 2 * sumInt( imgCh[0], i, j + 3 * k / 4, k / 4, k / 4 ) - + 2 * sumInt( imgCh[0], i + k / 4, j, k / 4, k / 4 ) - 2 * sumInt( imgCh[0], i + k / 4, j + k / 2, k / 4, k / 4 ) - + 2 * sumInt( imgCh[0], i + k / 2, j + k / 4, k / 4, k / 4 ) - + 2 * sumInt( imgCh[0], i + k / 2, j + 3 * k / 4, k / 4, k / 4 ) - 2 * sumInt( imgCh[0], i + 3 * k / 4, j, k / 4, k / 4 ) - + 2 * sumInt( imgCh[0], i + 3 * k / 4, j + k / 2, k / 4, k / 4 ); + + feature[16] = sumInt( imgCh[1], i, j, k, k ); + feature[17] = sumInt( imgCh[2], i, j, k, k ); + + patchDescr.feature /= patchRadius; +} + +class ParallelDCTFiller : public ParallelLoopBody +{ +private: + const Size sz; + const Mat *imgCh; + std::vector< GPCPatchDescriptor > *descr; + + ParallelDCTFiller &operator=( const ParallelDCTFiller & ); + +public: + ParallelDCTFiller( const Size &_sz, const Mat *_imgCh, std::vector< GPCPatchDescriptor > *_descr ) + : sz( _sz ), imgCh( _imgCh ), descr( _descr ){}; + + void operator()( const Range &range ) const + { + for ( int i = range.start; i < range.end; ++i ) + { + int x, y; + GPCDetails::getCoordinatesFromIndex( i, sz, x, y ); + getDCTPatchDescriptor( descr->at( i ), imgCh, y, x ); + } + } +}; + +#ifdef HAVE_OPENCL + +bool ocl_getAllDCTDescriptorsForImage( const Mat *imgCh, std::vector< GPCPatchDescriptor > &descr ) +{ + const Size sz = imgCh[0].size(); + ocl::Kernel kernel( "getPatchDescriptor", ocl::optflow::sparse_matching_gpc_oclsrc, + format( "-DPATCH_RADIUS_DOUBLED=%d -DCV_PI=%f -DSQRT2_INV=%f", PATCH_RADIUS_DOUBLED, CV_PI, SQRT2_INV ) ); + size_t globSize[] = {sz.height - 2 * patchRadius, sz.width - 2 * patchRadius}; + UMat out( globSize[0] * globSize[1], GPCPatchDescriptor::nFeatures, CV_64F ); + if ( + kernel + .args( cv::ocl::KernelArg::ReadOnlyNoSize( imgCh[0].getUMat( ACCESS_READ ) ), + cv::ocl::KernelArg::ReadOnlyNoSize( imgCh[1].getUMat( ACCESS_READ ) ), + cv::ocl::KernelArg::ReadOnlyNoSize( imgCh[2].getUMat( ACCESS_READ ) ), + cv::ocl::KernelArg::WriteOnlyNoSize( out ), + (int)globSize[0], (int)globSize[1], (int)patchRadius ) + .run( 2, globSize, 0, true ) == false ) + return false; + Mat cpuOut = out.getMat( 0 ); + for ( int i = 0; i + 2 * patchRadius < sz.height; ++i ) + for ( int j = 0; j + 2 * patchRadius < sz.width; ++j ) + descr.push_back( *cpuOut.ptr< GPCPatchDescriptor >( i * globSize[1] 
+ j ) );
+  return true;
+}
+
+#endif
+
+void getAllDCTDescriptorsForImage( const Mat *imgCh, std::vector< GPCPatchDescriptor > &descr, const GPCMatchingParams &mp )
+{
+  const Size sz = imgCh[0].size();
+  descr.reserve( ( sz.height - 2 * patchRadius ) * ( sz.width - 2 * patchRadius ) );
+
+  (void)mp; // Fix unused parameter warning in case OpenCL is not available
+  CV_OCL_RUN( mp.useOpenCL, ocl_getAllDCTDescriptorsForImage( imgCh, descr ) )
+
+  descr.resize( ( sz.height - 2 * patchRadius ) * ( sz.width - 2 * patchRadius ) );
+  parallel_for_( Range( 0, descr.size() ), ParallelDCTFiller( sz, imgCh, &descr ) );
+}
+
+class ParallelWHTFiller : public ParallelLoopBody
+{
+private:
+  const Size sz;
+  const Mat *imgChInt;
+  std::vector< GPCPatchDescriptor > *descr;
+
+  ParallelWHTFiller &operator=( const ParallelWHTFiller & );
+
+public:
+  ParallelWHTFiller( const Size &_sz, const Mat *_imgChInt, std::vector< GPCPatchDescriptor > *_descr )
+      : sz( _sz ), imgChInt( _imgChInt ), descr( _descr ){};
+
+  void operator()( const Range &range ) const
+  {
+    for ( int i = range.start; i < range.end; ++i )
+    {
+      int x, y;
+      GPCDetails::getCoordinatesFromIndex( i, sz, x, y );
+      getWHTPatchDescriptor( descr->at( i ), imgChInt, y, x );
+    }
+  }
+};
+
+void getAllWHTDescriptorsForImage( const Mat *imgCh, std::vector< GPCPatchDescriptor > &descr, const GPCMatchingParams & )
+{
+  const Size sz = imgCh[0].size();
+  descr.resize( ( sz.height - 2 * patchRadius ) * ( sz.width - 2 * patchRadius ) );
+
+  Mat imgChInt[3];
+  integral( imgCh[0], imgChInt[0], CV_64F );
+  integral( imgCh[1], imgChInt[1], CV_64F );
+  integral( imgCh[2], imgChInt[2], CV_64F );
+
+  parallel_for_( Range( 0, descr.size() ), ParallelWHTFiller( sz, imgChInt, &descr ) );
+}
+
+void buildIndex( OutputArray featuresOut, flann::Index &index, const Mat *imgCh,
+                 void ( *getAllDescrFn )( const Mat *, std::vector< GPCPatchDescriptor > &, const GPCMatchingParams & ) )
+{
+  std::vector< GPCPatchDescriptor > descriptors;
+  getAllDescrFn( imgCh, descriptors, GPCMatchingParams() );
+
+  featuresOut.create( descriptors.size(), GPCPatchDescriptor::nFeatures, CV_32F );
+  Mat features = featuresOut.getMat();
+
+  for ( size_t i = 0; i < descriptors.size(); ++i )
+    *features.ptr< Vec< float, GPCPatchDescriptor::nFeatures > >( i ) = descriptors[i].feature;
+
+  cv::flann::KDTreeIndexParams indexParams;
+  index.build( features, indexParams, cvflann::FLANN_DIST_L2 );
+}
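buildIndex flattens all patch descriptors of the target image into a CV_32F matrix, one row per descriptor, and builds a KD-tree over the rows. The same cv::flann pattern on made-up data (a standalone sketch, independent of GPC):

    #include "opencv2/core.hpp"
    #include "opencv2/flann/miniflann.hpp"

    int main()
    {
      cv::Mat features( 100, 18, CV_32F ), query( 1, 18, CV_32F ), indices, dists;
      cv::randu( features, 0.0, 1.0 ); // 100 random 18-dimensional "descriptors"
      cv::randu( query, 0.0, 1.0 );

      cv::flann::Index index;
      index.build( features, cv::flann::KDTreeIndexParams(), cvflann::FLANN_DIST_L2 );
      index.knnSearch( query, indices, dists, 5 ); // row indices of the 5 nearest descriptors
      return 0;
    }

getTriplet below runs this kind of query with the reference patch as the query vector, and among the returned neighbors keeps the one that lies spatially farthest from the true match as the hard negative sample.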
+void getTriplet( const Magnitude &mag, const Mat &gt, const Mat *fromCh, const Mat *toCh, GPCSamplesVector &samples, flann::Index &index,
+                 void ( *getDescFn )( GPCPatchDescriptor &, const Mat *, int, int ) )
+{
+  const Size sz = gt.size();
+  const int i0 = mag.i;
+  const int j0 = mag.j;
+  const int i1 = i0 + cvRound( gt.at< Vec2f >( i0, j0 )[1] );
+  const int j1 = j0 + cvRound( gt.at< Vec2f >( i0, j0 )[0] );
+  if ( checkBounds( i1, j1, sz ) )
+  {
+    GPCPatchSample ps;
+    getDescFn( ps.ref, fromCh, i0, j0 );
+    getDescFn( ps.pos, toCh, i1, j1 );
+    ps.neg.markAsSeparated();
+
+    Matx< float, 1, GPCPatchDescriptor::nFeatures > ref32;
+    Matx< int, 1, negSearchKNN > indices;
+    int maxDist = 0;
+
+    for ( unsigned i = 0; i < GPCPatchDescriptor::nFeatures; ++i )
+      ref32( 0, i ) = ps.ref.feature[i];
+
+    index.knnSearch( ref32, indices, noArray(), negSearchKNN );
+
+    for ( unsigned i = 0; i < negSearchKNN; ++i )
+    {
+      int i2, j2;
+      GPCDetails::getCoordinatesFromIndex( indices( 0, i ), sz, j2, i2 );
+      const int dist = ( i2 - i1 ) * ( i2 - i1 ) + ( j2 - j1 ) * ( j2 - j1 );
+      if ( maxDist < dist )
+      {
+        maxDist = dist;
+        getDescFn( ps.neg, toCh, i2, j2 );
+      }
+    }
+
+    samples.push_back( ps );
+  }
+}
+
+void getTrainingSamples( const Mat &from, const Mat &to, const Mat &gt, GPCSamplesVector &samples, const int type )
 {
   const Size sz = gt.size();
   std::vector< Magnitude > mag;
@@ -116,33 +397,53 @@ void getTrainingSamples( const Mat &from, const Mat &to, const Mat &gt, GPCSampl
     for ( int j = patchRadius; j + patchRadius < sz.width; ++j )
       mag.push_back( Magnitude( normL2Sqr( gt.at< Vec2f >( i, j ) ), i, j ) );

-  size_t n = size_t(mag.size() * thresholdMagnitudeFrac); // As suggested in the paper, we discard part of the training samples
-                                                          // with a small displacement and train to better distinguish hard pairs.
+  size_t n = size_t( mag.size() * thresholdMagnitudeFrac ); // As suggested in the paper, we discard part of the training samples
+                                                            // with a small displacement and train to better distinguish hard pairs.
   std::nth_element( mag.begin(), mag.begin() + n, mag.end() );
   mag.resize( n );
   std::random_shuffle( mag.begin(), mag.end() );
   n /= patchRadius;
   mag.resize( n );

-  Mat fromCh[3], toCh[3];
-  split( from, fromCh );
-  split( to, toCh );
+  if ( type == GPC_DESCRIPTOR_DCT )
+  {
+    Mat fromCh[3], toCh[3];
+    split( from, fromCh );
+    split( to, toCh );
+
+    Mat allDescriptors;
+    flann::Index index;
+    buildIndex( allDescriptors, index, toCh, getAllDCTDescriptorsForImage );

-  for ( size_t k = 0; k < n; ++k )
+    for ( size_t k = 0; k < n; ++k )
+      getTriplet( mag[k], gt, fromCh, toCh, samples, index, getDCTPatchDescriptor );
+  }
+  else if ( type == GPC_DESCRIPTOR_WHT )
   {
-    int i0 = mag[k].i;
-    int j0 = mag[k].j;
-    int i1 = i0 + cvRound( gt.at< Vec2f >( i0, j0 )[1] );
-    int j1 = j0 + cvRound( gt.at< Vec2f >( i0, j0 )[0] );
-    if ( checkBounds( i1, j1, sz ) )
-      samples.push_back( std::make_pair( GPCPatchDescriptor( fromCh, i0, j0 ), GPCPatchDescriptor( toCh, i1, j1 ) ) );
+    Mat fromCh[3], toCh[3], fromChInt[3], toChInt[3];
+    split( from, fromCh );
+    split( to, toCh );
+    integral( fromCh[0], fromChInt[0], CV_64F );
+    integral( fromCh[1], fromChInt[1], CV_64F );
+    integral( fromCh[2], fromChInt[2], CV_64F );
+    integral( toCh[0], toChInt[0], CV_64F );
+    integral( toCh[1], toChInt[1], CV_64F );
+    integral( toCh[2], toChInt[2], CV_64F );
+
+    Mat allDescriptors;
+    flann::Index index;
+    buildIndex( allDescriptors, index, toCh, getAllWHTDescriptorsForImage );
+
+    for ( size_t k = 0; k < n; ++k )
+      getTriplet( mag[k], gt, fromChInt, toChInt, samples, index, getWHTPatchDescriptor );
   }
+  else
+    CV_Error( CV_StsBadArg, "Unknown descriptor type" );
 }

 /* Sample random number from Cauchy distribution. */
 double getRandomCauchyScalar()
 {
-  static RNG rng;
   return tan( rng.uniform( -1.54, 1.54 ) ); // I intentionally used the value slightly less than PI/2 to enforce strictly
                                             // zero probability for large numbers. Resulting PDF for Cauchy has
                                             // truncated "tails".
@@ -155,41 +456,65 @@ void getRandomCauchyVector( Vec< double, GPCPatchDescriptor::nFeatures > &v )
   for ( unsigned i = 0; i < GPCPatchDescriptor::nFeatures; ++i )
     v[i] = getRandomCauchyScalar();
 }
+
+double getRobustMedian( double m ) { return m < 0 ? 
m * ( 1.0 + epsTolerance ) : m * ( 1.0 - epsTolerance ); } } -GPCPatchDescriptor::GPCPatchDescriptor( const Mat *imgCh, int i, int j ) +double GPCPatchDescriptor::dot( const Vec< double, nFeatures > &coef ) const { - Rect roi( j - patchRadius, i - patchRadius, 2 * patchRadius, 2 * patchRadius ); - Mat freqDomain; - dct( imgCh[0]( roi ), freqDomain ); - - feature[0] = freqDomain.at< float >( 0, 0 ); - feature[1] = freqDomain.at< float >( 0, 1 ); - feature[2] = freqDomain.at< float >( 0, 2 ); - feature[3] = freqDomain.at< float >( 0, 3 ); - - feature[4] = freqDomain.at< float >( 1, 0 ); - feature[5] = freqDomain.at< float >( 1, 1 ); - feature[6] = freqDomain.at< float >( 1, 2 ); - feature[7] = freqDomain.at< float >( 1, 3 ); +#if CV_SIMD128_64F + v_float64x2 sum = v_setzero_f64(); + for ( unsigned i = 0; i < nFeatures; i += 2 ) + { + v_float64x2 x = v_load_aligned( &feature.val[i] ); + v_float64x2 y = v_load_aligned( &coef.val[i] ); + sum = v_muladd( x, y, sum ); + } +#if CV_SSE2 + __m128d sumrev = _mm_shuffle_pd( sum.val, sum.val, _MM_SHUFFLE2( 0, 1 ) ); + return _mm_cvtsd_f64( _mm_add_pd( sum.val, sumrev ) ); +#else + double CV_DECL_ALIGNED( 16 ) buf[2]; + v_store_aligned( buf, sum ); + return OPENCV_HAL_ADD( buf[0], buf[1] ); +#endif + +#else + return feature.dot( coef ); +#endif +} - feature[8] = freqDomain.at< float >( 2, 0 ); - feature[9] = freqDomain.at< float >( 2, 1 ); - feature[10] = freqDomain.at< float >( 2, 2 ); - feature[11] = freqDomain.at< float >( 2, 3 ); +void GPCPatchSample::getDirections( bool &refdir, bool &posdir, bool &negdir, const Vec< double, GPCPatchDescriptor::nFeatures > &coef, double rhs ) const +{ + refdir = ( ref.dot( coef ) < rhs ); + posdir = pos.isSeparated() ? ( !refdir ) : ( pos.dot( coef ) < rhs ); + negdir = neg.isSeparated() ? 
 
- feature[12] = freqDomain.at< float >( 3, 0 );
- feature[13] = freqDomain.at< float >( 3, 1 );
- feature[14] = freqDomain.at< float >( 3, 2 );
- feature[15] = freqDomain.at< float >( 3, 3 );
+void GPCDetails::getAllDescriptorsForImage( const Mat *imgCh, std::vector< GPCPatchDescriptor > &descr, const GPCMatchingParams &mp,
+ int type )
+{
+ if ( type == GPC_DESCRIPTOR_DCT )
+ getAllDCTDescriptorsForImage( imgCh, descr, mp );
+ else if ( type == GPC_DESCRIPTOR_WHT )
+ getAllWHTDescriptorsForImage( imgCh, descr, mp );
+ else
+ CV_Error( CV_StsBadArg, "Unknown descriptor type" );
+}
 
- feature[16] = cv::sum( imgCh[1]( roi ) )[0] / ( 2 * patchRadius );
- feature[17] = cv::sum( imgCh[2]( roi ) )[0] / ( 2 * patchRadius );
+void GPCDetails::getCoordinatesFromIndex( size_t index, Size sz, int &x, int &y )
+{
+ const size_t stride = sz.width - patchRadius * 2;
+ y = int( index / stride );
+ x = int( index - y * stride + patchRadius );
+ y += patchRadius;
 }
 
 bool GPCTree::trainNode( size_t nodeId, SIter begin, SIter end, unsigned depth )
 {
- if ( std::distance( begin, end ) < minNumberOfSamples )
+ const int nSamples = (int)std::distance( begin, end );
+
+ if ( nSamples < params.minNumberOfSamples || depth >= params.maxTreeDepth )
 return false;
 
 if ( nodeId >= nodes.size() )
@@ -200,6 +525,7 @@ bool GPCTree::trainNode( size_t nodeId, SIter begin, SIter end, unsigned depth )
 // Select the best hyperplane
 unsigned globalBestScore = 0;
 std::vector< double > values;
+ values.reserve( nSamples * 2 );
 
 for ( int j = 0; j < globalIters; ++j )
 { // Global search step
@@ -209,50 +535,81 @@ bool GPCTree::trainNode( size_t nodeId, SIter begin, SIter end, unsigned depth )
 for ( int i = 0; i < localIters; ++i )
 { // Local search step
- double randomModification = getRandomCauchyScalar();
+ double randomModification = getRandomCauchyScalar() * ( 1.0 + sigmaGrowthRate * int( i / GPCPatchDescriptor::nFeatures ) );
 const int pos = i % GPCPatchDescriptor::nFeatures;
 std::swap( coef[pos], randomModification );
 
 values.clear();
 for ( SIter iter = begin; iter != end; ++iter )
- {
- values.push_back( coef.dot( iter->first.feature ) );
- values.push_back( coef.dot( iter->second.feature ) );
- }
+ values.push_back( iter->ref.dot( coef ) );
+
+ std::nth_element( values.begin(), values.begin() + nSamples / 2, values.end() );
+ double median = values[nSamples / 2];
+
+ // Skip an obviously degenerate split. This can happen when a large number of samples have
+ // equal projections; it is unlikely for samples collected from a good dataset, but it does
+ // occur when the dataset contains plain (or close to plain) images.
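+ // (If more than roughly a quarter of the projections fall within epsTolerance of the
+ // median, no hyperplane through the median separates them meaningfully, so the candidate
+ // is rejected.)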
+ if ( std::count_if( values.begin(), values.end(), CompareWithTolerance( median ) ) > std::max( 1, nSamples / 4 ) ) + continue; - std::nth_element( values.begin(), values.begin() + values.size() / 2, values.end() ); - const double median = values[values.size() / 2]; - unsigned correct = 0; + median = getRobustMedian( median ); + unsigned score = 0; for ( SIter iter = begin; iter != end; ++iter ) { - const bool direction = ( coef.dot( iter->first.feature ) < median ); - if ( direction == ( coef.dot( iter->second.feature ) < median ) ) - ++correct; + bool refdir, posdir, negdir; + iter->getDirections( refdir, posdir, negdir, coef, median ); + if ( refdir == posdir ) + score += scoreGainPos; + if ( refdir != negdir ) + score += scoreGainNeg; } - if ( correct > localBestScore ) - localBestScore = correct; + if ( score > localBestScore ) + localBestScore = score; else - coef[pos] = randomModification; + { + const double beta = simulatedAnnealingTemperatureCoef * std::sqrt( i ) / ( nSamples * ( scoreGainPos + scoreGainNeg ) ); + if ( rng.uniform( 0.0, 1.0 ) > std::exp( -beta * ( localBestScore - score) ) ) + coef[pos] = randomModification; + } - if ( correct > globalBestScore ) + if ( score > globalBestScore ) { - globalBestScore = correct; + globalBestScore = score; node.coef = coef; node.rhs = median; - - /*if ( debugOutput ) - { - printf( "[%u] Updating weights: correct %.2f (%u/%ld)\n", depth, double( correct ) / std::distance( begin, end ), correct, - std::distance( begin, end ) ); - for ( unsigned k = 0; k < GPCPatchDescriptor::nFeatures; ++k ) - printf( "%.3f ", coef[k] ); - printf( "\n" ); - }*/ } } } + + if ( globalBestScore == 0 ) + return false; + + if ( params.printProgress ) + { + const int maxScore = nSamples * ( scoreGainPos + scoreGainNeg ); + const double correctRatio = double( globalBestScore ) / maxScore; + printf( "[%u] Correct %.2f (%u/%d)\nWeights:", depth, correctRatio, globalBestScore, maxScore ); + for ( unsigned k = 0; k < GPCPatchDescriptor::nFeatures; ++k ) + printf( " %.3f", node.coef[k] ); + printf( "\n" ); + } + + for ( SIter iter = begin; iter != end; ++iter ) + { + bool refdir, posdir, negdir; + iter->getDirections( refdir, posdir, negdir, node.coef, node.rhs ); + // We shouldn't account for positive sample in the scoring in case it was separated before. So mark it as separated. + // After all, we can't bring back samples which were separated from reference on early levels. + if ( refdir != posdir ) + iter->pos.markAsSeparated(); + // The same for negative sample. + if ( refdir != negdir ) + iter->neg.markAsSeparated(); + // If both positive and negative were separated before then such triplet doesn't make sense on deeper levels. We discard it. + } + // Partition vector with samples according to the hyperplane in QuickSort-like manner. // Unlike QuickSort, we need to partition it into 3 parts (left subtree samples; undefined samples; right subtree // samples), so we call it two times. @@ -260,16 +617,21 @@ bool GPCTree::trainNode( size_t nodeId, SIter begin, SIter end, unsigned depth ) SIter rightBegin = std::partition( leftEnd, end, PartitionPredicate2( node.coef, node.rhs ) ); // Separate undefined samples from right subtree samples. - node.left = ( trainNode( nodeId * 2 + 1, begin, leftEnd, depth + 1 ) ) ? unsigned(nodeId * 2 + 1) : 0; - node.right = ( trainNode( nodeId * 2 + 2, rightBegin, end, depth + 1 ) ) ? unsigned(nodeId * 2 + 2) : 0; + node.left = ( trainNode( nodeId * 2 + 1, begin, leftEnd, depth + 1 ) ) ? 
unsigned( nodeId * 2 + 1 ) : 0;
+ node.right = ( trainNode( nodeId * 2 + 2, rightBegin, end, depth + 1 ) ) ? unsigned( nodeId * 2 + 2 ) : 0;
 
 return true;
 }
 
-void GPCTree::train( GPCSamplesVector &samples )
+void GPCTree::train( GPCTrainingSamples &samples, const GPCTrainingParams _params )
 {
+ if ( _params.descriptorType != samples.type() )
+ CV_Error( CV_StsBadArg, "Descriptor type mismatch! Check that samples are collected with the same descriptor type." );
+
 nodes.clear();
 nodes.reserve( samples.size() * 2 - 1 ); // set upper bound for the possible number of nodes so all subsequent resize() will be no-op
- trainNode( 0, samples.begin(), samples.end(), 0 );
+ params = _params;
+ GPCSamplesVector &sv = samples;
+ trainNode( 0, sv.begin(), sv.end(), 0 );
 }
 
 void GPCTree::write( FileStorage &fs ) const
@@ -277,17 +639,39 @@ void GPCTree::write( FileStorage &fs ) const
 if ( nodes.empty() )
 CV_Error( CV_StsBadArg, "Tree have not been trained" );
 fs << "nodes" << nodes;
+ fs << "dtype" << (int)params.descriptorType;
+}
+
+void GPCTree::read( const FileNode &fn )
+{
+ fn["nodes"] >> nodes;
+ fn["dtype"] >> (int &)params.descriptorType;
 }
 
-void GPCTree::read( const FileNode &fn ) { fn["nodes"] >> nodes; }
+unsigned GPCTree::findLeafForPatch( const GPCPatchDescriptor &descr ) const
+{
+ unsigned id = 0, prevId;
+ do
+ {
+ prevId = id;
+ if ( descr.dot( nodes[id].coef ) < nodes[id].rhs )
+ id = nodes[id].right;
+ else
+ id = nodes[id].left;
+ } while ( id );
+ return prevId;
+}
 
 Ptr< GPCTrainingSamples > GPCTrainingSamples::create( const std::vector< String > &imagesFrom, const std::vector< String > &imagesTo,
- const std::vector< String > &gt )
+ const std::vector< String > &gt, int _descriptorType )
 {
 CV_Assert( imagesFrom.size() == imagesTo.size() );
 CV_Assert( imagesFrom.size() == gt.size() );
 
 Ptr< GPCTrainingSamples > ts = makePtr< GPCTrainingSamples >();
+
+ ts->descriptorType = _descriptorType;
+
 for ( size_t i = 0; i < imagesFrom.size(); ++i )
 {
 Mat from = imread( imagesFrom[i] );
@@ -304,12 +688,69 @@ Ptr< GPCTrainingSamples > GPCTrainingSamples::create( const std::vector< String
 cvtColor( from, from, COLOR_BGR2YCrCb );
 cvtColor( to, to, COLOR_BGR2YCrCb );
 
- getTrainingSamples( from, to, gtFlow, ts->samples );
+ getTrainingSamples( from, to, gtFlow, ts->samples, ts->descriptorType );
 }
 
 return ts;
 }
 
+Ptr< GPCTrainingSamples > GPCTrainingSamples::create( InputArrayOfArrays imagesFrom, InputArrayOfArrays imagesTo,
+ InputArrayOfArrays gt, int _descriptorType )
+{
+ CV_Assert( imagesFrom.total() == imagesTo.total() );
+ CV_Assert( imagesFrom.total() == gt.total() );
+
+ Ptr< GPCTrainingSamples > ts = makePtr< GPCTrainingSamples >();
+
+ ts->descriptorType = _descriptorType;
+
+ for ( size_t i = 0; i < imagesFrom.total(); ++i )
+ {
+ Mat from = imagesFrom.getMat( static_cast<int>( i ) );
+ Mat to = imagesTo.getMat( static_cast<int>( i ) );
+ Mat gtFlow = gt.getMat( static_cast<int>( i ) );
+
+ CV_Assert( from.size == to.size );
+ CV_Assert( from.size == gtFlow.size );
+ CV_Assert( from.channels() == 3 );
+ CV_Assert( to.channels() == 3 );
+
+ from.convertTo( from, CV_32FC3 );
+ to.convertTo( to, CV_32FC3 );
+ cvtColor( from, from, COLOR_BGR2YCrCb );
+ cvtColor( to, to, COLOR_BGR2YCrCb );
+
+ getTrainingSamples( from, to, gtFlow, ts->samples, ts->descriptorType );
+ }
+
+ return ts;
+}
+
+void GPCDetails::dropOutliers( std::vector< std::pair< Point2i, Point2i > > &corr )
+{
+ std::vector< float > mag( corr.size() );
+
+ for ( size_t i = 0; i < corr.size(); ++i )
+ mag[i] = normL2Sqr( corr[i].first - corr[i].second );
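+ // Assuming thresholdOutliers is a fraction in (0, 1): nth_element below puts the matching
+ // displacement percentile at mag[threshold], and every correspondence whose squared
+ // displacement exceeds that percentile is discarded.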
+
+ const size_t threshold = size_t( mag.size() * thresholdOutliers );
+ std::nth_element( mag.begin(), mag.begin() + threshold, mag.end() );
+ const float percentile = mag[threshold];
+ size_t i = 0, j = 0;
+
+ while ( i < corr.size() )
+ {
+ if ( normL2Sqr( corr[i].first - corr[i].second ) <= percentile )
+ {
+ corr[j] = corr[i];
+ ++j;
+ }
+ ++i;
+ }
+
+ corr.resize( j );
+}
+
 } // namespace optflow
 
 void write( FileStorage &fs, const String &name, const optflow::GPCTree::Node &node )
diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp
index 2125f3fd1..c47ff90fc 100644
--- a/modules/optflow/test/test_OF_accuracy.cpp
+++ b/modules/optflow/test/test_OF_accuracy.cpp
@@ -49,8 +49,16 @@ using namespace optflow;
 
 static string getDataDir() { return TS::ptr()->get_data_path(); }
 
+static string getRubberWhaleFrame1() { return getDataDir() + "optflow/RubberWhale1.png"; }
+
+static string getRubberWhaleFrame2() { return getDataDir() + "optflow/RubberWhale2.png"; }
+
+static string getRubberWhaleGroundTruth() { return getDataDir() + "optflow/RubberWhale.flo"; }
+
 static bool isFlowCorrect(float u) { return !cvIsNaN(u) && (fabs(u) < 1e9); }
 
+static bool isFlowCorrect(double u) { return !cvIsNaN(u) && (fabs(u) < 1e9); }
+
 static float calcRMSE(Mat flow1, Mat flow2)
 {
 float sum = 0;
@@ -80,11 +88,36 @@ static float calcRMSE(Mat flow1, Mat flow2)
 return (float)sqrt(sum / (1e-9 + counter));
 }
 
+static float calcAvgEPE(vector< pair<Point2i, Point2i> > corr, Mat flow)
+{
+ double sum = 0;
+ int counter = 0;
+
+ for (size_t i = 0; i < corr.size(); ++i)
+ {
+ Vec2f flow1_at_point = Point2f(corr[i].second - corr[i].first);
+ Vec2f flow2_at_point = flow.at<Vec2f>(corr[i].first.y, corr[i].first.x);
+
+ double u1 = (double)flow1_at_point[0];
+ double v1 = (double)flow1_at_point[1];
+ double u2 = (double)flow2_at_point[0];
+ double v2 = (double)flow2_at_point[1];
+
+ if (isFlowCorrect(u1) && isFlowCorrect(u2) && isFlowCorrect(v1) && isFlowCorrect(v2))
+ {
+ sum += sqrt((u1 - u2) * (u1 - u2) + (v1 - v2) * (v1 - v2));
+ counter++;
+ }
+ }
+
+ return (float)(sum / counter);
+}
+
 bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT)
 {
- const string frame1_path = getDataDir() + "optflow/RubberWhale1.png";
- const string frame2_path = getDataDir() + "optflow/RubberWhale2.png";
- const string gt_flow_path = getDataDir() + "optflow/RubberWhale.flo";
+ const string frame1_path = getRubberWhaleFrame1();
+ const string frame2_path = getRubberWhaleFrame2();
+ const string gt_flow_path = getRubberWhaleGroundTruth();
 
 dst_frame_1 = imread(frame1_path);
 dst_frame_2 = imread(frame2_path);
@@ -188,3 +221,65 @@ TEST(DenseOpticalFlow_VariationalRefinement, ReferenceAccuracy)
 ASSERT_EQ(GT.cols, flow.cols);
 EXPECT_LE(calcRMSE(GT, flow), target_RMSE);
 }
+
+TEST(DenseOpticalFlow_PCAFlow, ReferenceAccuracy)
+{
+ Mat frame1, frame2, GT;
+ ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));
+ const float target_RMSE = 0.55f;
+
+ Mat flow;
+ Ptr<DenseOpticalFlow> algo = createOptFlow_PCAFlow();
+ algo->calc(frame1, frame2, flow);
+ ASSERT_EQ(GT.rows, flow.rows);
+ ASSERT_EQ(GT.cols, flow.cols);
+ EXPECT_LE(calcRMSE(GT, flow), target_RMSE);
+}
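+// GPCTrainingParams below is assumed to follow the constructor order declared in
+// sparse_matching_gpc.hpp: ( maxTreeDepth, minNumberOfSamples, descriptorType, printProgress ).
+// E.g. GPCTrainingParams(8, 3, GPC_DESCRIPTOR_DCT, false) grows trees of depth at most 8 and
+// stops splitting nodes that hold fewer than 3 triplets.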
+
+TEST(DenseOpticalFlow_GlobalPatchColliderDCT, ReferenceAccuracy)
+{
+ Mat frame1, frame2, GT;
+ ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));
+
+ const Size sz = frame1.size() / 2;
+ frame1 = frame1(Rect(0, 0, sz.width, sz.height));
+ frame2 = frame2(Rect(0, 0, sz.width, sz.height));
+ GT = GT(Rect(0, 0, sz.width, sz.height));
+
+ vector<Mat> img1, img2, gt;
+ vector< pair<Point2i, Point2i> > corr;
+ img1.push_back(frame1);
+ img2.push_back(frame2);
+ gt.push_back(GT);
+
+ Ptr< GPCForest<5> > forest = GPCForest<5>::create();
+ forest->train(img1, img2, gt, GPCTrainingParams(8, 3, GPC_DESCRIPTOR_DCT, false));
+ forest->findCorrespondences(frame1, frame2, corr);
+
+ ASSERT_LE(7500U, corr.size());
+ ASSERT_LE(calcAvgEPE(corr, GT), 0.5f);
+}
+
+TEST(DenseOpticalFlow_GlobalPatchColliderWHT, ReferenceAccuracy)
+{
+ Mat frame1, frame2, GT;
+ ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));
+
+ const Size sz = frame1.size() / 2;
+ frame1 = frame1(Rect(0, 0, sz.width, sz.height));
+ frame2 = frame2(Rect(0, 0, sz.width, sz.height));
+ GT = GT(Rect(0, 0, sz.width, sz.height));
+
+ vector<Mat> img1, img2, gt;
+ vector< pair<Point2i, Point2i> > corr;
+ img1.push_back(frame1);
+ img2.push_back(frame2);
+ gt.push_back(GT);
+
+ Ptr< GPCForest<5> > forest = GPCForest<5>::create();
+ forest->train(img1, img2, gt, GPCTrainingParams(8, 3, GPC_DESCRIPTOR_WHT, false));
+ forest->findCorrespondences(frame1, frame2, corr);
+
+ ASSERT_LE(7000U, corr.size());
+ ASSERT_LE(calcAvgEPE(corr, GT), 0.5f);
+}
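
For reference, a minimal end-to-end sketch of the GPC matching API added above. This is a
usage sketch only: it assumes the declarations in sparse_matching_gpc.hpp
(GPCForest<T>::create(), train(), findCorrespondences()) and cv::Algorithm::load() for
deserialization; the image paths and the forest file name are placeholders.

    #include "opencv2/imgcodecs.hpp"
    #include "opencv2/optflow/sparse_matching_gpc.hpp"

    using namespace cv;
    using namespace cv::optflow;

    int main()
    {
      Mat from = imread( "frame_0001.png" ); // placeholder frame pair
      Mat to = imread( "frame_0002.png" );

      // Load a forest trained beforehand (e.g. with the gpc_train sample), or train one
      // here via GPCForest<5>::create() + train() given ground truth flow.
      Ptr< GPCForest<5> > forest = Algorithm::load< GPCForest<5> >( "forest.yml.gz" );

      // Collect sparse matches between the two frames.
      std::vector< std::pair< Point2i, Point2i > > corr;
      forest->findCorrespondences( from, to, corr );

      // corr now holds sparse matches suitable as input for sparse-to-dense
      // interpolation (see the pcaflow_demo sample added by this patch).
      return 0;
    }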