Merge pull request #3 from Itseez/master

Update

Branch: pull/331/head
Author: Vladimir, 10 years ago
Commit: a80308fd44
75 changed files:

  1. modules/README.md (2 changes)
  2. modules/datasets/include/opencv2/datasets/or_pascal.hpp (102 changes)
  3. modules/datasets/include/opencv2/datasets/pd_inria.hpp (96 changes)
  4. modules/datasets/samples/or_pascal.cpp (112 changes)
  5. modules/datasets/samples/pd_inria.cpp (94 changes)
  6. modules/datasets/src/or_pascal.cpp (214 changes)
  7. modules/datasets/src/pd_inria.cpp (203 changes)
  8. modules/stereo/CMakeLists.txt (2 changes)
  9. modules/stereo/README.md (2 changes)
  10. modules/stereo/include/opencv2/stereo.hpp (262 changes)
  11. modules/stereo/include/opencv2/stereo/stereo.hpp (49 changes)
  12. modules/stereo/src/precomp.hpp (59 changes)
  13. modules/stereo/src/stereo_binary_bm.cpp (738 changes)
  14. modules/stereo/src/stereo_binary_sgbm.cpp (958 changes)
  15. modules/stereo/test/test_main.cpp (45 changes)
  16. modules/stereo/test/test_precomp.hpp (31 changes)
  17. modules/surface_matching/samples/ppf_load_match.cpp (19 changes)
  18. modules/surface_matching/src/ppf_match_3d.cpp (4 changes)
  19. modules/text/include/opencv2/text/ocr.hpp (113 changes)
  20. modules/text/samples/OCRBeamSearch_CNN_model_data.xml.gz (binary)
  21. modules/text/samples/cropped_word_recognition.cpp (66 changes)
  22. modules/text/samples/scenetext_word01.jpg (binary)
  23. modules/text/samples/scenetext_word02.jpg (binary)
  24. modules/text/src/erfilter.cpp (6 changes)
  25. modules/text/src/ocr_beamsearch_decoder.cpp (656 changes)
  26. modules/text/src/ocr_tesseract.cpp (2 changes)
  27. modules/tracking/doc/[Tutorial] Adding new Tracker Method for dummies (135 changes)
  28. modules/tracking/doc/tracking.bib (26 changes)
  29. modules/tracking/include/opencv2/tracking/tracker.hpp (54 changes)
  30. modules/tracking/samples/kcf.cpp (194 changes)
  31. modules/tracking/samples/tld_test.cpp (4 changes)
  32. modules/tracking/src/featureColorName.cpp (32816 changes)
  33. modules/tracking/src/opencl/tldDetector.cl (133 changes)
  34. modules/tracking/src/precomp.hpp (6 changes)
  35. modules/tracking/src/tldDetector.cpp (575 changes)
  36. modules/tracking/src/tldDetector.hpp (7 changes)
  37. modules/tracking/src/tldEnsembleClassifier.cpp (4 changes)
  38. modules/tracking/src/tldEnsembleClassifier.hpp (1 change)
  39. modules/tracking/src/tldModel.cpp (114 changes)
  40. modules/tracking/src/tldModel.hpp (7 changes)
  41. modules/tracking/src/tldTracker.cpp (24 changes)
  42. modules/tracking/src/tldTracker.hpp (1 change)
  43. modules/tracking/src/tracker.cpp (1 change)
  44. modules/tracking/src/trackerKCF.cpp (686 changes)
  45. modules/xfeatures2d/include/opencv2/xfeatures2d.hpp (8 changes)
  46. modules/xfeatures2d/src/daisy.cpp (2 changes)
  47. modules/ximgproc/doc/ximgproc.bib (22 changes)
  48. modules/ximgproc/include/opencv2/ximgproc.hpp (1 change)
  49. modules/ximgproc/include/opencv2/ximgproc/disparity_filter.hpp (146 changes)
  50. modules/ximgproc/include/opencv2/ximgproc/edge_filter.hpp (60 changes)
  51. modules/ximgproc/perf/perf_disparity_wls_filter.cpp (169 changes)
  52. modules/ximgproc/perf/perf_fgs_filter.cpp (81 changes)
  53. modules/ximgproc/samples/disparity_filtering.cpp (459 changes)
  54. modules/ximgproc/src/disparity_filters.cpp (466 changes)
  55. modules/ximgproc/src/fgs_filter.cpp (694 changes)
  56. modules/ximgproc/src/seeds.cpp (4 changes)
  57. modules/ximgproc/src/structured_edge_detection.cpp (4 changes)
  58. modules/ximgproc/src/wls_filter.cpp (284 changes)
  59. modules/ximgproc/test/test_disparity_wls_filter.cpp (154 changes)
  60. modules/ximgproc/test/test_fgs_filter.cpp (153 changes)
  61. modules/ximgproc/testdata/disparityfilter/GT.png (binary)
  62. modules/ximgproc/testdata/disparityfilter/ROI.xml (7 changes)
  63. modules/ximgproc/testdata/disparityfilter/disparity_left_raw.png (binary)
  64. modules/ximgproc/testdata/disparityfilter/disparity_right_raw.png (binary)
  65. modules/ximgproc/testdata/disparityfilter/left_view.png (binary)
  66. modules/ximgproc/testdata/disparityfilter/reference_accuracy.xml (7 changes)
  67. modules/ximgproc/testdata/edgefilter/fgs/kodim23_lambda=1000_sigma=10.png (binary)
  68. modules/xphoto/include/opencv2/xphoto.hpp (2 changes)
  69. modules/xphoto/include/opencv2/xphoto/white_balance.hpp (32 changes)
  70. modules/xphoto/perf/perf_grayworld.cpp (29 changes)
  71. modules/xphoto/perf/perf_main.cpp (3 changes)
  72. modules/xphoto/perf/perf_precomp.hpp (19 changes)
  73. modules/xphoto/samples/grayworld_color_balance.cpp (62 changes)
  74. modules/xphoto/src/grayworld_white_balance.cpp (213 changes)
  75. modules/xphoto/test/test_grayworld.cpp (90 changes)

modules/README.md
@@ -51,3 +51,5 @@ $ cmake -D OPENCV_EXTRA_MODULES_PATH=<opencv_contrib>/modules -D BUILD_opencv_re
21. **opencv_xobjdetect**: Integral Channel Features Detector Framework.
22. **opencv_xphoto**: Additional photo processing algorithms: Color balance / Denoising / Inpainting.
23. **opencv_stereo**: Stereo Correspondence done with different descriptors: Census / CS-Census / MCT / BRIEF / MV / RT.

modules/datasets/include/opencv2/datasets/or_pascal.hpp
@@ -0,0 +1,102 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Itseez Inc or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DATASETS_VOC_PASCAL_HPP
#define OPENCV_DATASETS_VOC_PASCAL_HPP
#include <string>
#include <vector>
#include "opencv2/datasets/dataset.hpp"
#include <opencv2/core.hpp>
namespace cv
{
namespace datasets
{
//! @addtogroup datasets_or
//! @{
struct PascalPart: public Object
{
std::string name;
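// bounding box corners from the VOC <bndbox> element, in pixels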
int xmin;
int ymin;
int xmax;
int ymax;
};
struct PascalObj: public PascalPart
{
std::string pose;
bool truncated;
bool difficult;
bool occluded;
std::vector<PascalPart> parts;
};
struct OR_pascalObj : public Object
{
std::string filename;
int width;
int height;
int depth;
std::vector<PascalObj> objects;
};
class CV_EXPORTS OR_pascal : public Dataset
{
public:
virtual void load(const std::string &path) = 0;
static Ptr<OR_pascal> create();
};
//! @}
}// namespace datasets
}// namespace cv
#endif

modules/datasets/include/opencv2/datasets/pd_inria.hpp
@@ -0,0 +1,96 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Itseez Inc or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DATASETS_PD_INRIA_HPP
#define OPENCV_DATASETS_PD_INRIA_HPP
#include <string>
#include <vector>
#include "opencv2/datasets/dataset.hpp"
#include <opencv2/core.hpp>
namespace cv
{
namespace datasets
{
//! @addtogroup datasets_pd
//! @{
enum sampleType
{
POS = 0,
NEG = 1
};
struct PD_inriaObj : public Object
{
// image file name
std::string filename;
// positive or negative
sampleType sType;
// image size
int width;
int height;
int depth;
// bounding boxes
std::vector< Rect > bndboxes;
};
class CV_EXPORTS PD_inria : public Dataset
{
public:
virtual void load(const std::string &path) = 0;
static Ptr<PD_inria> create();
};
//! @}
}// namespace datasets
}// namespace cv
#endif

modules/datasets/samples/or_pascal.cpp
@@ -0,0 +1,112 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Itseez Inc or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/datasets/or_pascal.hpp"
#include <opencv2/core.hpp>
#include <cstdio>
#include <cstdlib> // atoi
#include <string>
#include <vector>
#include <set>
using namespace std;
using namespace cv;
using namespace cv::datasets;
int main(int argc, char *argv[])
{
const char *keys =
"{ help h usage ? | | show this message }"
"{ path p |true| path to folder with dataset }";
CommandLineParser parser(argc, argv, keys);
string path(parser.get<string>("path"));
if (parser.has("help") || path=="true")
{
parser.printMessage();
return -1;
}
Ptr<OR_pascal> dataset = OR_pascal::create();
dataset->load(path);
// Print train/test/validation size and first example
OR_pascalObj *example;
vector< Ptr<Object> > &train = dataset->getTrain();
printf("\ntrain:\nsize: %u", (unsigned int)train.size());
example = static_cast<OR_pascalObj *>(train[0].get());
printf("\nfirst image: \n%s", example->filename.c_str());
printf("\nsize:");
printf("\n - width: %d", example->width);
printf("\n - height: %d", example->height);
printf("\n - depth: %d", example->depth);
for (unsigned int i = 0; i < example->objects.size(); i++)
{
printf("\nobject %d", i);
printf("\nname: %s", example->objects[i].name.c_str());
printf("\npose: %s", example->objects[i].pose.c_str());
printf("\ntruncated: %d", example->objects[i].truncated);
printf("\ndifficult: %d", example->objects[i].difficult);
printf("\noccluded: %d", example->objects[i].occluded);
printf("\nbounding box:");
printf("\n - xmin: %d", example->objects[i].xmin);
printf("\n - ymin: %d", example->objects[i].ymin);
printf("\n - xmax: %d", example->objects[i].xmax);
printf("\n - ymax: %d", example->objects[i].ymax);
}
vector< Ptr<Object> > &test = dataset->getTest();
printf("\ntest:\nsize: %u", (unsigned int)test.size());
example = static_cast<OR_pascalObj *>(test[0].get());
printf("\nfirst image: \n%s", example->filename.c_str());
vector< Ptr<Object> > &validation = dataset->getValidation();
printf("\nvalidation:\nsize: %u", (unsigned int)validation.size());
example = static_cast<OR_pascalObj *>(validation[0].get());
printf("\nfirst image: \n%s\n", example->filename.c_str());
return 0;
}

modules/datasets/samples/pd_inria.cpp
@@ -0,0 +1,94 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Itseez Inc or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/datasets/pd_inria.hpp"
#include <opencv2/core.hpp>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
using namespace cv;
using namespace cv::datasets;
int main(int argc, char *argv[])
{
const char *keys =
"{ help h usage ? | | show this message }"
"{ path p |true| path to dataset }";
CommandLineParser parser(argc, argv, keys);
string path(parser.get<string>("path"));
if (parser.has("help") || path=="true")
{
parser.printMessage();
return -1;
}
Ptr<PD_inria> dataset = PD_inria::create();
dataset->load(path);
cout << "train size: " << (unsigned int)dataset->getTrain().size() << endl;
PD_inriaObj *example = static_cast<PD_inriaObj *>(dataset->getTrain()[0].get());
cout << "first train object: " << endl;
cout << "name: " << example->filename << endl;
// image size
cout << "image size: " << endl;
cout << " - width: " << example->width << endl;
cout << " - height: " << example->height << endl;
cout << " - depth: " << example->depth << endl;
// bounding boxes
for (unsigned int i = 0; i < (example->bndboxes).size(); i++)
{
cout << "object " << i << endl;
int x = (example->bndboxes)[i].x;
int y = (example->bndboxes)[i].y;
cout << " - xmin = " << x << endl;
cout << " - ymin = " << y << endl;
cout << " - xmax = " << (example->bndboxes)[i].width + x << endl;
cout << " - ymax = " << (example->bndboxes)[i].height + y<< endl;
}
return 0;
}

modules/datasets/src/or_pascal.cpp
@@ -0,0 +1,214 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Itseez Inc or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/datasets/or_pascal.hpp"
#include "opencv2/datasets/util.hpp"
#include <opencv2/datasets/tinyxml2/tinyxml2.h>
#include <fstream>
namespace cv
{
namespace datasets
{
using namespace std;
using namespace tinyxml2;
class OR_pascalImp : public OR_pascal
{
public:
OR_pascalImp() {}
virtual void load(const string &path);
private:
void loadDataset(const string &path, const string &nameImageSet, vector< Ptr<Object> > &imageSet);
Ptr<Object> parseAnnotation(const string path, const string id);
const char* parseNodeText(XMLElement* node, const string &nodeName, const string &defaultValue);
};
void OR_pascalImp::load(const string &path)
{
train.push_back(vector< Ptr<Object> >());
test.push_back(vector< Ptr<Object> >());
validation.push_back(vector< Ptr<Object> >());
loadDataset(path, "train", train.back());
loadDataset(path, "test", test.back());
loadDataset(path, "val", validation.back());
}
void OR_pascalImp::loadDataset(const string &path, const string &nameImageSet, vector< Ptr<Object> > &imageSet)
{
string pathImageSets(path + "ImageSets/Main/");
string imageList = pathImageSets + nameImageSet + ".txt";
ifstream in(imageList.c_str());
string error_message = format("Image list not exists!\n%s", imageList.c_str());
if (!in.is_open())
CV_Error(Error::StsBadArg, error_message);
string id = "";
while( getline(in, id) )
{
if( strcmp(nameImageSet.c_str(), "test") == 0 ) // test set ground truth is not available
{
Ptr<OR_pascalObj> annotation(new OR_pascalObj);
annotation->filename = path + "JPEGImages/" + id + ".jpg";
imageSet.push_back(annotation);
}
else
{
imageSet.push_back(parseAnnotation(path, id));
}
}
}
const char* OR_pascalImp::parseNodeText(XMLElement* node, const string &nodeName, const string &defaultValue)
{
// Guard against a missing child element (previously GetText() was called on a
// possibly null pointer); taking the strings by const reference also keeps the
// returned defaultValue.c_str() pointer valid at the call site.
XMLElement* child = node->FirstChildElement(nodeName.c_str());
const char* e = child ? child->GetText() : 0;
if( e != 0 )
return e;
else
return defaultValue.c_str();
}
Ptr<Object> OR_pascalImp::parseAnnotation(const string path, const string id)
{
string pathAnnotations(path + "Annotations/");
string pathImages(path + "JPEGImages/");
Ptr<OR_pascalObj> annotation(new OR_pascalObj);
XMLDocument doc;
string xml_file = pathAnnotations + id + ".xml";
XMLError error_code = doc.LoadFile(xml_file.c_str());
string error_message = format("Parsing XML failed. Error code = %d. \nFile = %s", error_code, xml_file.c_str());
switch (error_code)
{
case XML_SUCCESS:
break;
case XML_ERROR_FILE_NOT_FOUND:
error_message = "XML file not found! " + error_message;
CV_Error(Error::StsParseError, error_message);
return annotation;
default:
CV_Error(Error::StsParseError, error_message);
break;
}
// <annotation/>
XMLElement *xml_ann = doc.RootElement();
// <filename/>
string img_name = xml_ann->FirstChildElement("filename")->GetText();
annotation->filename = pathImages + img_name;
// <size/>
XMLElement *sz = xml_ann->FirstChildElement("size");
int width = atoi(sz->FirstChildElement("width")->GetText());
int height = atoi(sz->FirstChildElement("height")->GetText());
int depth = atoi(sz->FirstChildElement("depth")->GetText());
annotation->width = width;
annotation->height = height;
annotation->depth = depth;
// <object/>
vector<PascalObj> objects;
XMLElement *xml_obj = xml_ann->FirstChildElement("object");
while (xml_obj)
{
PascalObj pascal_obj;
pascal_obj.name = xml_obj->FirstChildElement("name")->GetText();
pascal_obj.pose = parseNodeText(xml_obj, "pose", "Unspecified");
pascal_obj.truncated = atoi(parseNodeText(xml_obj, "truncated", "0")) > 0;
pascal_obj.difficult = atoi(parseNodeText(xml_obj, "difficult", "0")) > 0;
pascal_obj.occluded = atoi(parseNodeText(xml_obj, "occluded", "0")) > 0;
// <bndbox/>
XMLElement *xml_bndbox = xml_obj->FirstChildElement("bndbox");
pascal_obj.xmin = atoi(xml_bndbox->FirstChildElement("xmin")->GetText());
pascal_obj.ymin = atoi(xml_bndbox->FirstChildElement("ymin")->GetText());
pascal_obj.xmax = atoi(xml_bndbox->FirstChildElement("xmax")->GetText());
pascal_obj.ymax = atoi(xml_bndbox->FirstChildElement("ymax")->GetText());
// <part/>
vector<PascalPart> parts;
XMLElement *xml_part = xml_obj->FirstChildElement("part");
while (xml_part)
{
PascalPart pascal_part;
pascal_part.name = xml_part->FirstChildElement("name")->GetText();
xml_bndbox = xml_part->FirstChildElement("bndbox");
pascal_part.xmin = atoi(xml_bndbox->FirstChildElement("xmin")->GetText());
pascal_part.ymin = atoi(xml_bndbox->FirstChildElement("ymin")->GetText());
pascal_part.xmax = atoi(xml_bndbox->FirstChildElement("xmax")->GetText());
pascal_part.ymax = atoi(xml_bndbox->FirstChildElement("ymax")->GetText());
parts.push_back(pascal_part);
xml_part = xml_part->NextSiblingElement("part");
}
pascal_obj.parts = parts;
objects.push_back(pascal_obj);
xml_obj = xml_obj->NextSiblingElement("object");
}
annotation->objects = objects;
return annotation;
}
Ptr<OR_pascal> OR_pascal::create()
{
return Ptr<OR_pascalImp>(new OR_pascalImp);
}
}// namespace datasets
}// namespace cv

modules/datasets/src/pd_inria.cpp
@@ -0,0 +1,203 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Itseez Inc or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/datasets/pd_inria.hpp"
#include "opencv2/datasets/util.hpp"
namespace cv
{
namespace datasets
{
using namespace std;
class PD_inriaImp : public PD_inria
{
public:
PD_inriaImp() {}
virtual ~PD_inriaImp() {}
virtual void load(const string &path);
private:
void loadDataset(const string &path, const string nameImageSet, vector< Ptr<Object> > &imageSet);
void readTextLines(const string filename, vector< string > &lines);
void parseAnnotation(const string filename, Ptr< PD_inriaObj > &object);
};
void PD_inriaImp::load(const string &path)
{
// Training set
train.push_back(vector< Ptr<Object> >());
loadDataset(path, "Train", train.back());
// Testing set
test.push_back(vector< Ptr<Object> >());
loadDataset(path, "Test", test.back());
// There is no validation set
validation.push_back(vector< Ptr<Object> >());
}
void PD_inriaImp::readTextLines(const string filename, vector< string > &lines)
{
ifstream in(filename.c_str());
string error_message = "";
if (!in.is_open())
{
error_message = format("Unable to open file: \n%s\n", filename.c_str());
CV_Error(Error::StsBadArg, error_message);
}
string currline = "";
while (getline(in, currline))
lines.push_back(currline);
}
void PD_inriaImp::parseAnnotation(const string filename, Ptr< PD_inriaObj > &object)
{
string error_message = "";
ifstream in(filename.c_str());
if (!in.is_open())
{
error_message = format("Unable to open file: \n%s\n", filename.c_str());
CV_Error(Error::StsBadArg, error_message);
}
string imageSizeHeader = "Image size (X x Y x C) : ";
string imageSizeFmt = imageSizeHeader + "%d x %d x %d";
string objWithGTHeader = "Objects with ground truth : ";
string objWithGTFmt = objWithGTHeader + "%d { \"PASperson\" }";
string boundBoxHeader = "Bounding box for object ";
string boundBoxFmt = boundBoxHeader + "%*d \"PASperson\" (Xmin, Ymin) - (Xmax, Ymax) : (%d, %d) - (%d, %d)";
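// Illustrative examples of annotation lines these formats are meant to match
// (the values here are hypothetical):
//   Image size (X x Y x C) : 640 x 480 x 3
//   Objects with ground truth : 2 { "PASperson" }
//   Bounding box for object 1 "PASperson" (Xmin, Ymin) - (Xmax, Ymax) : (39, 101) - (126, 286)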
string line = "";
int width = 0;
int height = 0;
int depth = 0;
int xmin, ymin, xmax, ymax;
int numObjects = 0;
while (getline(in, line))
{
if (line[0] == '#' || !line[0])
continue;
if (strstr(line.c_str(), imageSizeHeader.c_str()))
{
sscanf(line.c_str(), imageSizeFmt.c_str(), &width, &height, &depth);
object->width = width;
object->height = height;
object->depth = depth;
}
else if (strstr(line.c_str(), objWithGTHeader.c_str()))
{
sscanf(line.c_str(), objWithGTFmt.c_str(), &numObjects);
if (numObjects <= 0)
break;
}
else if (strstr(line.c_str(), boundBoxHeader.c_str()))
{
sscanf(line.c_str(), boundBoxFmt.c_str(), &xmin, &ymin, &xmax, &ymax);
Rect bndbox;
bndbox.x = xmin;
bndbox.y = ymin;
bndbox.width = xmax - xmin;
bndbox.height = ymax - ymin;
(object->bndboxes).push_back(bndbox);
}
}
CV_Assert((object->bndboxes).size() == (unsigned int)numObjects);
}
void PD_inriaImp::loadDataset(const string &path, const string nameImageSet, vector< Ptr<Object> > &imageSet)
{
string listAnn = path + nameImageSet + "/annotations.lst";
string listPos = path + nameImageSet + "/pos.lst";
string listNeg = path + nameImageSet + "/neg.lst";
vector< string > fsAnn;
vector< string > fsPos;
vector< string > fsNeg;
// read file names
readTextLines(listAnn, fsAnn);
readTextLines(listPos, fsPos);
readTextLines(listNeg, fsNeg);
CV_Assert(fsAnn.size() == fsPos.size());
for (unsigned int i = 0; i < fsPos.size(); i++)
{
Ptr<PD_inriaObj> curr(new PD_inriaObj);
parseAnnotation(path + fsAnn[i], curr);
curr->filename = path + fsPos[i];
curr->sType = POS;
imageSet.push_back(curr);
}
for (unsigned int i = 0; i < fsNeg.size(); i++)
{
Ptr<PD_inriaObj> curr(new PD_inriaObj);
curr->filename = path + fsNeg[i];
curr->sType = NEG;
imageSet.push_back(curr);
}
}
Ptr<PD_inria> PD_inria::create()
{
return Ptr<PD_inriaImp>(new PD_inriaImp);
}
}// namespace datasets
}// namespace cv

modules/stereo/CMakeLists.txt
@@ -0,0 +1,2 @@
set(the_description "Stereo Correspondence")
ocv_define_module(stereo opencv_imgproc opencv_features2d opencv_core opencv_highgui opencv_calib3d)

modules/stereo/README.md
@@ -0,0 +1,2 @@
Stereo Correspondence with different descriptors
================================================

modules/stereo/include/opencv2/stereo.hpp
@@ -0,0 +1,262 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_STEREO_HPP__
#define __OPENCV_STEREO_HPP__
#include "opencv2/core.hpp"
#include "opencv2/features2d.hpp"
#include "opencv2/core/affine.hpp"
/**
@defgroup stereo Stereo Correspondence Algorithms
*/
namespace cv
{
namespace stereo
{
//! @addtogroup stereo
//! @{
// void correctMatches( InputArray F, InputArray points1, InputArray points2,
// OutputArray newPoints1, OutputArray newPoints2 );
/** @brief Filters off small noise blobs (speckles) in the disparity map
@param img The input 16-bit signed disparity image
@param newVal The disparity value used to paint-off the speckles
@param maxSpeckleSize The maximum speckle size to consider it a speckle. Larger blobs are not
affected by the algorithm
@param maxDiff Maximum difference between neighbor disparity pixels to put them into the same
blob. Note that since StereoBM, StereoSGBM and maybe other algorithms return a fixed-point
disparity map, where disparity values are multiplied by 16, this scale factor should be taken into
account when specifying this parameter value.
@param buf The optional temporary buffer to avoid memory allocation within the function.
*/
/** @brief The base class for stereo correspondence algorithms.
*/
class StereoMatcher : public Algorithm
{
public:
enum { DISP_SHIFT = 4,
DISP_SCALE = (1 << DISP_SHIFT)
};
/** @brief Computes disparity map for the specified stereo pair
@param left Left 8-bit single-channel image.
@param right Right image of the same size and the same type as the left one.
@param disparity Output disparity map. It has the same size as the input images. Some algorithms,
like StereoBM or StereoSGBM compute 16-bit fixed-point disparity map (where each disparity value
has 4 fractional bits), whereas other algorithms output 32-bit floating-point disparity map.
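For example, a true disparity of 12.25 pixels is stored in the 16-bit fixed-point map as
12.25 * DISP_SCALE = 12.25 * 16 = 196.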
*/
virtual void compute( InputArray left, InputArray right,
OutputArray disparity ) = 0;
virtual int getMinDisparity() const = 0;
virtual void setMinDisparity(int minDisparity) = 0;
virtual int getNumDisparities() const = 0;
virtual void setNumDisparities(int numDisparities) = 0;
virtual int getBlockSize() const = 0;
virtual void setBlockSize(int blockSize) = 0;
virtual int getSpeckleWindowSize() const = 0;
virtual void setSpeckleWindowSize(int speckleWindowSize) = 0;
virtual int getSpeckleRange() const = 0;
virtual void setSpeckleRange(int speckleRange) = 0;
virtual int getDisp12MaxDiff() const = 0;
virtual void setDisp12MaxDiff(int disp12MaxDiff) = 0;
};
/** @brief Class for computing stereo correspondence using the block matching algorithm, introduced and
contributed to OpenCV by K. Konolige.
*/
class StereoBinaryBM : public StereoMatcher
{
public:
enum { PREFILTER_NORMALIZED_RESPONSE = 0,
PREFILTER_XSOBEL = 1
};
virtual int getPreFilterType() const = 0;
virtual void setPreFilterType(int preFilterType) = 0;
virtual int getPreFilterSize() const = 0;
virtual void setPreFilterSize(int preFilterSize) = 0;
virtual int getPreFilterCap() const = 0;
virtual void setPreFilterCap(int preFilterCap) = 0;
virtual int getTextureThreshold() const = 0;
virtual void setTextureThreshold(int textureThreshold) = 0;
virtual int getUniquenessRatio() const = 0;
virtual void setUniquenessRatio(int uniquenessRatio) = 0;
virtual int getSmallerBlockSize() const = 0;
virtual void setSmallerBlockSize(int blockSize) = 0;
virtual Rect getROI1() const = 0;
virtual void setROI1(Rect roi1) = 0;
virtual Rect getROI2() const = 0;
virtual void setROI2(Rect roi2) = 0;
/** @brief Creates a StereoBinaryBM object
@param numDisparities the disparity search range. For each pixel algorithm will find the best
disparity from 0 (default minimum disparity) to numDisparities. The search range can then be
shifted by changing the minimum disparity.
@param blockSize the linear size of the blocks compared by the algorithm. The size should be odd
(as the block is centered at the current pixel). Larger block size implies smoother, though less
accurate disparity map. Smaller block size gives more detailed disparity map, but there is higher
chance for algorithm to find a wrong correspondence.
The function creates a StereoBinaryBM object. You can then call StereoBinaryBM::compute() to
compute the disparity for a specific stereo pair.
*/
CV_EXPORTS static Ptr< cv::stereo::StereoBinaryBM > create(int numDisparities = 0, int blockSize = 21);
};
/** @brief The class implements the modified H. Hirschmuller algorithm @cite HH08 that differs from the original
one as follows:
- By default, the algorithm is single-pass, which means that you consider only 5 directions
instead of 8. Set mode=StereoSGBM::MODE_HH in createStereoSGBM to run the full variant of the
algorithm but beware that it may consume a lot of memory.
- The algorithm matches blocks, not individual pixels. Though, setting blockSize=1 reduces the
blocks to single pixels.
- Mutual information cost function is not implemented. Instead, a simpler Birchfield-Tomasi
sub-pixel metric from @cite BT98 is used. Though, the color images are supported as well.
- Some pre- and post- processing steps from K. Konolige algorithm StereoBM are included, for
example: pre-filtering (StereoBM::PREFILTER_XSOBEL type) and post-filtering (uniqueness
check, quadratic interpolation and speckle filtering).
@note
- (Python) An example illustrating the use of the StereoSGBM matching algorithm can be found
at opencv_source_code/samples/python2/stereo_match.py
*/
class StereoBinarySGBM : public StereoMatcher
{
public:
enum
{
MODE_SGBM = 0,
MODE_HH = 1
};
virtual int getPreFilterCap() const = 0;
virtual void setPreFilterCap(int preFilterCap) = 0;
virtual int getUniquenessRatio() const = 0;
virtual void setUniquenessRatio(int uniquenessRatio) = 0;
virtual int getP1() const = 0;
virtual void setP1(int P1) = 0;
virtual int getP2() const = 0;
virtual void setP2(int P2) = 0;
virtual int getMode() const = 0;
virtual void setMode(int mode) = 0;
/** @brief Creates a StereoBinarySGBM object
@param minDisparity Minimum possible disparity value. Normally, it is zero but sometimes
rectification algorithms can shift images, so this parameter needs to be adjusted accordingly.
@param numDisparities Maximum disparity minus minimum disparity. The value is always greater than
zero. In the current implementation, this parameter must be divisible by 16.
@param blockSize Matched block size. It must be an odd number \>=1 . Normally, it should be
somewhere in the 3..11 range.
@param P1 The first parameter controlling the disparity smoothness. See below.
@param P2 The second parameter controlling the disparity smoothness. The larger the values are,
the smoother the disparity is. P1 is the penalty on the disparity change by plus or minus 1
between neighbor pixels. P2 is the penalty on the disparity change by more than 1 between neighbor
pixels. The algorithm requires P2 \> P1 . See stereo_match.cpp sample where some reasonably good
P1 and P2 values are shown (like 8\*number_of_image_channels\*SADWindowSize\*SADWindowSize and
32\*number_of_image_channels\*SADWindowSize\*SADWindowSize , respectively).
@param disp12MaxDiff Maximum allowed difference (in integer pixel units) in the left-right
disparity check. Set it to a non-positive value to disable the check.
@param preFilterCap Truncation value for the prefiltered image pixels. The algorithm first
computes x-derivative at each pixel and clips its value by [-preFilterCap, preFilterCap] interval.
The result values are passed to the Birchfield-Tomasi pixel cost function.
@param uniquenessRatio Margin in percentage by which the best (minimum) computed cost function
value should "win" the second best value to consider the found match correct. Normally, a value
within the 5-15 range is good enough.
@param speckleWindowSize Maximum size of smooth disparity regions to consider their noise speckles
and invalidate. Set it to 0 to disable speckle filtering. Otherwise, set it somewhere in the
50-200 range.
@param speckleRange Maximum disparity variation within each connected component. If you do speckle
filtering, set the parameter to a positive value, it will be implicitly multiplied by 16.
Normally, 1 or 2 is good enough.
@param mode Set it to StereoBinarySGBM::MODE_HH to run the full-scale two-pass dynamic programming
algorithm. It will consume O(W\*H\*numDisparities) bytes, which is large for 640x480 stereo and
huge for HD-size pictures. By default, it is set to MODE_SGBM.
Calling create() with only the first three arguments initializes StereoBinarySGBM with default
values for all the remaining parameters, so at minimum you only have to choose numDisparities;
each of the other parameters can be given a custom value.
*/
CV_EXPORTS static Ptr<cv::stereo::StereoBinarySGBM> create(int minDisparity, int numDisparities, int blockSize,
int P1 = 0, int P2 = 0, int disp12MaxDiff = 0,
int preFilterCap = 0, int uniquenessRatio = 0,
int speckleWindowSize = 0, int speckleRange = 0,
int mode = StereoBinarySGBM::MODE_SGBM);
};
//! @}
}//stereo
} // cv
#ifndef DISABLE_OPENCV_24_COMPATIBILITY
#include "opencv2/stereo/stereo_c.h"
#endif
#endif
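
For reference, a minimal usage sketch of the API declared above (illustrative only, not part of
this commit): it assumes opencv_imgcodecs is available for loading a rectified 8-bit grayscale
pair, the file names left.png/right.png are placeholders, and the P1/P2 values follow the
heuristic quoted in the create() documentation.

#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp" // cv::imread
#include "opencv2/stereo.hpp"

int main()
{
    // Rectified 8-bit grayscale pair; the file names are placeholders.
    cv::Mat left = cv::imread("left.png", cv::IMREAD_GRAYSCALE);
    cv::Mat right = cv::imread("right.png", cv::IMREAD_GRAYSCALE);
    if (left.empty() || right.empty())
        return -1;

    // Block matching: 64 disparities, 9x9 window, with speckle filtering.
    cv::Ptr<cv::stereo::StereoBinaryBM> bm = cv::stereo::StereoBinaryBM::create(64, 9);
    bm->setSpeckleWindowSize(100);
    bm->setSpeckleRange(2); // implicitly multiplied by 16, per the docs above
    cv::Mat dispBM;
    bm->compute(left, right, dispBM); // CV_16S, 4 fractional bits

    // Semi-global matching with the documented P1/P2 heuristic
    // (8 and 32 times number_of_image_channels * blockSize * blockSize).
    const int blockSize = 9, cn = 1;
    cv::Ptr<cv::stereo::StereoBinarySGBM> sgbm = cv::stereo::StereoBinarySGBM::create(
        0, 64, blockSize, 8 * cn * blockSize * blockSize, 32 * cn * blockSize * blockSize);
    cv::Mat dispSGBM;
    sgbm->compute(left, right, dispSGBM);

    // Recover floating-point pixel disparities by undoing the fixed-point scale.
    cv::Mat dispF;
    dispBM.convertTo(dispF, CV_32F, 1.0 / cv::stereo::StereoMatcher::DISP_SCALE);
    return 0;
}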

modules/stereo/include/opencv2/stereo/stereo.hpp
@@ -0,0 +1,49 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifdef __OPENCV_BUILD
#error this is a compatibility header which should not be used inside the OpenCV library
#endif
#include "opencv2/stereo.hpp"

modules/stereo/src/precomp.hpp
@@ -0,0 +1,59 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_STEREO_PRECOMP_H__
#define __OPENCV_STEREO_PRECOMP_H__
#include "opencv2/stereo.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/features2d.hpp"
#include "opencv2/core.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/private.hpp"
#include "opencv2/core/cvdef.h"
#include "opencv2/highgui.hpp"
#include "opencv2/calib3d.hpp"
#include <algorithm>
#include <cmath>
#endif

modules/stereo/src/stereo_binary_bm.cpp
@@ -0,0 +1,738 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/****************************************************************************************\
* Very fast SAD-based (Sum-of-Absolute-Differences) stereo correspondence algorithm. *
* Contributed by Kurt Konolige *
\****************************************************************************************/
#include "precomp.hpp"
#include <stdio.h>
#include <limits>
namespace cv
{
namespace stereo
{
struct StereoBinaryBMParams
{
StereoBinaryBMParams(int _numDisparities = 64, int _SADWindowSize = 9)
{
preFilterType = StereoBinaryBM::PREFILTER_XSOBEL;
preFilterSize = 9;
preFilterCap = 31;
SADWindowSize = _SADWindowSize;
minDisparity = 0;
numDisparities = _numDisparities > 0 ? _numDisparities : 64;
textureThreshold = 10;
uniquenessRatio = 15;
speckleRange = speckleWindowSize = 0;
roi1 = roi2 = Rect(0, 0, 0, 0);
disp12MaxDiff = -1;
dispType = CV_16S;
}
int preFilterType;
int preFilterSize;
int preFilterCap;
int SADWindowSize;
int minDisparity;
int numDisparities;
int textureThreshold;
int uniquenessRatio;
int speckleRange;
int speckleWindowSize;
Rect roi1, roi2;
int disp12MaxDiff;
int dispType;
};
static void prefilterNorm(const Mat& src, Mat& dst, int winsize, int ftzero, uchar* buf)
{
int x, y, wsz2 = winsize / 2;
int* vsum = (int*)alignPtr(buf + (wsz2 + 1)*sizeof(vsum[0]), 32);
int scale_g = winsize*winsize / 8, scale_s = (1024 + scale_g) / (scale_g * 2);
const int OFS = 256 * 5, TABSZ = OFS * 2 + 256;
uchar tab[TABSZ];
const uchar* sptr = src.ptr();
int srcstep = (int)src.step;
Size size = src.size();
scale_g *= scale_s;
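// Build the clipping LUT: an input value v, stored at index v + OFS, maps to
// clamp(v, -ftzero, ftzero) + ftzero, i.e. into the range [0, 2*ftzero].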
for (x = 0; x < TABSZ; x++)
tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero * 2 : x - OFS + ftzero);
for (x = 0; x < size.width; x++)
vsum[x] = (ushort)(sptr[x] * (wsz2 + 2));
for (y = 1; y < wsz2; y++)
{
for (x = 0; x < size.width; x++)
vsum[x] = (ushort)(vsum[x] + sptr[srcstep*y + x]);
}
for (y = 0; y < size.height; y++)
{
const uchar* top = sptr + srcstep*MAX(y - wsz2 - 1, 0);
const uchar* bottom = sptr + srcstep*MIN(y + wsz2, size.height - 1);
const uchar* prev = sptr + srcstep*MAX(y - 1, 0);
const uchar* curr = sptr + srcstep*y;
const uchar* next = sptr + srcstep*MIN(y + 1, size.height - 1);
uchar* dptr = dst.ptr<uchar>(y);
for (x = 0; x < size.width; x++)
vsum[x] = (ushort)(vsum[x] + bottom[x] - top[x]);
for (x = 0; x <= wsz2; x++)
{
vsum[-x - 1] = vsum[0];
vsum[size.width + x] = vsum[size.width - 1];
}
int sum = vsum[0] * (wsz2 + 1);
for (x = 1; x <= wsz2; x++)
sum += vsum[x];
int val = ((curr[0] * 5 + curr[1] + prev[0] + next[0])*scale_g - sum*scale_s) >> 10;
dptr[0] = tab[val + OFS];
for (x = 1; x < size.width - 1; x++)
{
sum += vsum[x + wsz2] - vsum[x - wsz2 - 1];
val = ((curr[x] * 4 + curr[x - 1] + curr[x + 1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
dptr[x] = tab[val + OFS];
}
sum += vsum[x + wsz2] - vsum[x - wsz2 - 1];
val = ((curr[x] * 5 + curr[x - 1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
dptr[x] = tab[val + OFS];
}
}
static void
prefilterXSobel(const Mat& src, Mat& dst, int ftzero)
{
int x, y;
const int OFS = 256 * 4, TABSZ = OFS * 2 + 256;
uchar tab[TABSZ];
Size size = src.size();
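// Same response-clipping LUT as in prefilterNorm above (with a smaller OFS).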
for (x = 0; x < TABSZ; x++)
tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero * 2 : x - OFS + ftzero);
uchar val0 = tab[0 + OFS];
#if CV_SSE2
volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
for (y = 0; y < size.height - 1; y += 2)
{
const uchar* srow1 = src.ptr<uchar>(y);
const uchar* srow0 = y > 0 ? srow1 - src.step : size.height > 1 ? srow1 + src.step : srow1;
const uchar* srow2 = y < size.height - 1 ? srow1 + src.step : size.height > 1 ? srow1 - src.step : srow1;
const uchar* srow3 = y < size.height - 2 ? srow1 + src.step * 2 : srow1;
uchar* dptr0 = dst.ptr<uchar>(y);
uchar* dptr1 = dptr0 + dst.step;
dptr0[0] = dptr0[size.width - 1] = dptr1[0] = dptr1[size.width - 1] = val0;
x = 1;
#if CV_SSE2
if (useSIMD)
{
__m128i z = _mm_setzero_si128(), ftz = _mm_set1_epi16((short)ftzero),
ftz2 = _mm_set1_epi8(cv::saturate_cast<uchar>(ftzero * 2));
for (; x <= size.width - 9; x += 8)
{
__m128i c0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x - 1)), z);
__m128i c1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x - 1)), z);
__m128i d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x + 1)), z);
__m128i d1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x + 1)), z);
d0 = _mm_sub_epi16(d0, c0);
d1 = _mm_sub_epi16(d1, c1);
__m128i c2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z);
__m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x - 1)), z);
__m128i d2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z);
__m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x + 1)), z);
d2 = _mm_sub_epi16(d2, c2);
d3 = _mm_sub_epi16(d3, c3);
__m128i v0 = _mm_add_epi16(d0, _mm_add_epi16(d2, _mm_add_epi16(d1, d1)));
__m128i v1 = _mm_add_epi16(d1, _mm_add_epi16(d3, _mm_add_epi16(d2, d2)));
v0 = _mm_packus_epi16(_mm_add_epi16(v0, ftz), _mm_add_epi16(v1, ftz));
v0 = _mm_min_epu8(v0, ftz2);
_mm_storel_epi64((__m128i*)(dptr0 + x), v0);
_mm_storel_epi64((__m128i*)(dptr1 + x), _mm_unpackhi_epi64(v0, v0));
}
}
#endif
for (; x < size.width - 1; x++)
{
int d0 = srow0[x + 1] - srow0[x - 1], d1 = srow1[x + 1] - srow1[x - 1],
d2 = srow2[x + 1] - srow2[x - 1], d3 = srow3[x + 1] - srow3[x - 1];
int v0 = tab[d0 + d1 * 2 + d2 + OFS];
int v1 = tab[d1 + d2 * 2 + d3 + OFS];
dptr0[x] = (uchar)v0;
dptr1[x] = (uchar)v1;
}
}
for (; y < size.height; y++)
{
uchar* dptr = dst.ptr<uchar>(y);
for (x = 0; x < size.width; x++)
dptr[x] = val0;
}
}
static const int DISPARITY_SHIFT = 4;
static void
findStereoCorrespondenceBM(const Mat& left, const Mat& right,
Mat& disp, Mat& cost, const StereoBinaryBMParams& state,
uchar* buf, int _dy0, int _dy1)
{
const int ALIGN = 16;
int x, y, d;
int wsz = state.SADWindowSize, wsz2 = wsz / 2;
int dy0 = MIN(_dy0, wsz2 + 1), dy1 = MIN(_dy1, wsz2 + 1);
int ndisp = state.numDisparities;
int mindisp = state.minDisparity;
int lofs = MAX(ndisp - 1 + mindisp, 0);
int rofs = -MIN(ndisp - 1 + mindisp, 0);
int width = left.cols, height = left.rows;
int width1 = width - rofs - ndisp + 1;
int ftzero = state.preFilterCap;
int textureThreshold = state.textureThreshold;
int uniquenessRatio = state.uniquenessRatio;
short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);
int *sad, *hsad0, *hsad, *hsad_sub, *htext;
uchar *cbuf0, *cbuf;
const uchar* lptr0 = left.ptr() + lofs;
const uchar* rptr0 = right.ptr() + rofs;
const uchar *lptr, *lptr_sub, *rptr;
short* dptr = disp.ptr<short>();
int sstep = (int)left.step;
int dstep = (int)(disp.step / sizeof(dptr[0]));
int cstep = (height + dy0 + dy1)*ndisp;
int costbuf = 0;
int coststep = cost.data ? (int)(cost.step / sizeof(costbuf)) : 0;
const int TABSZ = 256;
uchar tab[TABSZ];
sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
htext = (int*)alignPtr((int*)(hsad0 + (height + dy1)*ndisp) + wsz2 + 2, ALIGN);
cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);
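// The scratch buffer is carved up into: sad (per-disparity SAD totals for the
// current window), hsad0 (per-row horizontal SAD sums), htext (per-row texture
// sums) and cbuf0 (a ring buffer of per-column |left - right| slices, used to
// subtract the column that slides out of the window).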
for (x = 0; x < TABSZ; x++)
tab[x] = (uchar)std::abs(x - ftzero);
// initialize buffers
memset(hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]));
memset(htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]));
for (x = -wsz2 - 1; x < wsz2; x++)
{
hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
lptr = lptr0 + std::min(std::max(x, -lofs), width - lofs - 1) - dy0*sstep;
rptr = rptr0 + std::min(std::max(x, -rofs), width - rofs - 1) - dy0*sstep;
for (y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep)
{
int lval = lptr[0];
for (d = 0; d < ndisp; d++)
{
int diff = std::abs(lval - rptr[d]);
cbuf[d] = (uchar)diff;
hsad[d] = (int)(hsad[d] + diff);
}
htext[y] += tab[lval];
}
}
// initialize the left and right borders of the disparity map
for (y = 0; y < height; y++)
{
for (x = 0; x < lofs; x++)
dptr[y*dstep + x] = FILTERED;
for (x = lofs + width1; x < width; x++)
dptr[y*dstep + x] = FILTERED;
}
dptr += lofs;
for (x = 0; x < width1; x++, dptr++)
{
int* costptr = cost.data ? cost.ptr<int>() + lofs + x : &costbuf;
int x0 = x - wsz2 - 1, x1 = x + wsz2;
const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
hsad = hsad0 - dy0*ndisp;
lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width - 1 - lofs) - dy0*sstep;
lptr = lptr0 + MIN(MAX(x1, -lofs), width - 1 - lofs) - dy0*sstep;
rptr = rptr0 + MIN(MAX(x1, -rofs), width - 1 - rofs) - dy0*sstep;
for (y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep)
{
int lval = lptr[0];
for (d = 0; d < ndisp; d++)
{
int diff = std::abs(lval - rptr[d]);
cbuf[d] = (uchar)diff;
hsad[d] = hsad[d] + diff - cbuf_sub[d];
}
htext[y] += tab[lval] - tab[lptr_sub[0]];
}
// fill borders
for (y = dy1; y <= wsz2; y++)
htext[height + y] = htext[height + dy1 - 1];
for (y = -wsz2 - 1; y < -dy0; y++)
htext[y] = htext[-dy0];
// initialize sums
int tsum = 0;
{
for (d = 0; d < ndisp; d++)
sad[d] = (int)(hsad0[d - ndisp*dy0] * (wsz2 + 2 - dy0));
hsad = hsad0 + (1 - dy0)*ndisp;
for (y = 1 - dy0; y < wsz2; y++, hsad += ndisp)
for (d = 0; d < ndisp; d++)
sad[d] = (int)(sad[d] + hsad[d]);
for (y = -wsz2 - 1; y < wsz2; y++)
tsum += htext[y];
}
// finally, start the real processing
{
for (y = 0; y < height; y++)
{
int minsad = INT_MAX, mind = -1;
hsad = hsad0 + MIN(y + wsz2, height + dy1 - 1)*ndisp;
hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
for (d = 0; d < ndisp; d++)
{
int currsad = sad[d] + hsad[d] - hsad_sub[d];
sad[d] = currsad;
if (currsad < minsad)
{
minsad = currsad;
mind = d;
}
}
tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
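// texture check: tsum is the window sum of prefiltered gradient magnitudes;
// low-texture regions cannot be matched reliably and are marked as filtered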
if (tsum < textureThreshold)
{
dptr[y*dstep] = FILTERED;
continue;
}
if (uniquenessRatio > 0)
{
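// uniqueness check: reject the pixel if any disparity outside the winner's
// immediate neighborhood comes within uniquenessRatio percent of the best cost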
int thresh = minsad + (minsad * uniquenessRatio / 100);
for (d = 0; d < ndisp; d++)
{
if ((d < mind - 1 || d > mind + 1) && sad[d] <= thresh)
break;
}
if (d < ndisp)
{
dptr[y*dstep] = FILTERED;
continue;
}
}
{
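// subpixel refinement: replicate the neighbors at d = -1 and d = ndisp so that
// mind +/- 1 is always valid, then fit a three-point parabola around the winner;
// the result is stored in 1/16-pixel units (DISPARITY_SHIFT == 4)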
sad[-1] = sad[1];
sad[ndisp] = sad[ndisp - 2];
int p = sad[mind + 1], n = sad[mind - 1];
d = p + n - 2 * sad[mind] + std::abs(p - n);
dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp) * 256 + (d != 0 ? (p - n) * 256 / d : 0) + 15) >> 4);
costptr[y*coststep] = sad[mind];
}
}
}
}
}
struct PrefilterInvoker : public ParallelLoopBody
{
PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right,
uchar* buf0, uchar* buf1, StereoBinaryBMParams* _state)
{
imgs0[0] = &left0; imgs0[1] = &right0;
imgs[0] = &left; imgs[1] = &right;
buf[0] = buf0; buf[1] = buf1;
state = _state;
}
void operator()(const Range& range) const
{
for (int i = range.start; i < range.end; i++)
{
if (state->preFilterType == StereoBinaryBM::PREFILTER_NORMALIZED_RESPONSE)
prefilterNorm(*imgs0[i], *imgs[i], state->preFilterSize, state->preFilterCap, buf[i]);
else
prefilterXSobel(*imgs0[i], *imgs[i], state->preFilterCap);
}
}
const Mat* imgs0[2];
Mat* imgs[2];
uchar* buf[2];
StereoBinaryBMParams* state;
};
struct FindStereoCorrespInvoker : public ParallelLoopBody
{
FindStereoCorrespInvoker(const Mat& _left, const Mat& _right,
Mat& _disp, StereoBinaryBMParams* _state,
int _nstripes, size_t _stripeBufSize,
bool _useShorts, Rect _validDisparityRect,
Mat& _slidingSumBuf, Mat& _cost)
{
left = &_left; right = &_right;
disp = &_disp; state = _state;
nstripes = _nstripes; stripeBufSize = _stripeBufSize;
useShorts = _useShorts;
validDisparityRect = _validDisparityRect;
slidingSumBuf = &_slidingSumBuf;
cost = &_cost;
}
void operator()(const Range& range) const
{
int cols = left->cols, rows = left->rows;
int _row0 = std::min(cvRound(range.start * rows / nstripes), rows);
int _row1 = std::min(cvRound(range.end * rows / nstripes), rows);
uchar *ptr = slidingSumBuf->ptr() + range.start * stripeBufSize;
int FILTERED = (state->minDisparity - 1) * 16;
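// note: 16 == (1 << DISPARITY_SHIFT), the fixed-point scale of the disparity map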
Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0);
if (roi.height == 0)
return;
int row0 = roi.y;
int row1 = roi.y + roi.height;
Mat part;
if (row0 > _row0)
{
part = disp->rowRange(_row0, row0);
part = Scalar::all(FILTERED);
}
if (_row1 > row1)
{
part = disp->rowRange(row1, _row1);
part = Scalar::all(FILTERED);
}
Mat left_i = left->rowRange(row0, row1);
Mat right_i = right->rowRange(row0, row1);
Mat disp_i = disp->rowRange(row0, row1);
Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
findStereoCorrespondenceBM(left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1);
if (state->disp12MaxDiff >= 0)
validateDisparity(disp_i, cost_i, state->minDisparity, state->numDisparities, state->disp12MaxDiff);
if (roi.x > 0)
{
part = disp_i.colRange(0, roi.x);
part = Scalar::all(FILTERED);
}
if (roi.x + roi.width < cols)
{
part = disp_i.colRange(roi.x + roi.width, cols);
part = Scalar::all(FILTERED);
}
}
protected:
const Mat *left, *right;
Mat* disp, *slidingSumBuf, *cost;
StereoBinaryBMParams *state;
int nstripes;
size_t stripeBufSize;
bool useShorts;
Rect validDisparityRect;
};
class StereoBinaryBMImpl : public StereoBinaryBM
{
public:
StereoBinaryBMImpl()
{
params = StereoBinaryBMParams();
}
StereoBinaryBMImpl(int _numDisparities, int _SADWindowSize)
{
params = StereoBinaryBMParams(_numDisparities, _SADWindowSize);
}
void compute(InputArray leftarr, InputArray rightarr, OutputArray disparr)
{
int dtype = disparr.fixedType() ? disparr.type() : params.dispType;
Size leftsize = leftarr.size();
if (leftarr.size() != rightarr.size())
CV_Error(Error::StsUnmatchedSizes, "All the images must have the same size");
if (leftarr.type() != CV_8UC1 || rightarr.type() != CV_8UC1)
CV_Error(Error::StsUnsupportedFormat, "Both input images must have CV_8UC1");
if (dtype != CV_16SC1 && dtype != CV_32FC1)
CV_Error(Error::StsUnsupportedFormat, "Disparity image must have CV_16SC1 or CV_32FC1 format");
if (params.preFilterType != PREFILTER_NORMALIZED_RESPONSE &&
params.preFilterType != PREFILTER_XSOBEL)
CV_Error(Error::StsOutOfRange, "preFilterType must be = CV_STEREO_BM_NORMALIZED_RESPONSE");
if (params.preFilterSize < 5 || params.preFilterSize > 255 || params.preFilterSize % 2 == 0)
CV_Error(Error::StsOutOfRange, "preFilterSize must be odd and be within 5..255");
if (params.preFilterCap < 1 || params.preFilterCap > 63)
CV_Error(Error::StsOutOfRange, "preFilterCap must be within 1..63");
if (params.SADWindowSize < 5 || params.SADWindowSize > 255 || params.SADWindowSize % 2 == 0 ||
params.SADWindowSize >= std::min(leftsize.width, leftsize.height))
CV_Error(Error::StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height");
if (params.numDisparities <= 0 || params.numDisparities % 16 != 0)
CV_Error(Error::StsOutOfRange, "numDisparities must be positive and divisble by 16");
if (params.textureThreshold < 0)
CV_Error(Error::StsOutOfRange, "texture threshold must be non-negative");
if (params.uniquenessRatio < 0)
CV_Error(Error::StsOutOfRange, "uniqueness ratio must be non-negative");
int FILTERED = (params.minDisparity - 1) << DISPARITY_SHIFT;
Mat left0 = leftarr.getMat(), right0 = rightarr.getMat();
disparr.create(left0.size(), dtype);
Mat disp0 = disparr.getMat();
preFilteredImg0.create(left0.size(), CV_8U);
preFilteredImg1.create(left0.size(), CV_8U);
cost.create(left0.size(), CV_16S);
Mat left = preFilteredImg0, right = preFilteredImg1;
int mindisp = params.minDisparity;
int ndisp = params.numDisparities;
int width = left0.cols;
int height = left0.rows;
int lofs = std::max(ndisp - 1 + mindisp, 0);
int rofs = -std::min(ndisp - 1 + mindisp, 0);
int width1 = width - rofs - ndisp + 1;
if (lofs >= width || rofs >= width || width1 < 1)
{
disp0 = Scalar::all(FILTERED * (disp0.type() < CV_32F ? 1 : 1. / (1 << DISPARITY_SHIFT)));
return;
}
Mat disp = disp0;
if (dtype == CV_32F)
{
dispbuf.create(disp0.size(), CV_16S);
disp = dispbuf;
}
int wsz = params.SADWindowSize;
int bufSize0 = (int)((ndisp + 2)*sizeof(int));
bufSize0 += (int)((height + wsz + 2)*ndisp*sizeof(int));
bufSize0 += (int)((height + wsz + 2)*sizeof(int));
bufSize0 += (int)((height + wsz + 2)*ndisp*(wsz + 2)*sizeof(uchar) + 256);
int bufSize1 = (int)((width + params.preFilterSize + 2) * sizeof(int) + 256);
int bufSize2 = 0;
if (params.speckleRange >= 0 && params.speckleWindowSize > 0)
bufSize2 = width*height*(sizeof(Point_<short>) + sizeof(int) + sizeof(uchar));
#if CV_SSE2
bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2);
#else
const bool useShorts = false;
#endif
const double SAD_overhead_coeff = 10.0;
double N0 = 8000000 / (useShorts ? 1 : 4); // approx. minimum number of instructions per TBB thread that makes threading worthwhile
double maxStripeSize = std::min(std::max(N0 / (width * ndisp), (wsz - 1) * SAD_overhead_coeff), (double)height);
int nstripes = cvCeil(height / maxStripeSize);
int bufSize = std::max(bufSize0 * nstripes, std::max(bufSize1 * 2, bufSize2));
if (slidingSumBuf.cols < bufSize)
slidingSumBuf.create(1, bufSize, CV_8U);
uchar *_buf = slidingSumBuf.ptr();
parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, _buf, _buf + bufSize1, &params), 1);
Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2;
validDisparityRect = getValidDisparityROI(R1.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
R2.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
params.minDisparity, params.numDisparities,
params.SADWindowSize);
parallel_for_(Range(0, nstripes),
FindStereoCorrespInvoker(left, right, disp, &params, nstripes,
bufSize0, useShorts, validDisparityRect,
slidingSumBuf, cost));
if (params.speckleRange >= 0 && params.speckleWindowSize > 0)
filterSpeckles(disp, FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
if (disp0.data != disp.data)
disp.convertTo(disp0, disp0.type(), 1. / (1 << DISPARITY_SHIFT), 0);
}
int getMinDisparity() const { return params.minDisparity; }
void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
int getNumDisparities() const { return params.numDisparities; }
void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
int getBlockSize() const { return params.SADWindowSize; }
void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
int getSpeckleWindowSize() const { return params.speckleWindowSize; }
void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
int getSpeckleRange() const { return params.speckleRange; }
void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
int getPreFilterType() const { return params.preFilterType; }
void setPreFilterType(int preFilterType) { params.preFilterType = preFilterType; }
int getPreFilterSize() const { return params.preFilterSize; }
void setPreFilterSize(int preFilterSize) { params.preFilterSize = preFilterSize; }
int getPreFilterCap() const { return params.preFilterCap; }
void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
int getTextureThreshold() const { return params.textureThreshold; }
void setTextureThreshold(int textureThreshold) { params.textureThreshold = textureThreshold; }
int getUniquenessRatio() const { return params.uniquenessRatio; }
void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
int getSmallerBlockSize() const { return 0; }
void setSmallerBlockSize(int) {}
Rect getROI1() const { return params.roi1; }
void setROI1(Rect roi1) { params.roi1 = roi1; }
Rect getROI2() const { return params.roi2; }
void setROI2(Rect roi2) { params.roi2 = roi2; }
void write(FileStorage& fs) const
{
fs << "name" << name_
<< "minDisparity" << params.minDisparity
<< "numDisparities" << params.numDisparities
<< "blockSize" << params.SADWindowSize
<< "speckleWindowSize" << params.speckleWindowSize
<< "speckleRange" << params.speckleRange
<< "disp12MaxDiff" << params.disp12MaxDiff
<< "preFilterType" << params.preFilterType
<< "preFilterSize" << params.preFilterSize
<< "preFilterCap" << params.preFilterCap
<< "textureThreshold" << params.textureThreshold
<< "uniquenessRatio" << params.uniquenessRatio;
}
void read(const FileNode& fn)
{
FileNode n = fn["name"];
CV_Assert(n.isString() && String(n) == name_);
params.minDisparity = (int)fn["minDisparity"];
params.numDisparities = (int)fn["numDisparities"];
params.SADWindowSize = (int)fn["blockSize"];
params.speckleWindowSize = (int)fn["speckleWindowSize"];
params.speckleRange = (int)fn["speckleRange"];
params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
params.preFilterType = (int)fn["preFilterType"];
params.preFilterSize = (int)fn["preFilterSize"];
params.preFilterCap = (int)fn["preFilterCap"];
params.textureThreshold = (int)fn["textureThreshold"];
params.uniquenessRatio = (int)fn["uniquenessRatio"];
params.roi1 = params.roi2 = Rect();
}
StereoBinaryBMParams params;
Mat preFilteredImg0, preFilteredImg1, cost, dispbuf;
Mat slidingSumBuf;
static const char* name_;
};
const char* StereoBinaryBMImpl::name_ = "StereoMatcher.BM";
Ptr<StereoBinaryBM> StereoBinaryBM::create(int _numDisparities, int _SADWindowSize)
{
return makePtr<StereoBinaryBMImpl>(_numDisparities, _SADWindowSize);
}
}
}
/* End of file. */
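For orientation, here is a minimal usage sketch of the block matcher implemented above. This is an editor's illustration, not part of the patch; the image file names are hypothetical and the output is assumed to be the default CV_16S disparity map.
#include "opencv2/stereo.hpp"
#include "opencv2/imgcodecs.hpp"

int main()
{
    cv::Mat left  = cv::imread("left.png",  cv::IMREAD_GRAYSCALE);   // hypothetical rectified pair
    cv::Mat right = cv::imread("right.png", cv::IMREAD_GRAYSCALE);
    // numDisparities must be a positive multiple of 16; SADWindowSize must be odd, within 5..255
    cv::Ptr<cv::stereo::StereoBinaryBM> bm = cv::stereo::StereoBinaryBM::create(64, 9);
    cv::Mat disp;
    bm->compute(left, right, disp);                    // fixed-point disparities, 4 fractional bits
    cv::Mat disp8;
    disp.convertTo(disp8, CV_8U, 255.0 / (64 * 16));   // rescale for visualization (assumes CV_16S)
    return 0;
}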

@ -0,0 +1,958 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*
This is a variation of
"Stereo Processing by Semiglobal Matching and Mutual Information"
by Heiko Hirschmuller.
We match blocks rather than individual pixels, thus the algorithm is called
SGBM (Semi-global block matching)
*/
#include "precomp.hpp"
#include <limits.h>
namespace cv
{
namespace stereo
{
typedef uchar PixType;
typedef short CostType;
typedef short DispType;
enum { NR = 16, NR2 = NR/2 };
struct StereoBinarySGBMParams
{
StereoBinarySGBMParams()
{
minDisparity = numDisparities = 0;
SADWindowSize = 0;
P1 = P2 = 0;
disp12MaxDiff = 0;
preFilterCap = 0;
uniquenessRatio = 0;
speckleWindowSize = 0;
speckleRange = 0;
mode = StereoBinarySGBM::MODE_SGBM;
}
StereoBinarySGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
int _mode )
{
minDisparity = _minDisparity;
numDisparities = _numDisparities;
SADWindowSize = _SADWindowSize;
P1 = _P1;
P2 = _P2;
disp12MaxDiff = _disp12MaxDiff;
preFilterCap = _preFilterCap;
uniquenessRatio = _uniquenessRatio;
speckleWindowSize = _speckleWindowSize;
speckleRange = _speckleRange;
mode = _mode;
}
int minDisparity;
int numDisparities;
int SADWindowSize;
int preFilterCap;
int uniquenessRatio;
int P1;
int P2;
int speckleWindowSize;
int speckleRange;
int disp12MaxDiff;
int mode;
};
/*
For each pixel row1[x], max(-maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
and for each disparity minD<=d<maxD the function
computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
row1[x] and row2[x-d]. The subpixel algorithm from
"Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
is used, hence the suffix BT.
The temporary buffer should contain width2*2 elements.
*/
static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
int minD, int maxD, CostType* cost,
PixType* buffer, const PixType* tab,
int tabOfs, int )
{
int x, c, width = img1.cols, cn = img1.channels();
int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
tab += tabOfs;
for( c = 0; c < cn*2; c++ )
{
prow1[width*c] = prow1[width*c + width-1] =
prow2[width*c] = prow2[width*c + width-1] = tab[0];
}
int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
if( cn == 1 )
{
for( x = 1; x < width-1; x++ )
{
prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
prow1[x+width] = row1[x];
prow2[width-1-x+width] = row2[x];
}
}
else
{
for( x = 1; x < width-1; x++ )
{
prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
prow1[x+width*3] = row1[x*3];
prow1[x+width*4] = row1[x*3+1];
prow1[x+width*5] = row1[x*3+2];
prow2[width-1-x+width*3] = row2[x*3];
prow2[width-1-x+width*4] = row2[x*3+1];
prow2[width-1-x+width*5] = row2[x*3+2];
}
}
memset( cost, 0, width1*D*sizeof(cost[0]) );
buffer -= minX2;
cost -= minX1*D + minD; // simplify the cost indices inside the loop
#if CV_SSE2
volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
#if 1
for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
{
int diff_scale = c < cn ? 0 : 2;
// precompute
// v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
// v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
for( x = minX2; x < maxX2; x++ )
{
int v = prow2[x];
int vl = x > 0 ? (v + prow2[x-1])/2 : v;
int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
int v0 = std::min(vl, vr); v0 = std::min(v0, v);
int v1 = std::max(vl, vr); v1 = std::max(v1, v);
buffer[x] = (PixType)v0;
buffer[x + width2] = (PixType)v1;
}
for( x = minX1; x < maxX1; x++ )
{
int u = prow1[x];
int ul = x > 0 ? (u + prow1[x-1])/2 : u;
int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
int u0 = std::min(ul, ur); u0 = std::min(u0, u);
int u1 = std::max(ul, ur); u1 = std::max(u1, u);
#if CV_SSE2
if( useSIMD )
{
__m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
__m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
__m128i ds = _mm_cvtsi32_si128(diff_scale);
for( int d = minD; d < maxD; d += 16 )
{
__m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
__m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d));
__m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2));
__m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
__m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
__m128i diff = _mm_min_epu8(c0, c1);
c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
_mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_srl_epi16(_mm_unpacklo_epi8(diff,z), ds)));
_mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_srl_epi16(_mm_unpackhi_epi8(diff,z), ds)));
}
}
else
#endif
{
for( int d = minD; d < maxD; d++ )
{
int v = prow2[width-x-1 + d];
int v0 = buffer[width-x-1 + d];
int v1 = buffer[width-x-1 + d + width2];
int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);
cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
}
}
}
}
#else
for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
{
for( x = minX1; x < maxX1; x++ )
{
int u = prow1[x];
#if CV_SSE2
if( useSIMD )
{
__m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
for( int d = minD; d < maxD; d += 16 )
{
__m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
__m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
__m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
__m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
_mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
_mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
}
}
else
#endif
{
for( int d = minD; d < maxD; d++ )
{
int v = prow2[width-1-x + d];
cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
}
}
}
}
#endif
}
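// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the scalar core of the Birchfield-
// Tomasi dissimilarity evaluated by the SIMD and scalar loops above for one
// pixel pair. u0/u1 (resp. v0/v1) are the min/max over the pixel and its
// half-sample interpolated neighbors, as precomputed into `buffer`; the names
// here are illustrative only.
static inline int birchfieldTomasiCost(int u, int u0, int u1,
                                       int v, int v0, int v1)
{
    int c0 = std::max(0, std::max(u - v1, v0 - u)); // distance of u from [v0, v1]
    int c1 = std::max(0, std::max(v - u1, u0 - v)); // distance of v from [u0, u1]
    return std::min(c0, c1); // zero when the two intensity intervals overlap
}
// ---------------------------------------------------------------------------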
/*
computes disparity for "roi" in img1 w.r.t. img2 and writes it to disp1buf.
that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
minD <= d < maxD.
disp2full is the reverse disparity map, that is:
disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
note that disp1buf will have the same size as the roi and
disp2full will have the same size as img1 (or img2).
On exit disp2buf is not the final disparity, it is an intermediate result that becomes
final after all the tiles are processed.
the disparity in disp1buf is written with sub-pixel accuracy
(4 fractional bits, see StereoSGBM::DISP_SCALE),
using quadratic interpolation, while the disparity in disp2buf
is written as is, without interpolation.
disp2cost also has the same size as img1 (or img2).
It contains the current minimum cost, used to select the best (minimal-cost) disparity.
*/
static void computeDisparityBinarySGBM( const Mat& img1, const Mat& img2,
Mat& disp1, const StereoBinarySGBMParams& params,
Mat& buffer )
{
#if CV_SSE2
static const uchar LSBTab[] =
{
0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
};
volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
const int ALIGN = 16;
const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
const int DISP_SCALE = (1 << DISP_SHIFT);
const CostType MAX_COST = SHRT_MAX;
int minD = params.minDisparity, maxD = minD + params.numDisparities;
Size SADWindowSize;
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
int ftzero = std::max(params.preFilterCap, 15) | 1;
int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
int k, width = disp1.cols, height = disp1.rows;
int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
int D = maxD - minD, width1 = maxX1 - minX1;
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
bool fullDP = params.mode == StereoBinarySGBM::MODE_HH;
int npasses = fullDP ? 2 : 1;
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
PixType clipTab[TAB_SIZE];
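// clipTab maps a TAB_OFS-offset derivative value to the range [0, 2*ftzero]:
// values are clamped to [-ftzero, ftzero] and shifted by +ftzero, the SGBM
// analogue of the x-Sobel prefilter cap used by the block-matching variant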
for( k = 0; k < TAB_SIZE; k++ )
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
if( minX1 >= maxX1 )
{
disp1 = Scalar::all(INVALID_DISP_SCALED);
return;
}
CV_Assert( D % 16 == 0 );
// NR - the number of directions. The loop on x below that computes Lr assumes that NR == 8.
// If you change NR, please modify the loop as well.
int D2 = D+16, NRD2 = NR2*D2;
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
// for 8-way dynamic programming we need the current row and
// the previous row, i.e. 2 rows in total
const int NLR = 2;
const int LrBorder = NLR - 1;
// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
// we keep pixel difference cost (C) and the summary cost over NR directions (S).
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
size_t costBufSize = width1*D;
size_t CSBufSize = costBufSize*(fullDP ? height : 1);
size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
int hsumBufNRows = SH2*2 + 2;
size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
CSBufSize*2*sizeof(CostType) + // C, S
width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
if( buffer.empty() || !buffer.isContinuous() ||
buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
buffer.create(1, (int)totalBufSize, CV_8U);
// summary cost over different (nDirs) directions
CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
CostType* Sbuf = Cbuf + CSBufSize;
CostType* hsumBuf = Sbuf + CSBufSize;
CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
DispType* disp2ptr = (DispType*)(disp2cost + width);
PixType* tempBuf = (PixType*)(disp2ptr + width);
// add P2 to every C(x,y). it saves a few operations in the inner loops
for( k = 0; k < width1*D; k++ )
Cbuf[k] = (CostType)P2;
for( int pass = 1; pass <= npasses; pass++ )
{
int x1, y1, x2, y2, dx, dy;
if( pass == 1 )
{
y1 = 0; y2 = height; dy = 1;
x1 = 0; x2 = width1; dx = 1;
}
else
{
y1 = height-1; y2 = -1; dy = -1;
x1 = width1-1; x2 = -1; dx = -1;
}
CostType *Lr[NLR]={0}, *minLr[NLR]={0};
for( k = 0; k < NLR; k++ )
{
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
// and will occasionally use negative indices with the arrays
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
// however, then the alignment will be imperfect, i.e. bad for SSE,
// thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
}
for( int y = y1; y != y2; y += dy )
{
int x, d;
DispType* disp1ptr = disp1.ptr<DispType>(y);
CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
{
int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
for( k = dy1; k <= dy2; k++ )
{
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
if( k < height )
{
calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
memset(hsumAdd, 0, D*sizeof(CostType));
for( x = 0; x <= SW2*D; x += D )
{
int scale = x == 0 ? SW2 + 1 : 1;
for( d = 0; d < D; d++ )
hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
}
if( y > 0 )
{
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
for( x = D; x < width1*D; x += D )
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
#if CV_SSE2
if( useSIMD )
{
for( d = 0; d < D; d += 8 )
{
__m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
__m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
hv = _mm_adds_epi16(_mm_subs_epi16(hv,
_mm_load_si128((const __m128i*)(pixSub + d))),
_mm_load_si128((const __m128i*)(pixAdd + d)));
Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
_mm_load_si128((const __m128i*)(hsumSub + x + d))),
hv);
_mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
_mm_store_si128((__m128i*)(C + x + d), Cx);
}
}
else
#endif
{
for( d = 0; d < D; d++ )
{
int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
}
}
}
}
else
{
for( x = D; x < width1*D; x += D )
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
for( d = 0; d < D; d++ )
hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
}
}
}
if( y == 0 )
{
int scale = k == 0 ? SH2 + 1 : 1;
for( x = 0; x < width1*D; x++ )
C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
}
}
// also, clear the S buffer
for( k = 0; k < width1*D; k++ )
S[k] = 0;
}
// clear the left and the right borders
memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
/*
[formula 13 in the paper]
compute L_r(p, d) = C(p, d) +
min(L_r(p-r, d),
L_r(p-r, d-1) + P1,
L_r(p-r, d+1) + P1,
min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
where p = (x,y), r is one of the directions.
we process all the directions at once:
0: r=(-dx, 0)
1: r=(-1, -dy)
2: r=(0, -dy)
3: r=(1, -dy)
4: r=(-2, -dy)
5: r=(-1, -dy*2)
6: r=(1, -dy*2)
7: r=(2, -dy)
*/
for( x = x1; x != x2; x += dx )
{
int xm = x*NR2, xd = xm*D2;
int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
CostType* Lr_p2 = Lr[1] + xd + D2*2;
CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
CostType* Lr_p = Lr[0] + xd;
const CostType* Cp = C + x*D;
CostType* Sp = S + x*D;
#if CV_SSE2
if( useSIMD )
{
__m128i _P1 = _mm_set1_epi16((short)P1);
__m128i _delta0 = _mm_set1_epi16((short)delta0);
__m128i _delta1 = _mm_set1_epi16((short)delta1);
__m128i _delta2 = _mm_set1_epi16((short)delta2);
__m128i _delta3 = _mm_set1_epi16((short)delta3);
__m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
for( d = 0; d < D; d += 8 )
{
__m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
__m128i L0, L1, L2, L3;
L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
L0 = _mm_min_epi16(L0, _delta0);
L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
L1 = _mm_min_epi16(L1, _delta1);
L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
L2 = _mm_min_epi16(L2, _delta2);
L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
L3 = _mm_min_epi16(L3, _delta3);
L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
_mm_store_si128( (__m128i*)(Lr_p + d), L0);
_mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
_mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
_mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
__m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
__m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
_minL0 = _mm_min_epi16(_minL0, t0);
__m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
L0 = _mm_adds_epi16(L0, L1);
L2 = _mm_adds_epi16(L2, L3);
Sval = _mm_adds_epi16(Sval, L0);
Sval = _mm_adds_epi16(Sval, L2);
_mm_store_si128((__m128i*)(Sp + d), Sval);
}
_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
_mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
}
else
#endif
{
int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
for( d = 0; d < D; d++ )
{
int Cpd = Cp[d], L0, L1, L2, L3;
L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;
L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;
Lr_p[d] = (CostType)L0;
minL0 = std::min(minL0, L0);
Lr_p[d + D2] = (CostType)L1;
minL1 = std::min(minL1, L1);
Lr_p[d + D2*2] = (CostType)L2;
minL2 = std::min(minL2, L2);
Lr_p[d + D2*3] = (CostType)L3;
minL3 = std::min(minL3, L3);
Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
}
minLr[0][xm] = (CostType)minL0;
minLr[0][xm+1] = (CostType)minL1;
minLr[0][xm+2] = (CostType)minL2;
minLr[0][xm+3] = (CostType)minL3;
}
}
if( pass == npasses )
{
for( x = 0; x < width; x++ )
{
disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
disp2cost[x] = MAX_COST;
}
for( x = width1 - 1; x >= 0; x-- )
{
CostType* Sp = S + x*D;
int minS = MAX_COST, bestDisp = -1;
if( npasses == 1 )
{
int xm = x*NR2, xd = xm*D2;
int minL0 = MAX_COST;
int delta0 = minLr[0][xm + NR2] + P2;
CostType* Lr_p0 = Lr[0] + xd + NRD2;
Lr_p0[-1] = Lr_p0[D] = MAX_COST;
CostType* Lr_p = Lr[0] + xd;
const CostType* Cp = C + x*D;
#if CV_SSE2
if( useSIMD )
{
__m128i _P1 = _mm_set1_epi16((short)P1);
__m128i _delta0 = _mm_set1_epi16((short)delta0);
__m128i _minL0 = _mm_set1_epi16((short)minL0);
__m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
__m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
for( d = 0; d < D; d += 8 )
{
__m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
L0 = _mm_min_epi16(L0, _delta0);
L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
_mm_store_si128((__m128i*)(Lr_p + d), L0);
_minL0 = _mm_min_epi16(_minL0, L0);
L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
_mm_store_si128((__m128i*)(Sp + d), L0);
__m128i mask = _mm_cmpgt_epi16(_minS, L0);
_minS = _mm_min_epi16(_minS, L0);
_bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
_d8 = _mm_adds_epi16(_d8, _8);
}
short CV_DECL_ALIGNED(16) bestDispBuf[8];
_mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
__m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
minS = (CostType)_mm_cvtsi128_si32(qS);
qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
qS = _mm_cmpeq_epi16(_minS, qS);
int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
bestDisp = bestDispBuf[LSBTab[idx]];
}
else
#endif
{
for( d = 0; d < D; d++ )
{
int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
Lr_p[d] = (CostType)L0;
minL0 = std::min(minL0, L0);
int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
if( Sval < minS )
{
minS = Sval;
bestDisp = d;
}
}
minLr[0][xm] = (CostType)minL0;
}
}
else
{
for( d = 0; d < D; d++ )
{
int Sval = Sp[d];
if( Sval < minS )
{
minS = Sval;
bestDisp = d;
}
}
}
for( d = 0; d < D; d++ )
{
if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
break;
}
if( d < D )
continue;
d = bestDisp;
int _x2 = x + minX1 - d - minD;
if( disp2cost[_x2] > minS )
{
disp2cost[_x2] = (CostType)minS;
disp2ptr[_x2] = (DispType)(d + minD);
}
if( 0 < d && d < D-1 )
{
// do subpixel quadratic interpolation:
// fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
// then find minimum of the parabola.
int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
}
else
d *= DISP_SCALE;
disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
}
for( x = minX1; x < maxX1; x++ )
{
// we round the computed disparity both towards -inf and +inf and check
// if either of the corresponding disparities in disp2 is consistent.
// This is to give the computed disparity a chance to look valid if it is.
int d1 = disp1ptr[x];
if( d1 == INVALID_DISP_SCALED )
continue;
int _d = d1 >> DISP_SHIFT;
int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
int _x = x - _d, x_ = x - d_;
if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
}
}
// now shift the cyclic buffers
std::swap( Lr[0], Lr[1] );
std::swap( minLr[0], minLr[1] );
}
}
}
class StereoBinarySGBMImpl : public StereoBinarySGBM
{
public:
StereoBinarySGBMImpl()
{
params = StereoBinarySGBMParams();
}
StereoBinarySGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
int _mode )
{
params = StereoBinarySGBMParams( _minDisparity, _numDisparities, _SADWindowSize,
_P1, _P2, _disp12MaxDiff, _preFilterCap,
_uniquenessRatio, _speckleWindowSize, _speckleRange,
_mode );
}
void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
{
Mat left = leftarr.getMat(), right = rightarr.getMat();
CV_Assert( left.size() == right.size() && left.type() == right.type() &&
left.depth() == CV_8U );
disparr.create( left.size(), CV_16S );
Mat disp = disparr.getMat();
computeDisparityBinarySGBM( left, right, disp, params, buffer );
medianBlur(disp, disp, 3);
if( params.speckleWindowSize > 0 )
filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
}
int getMinDisparity() const { return params.minDisparity; }
void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
int getNumDisparities() const { return params.numDisparities; }
void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
int getBlockSize() const { return params.SADWindowSize; }
void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
int getSpeckleWindowSize() const { return params.speckleWindowSize; }
void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
int getSpeckleRange() const { return params.speckleRange; }
void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
int getPreFilterCap() const { return params.preFilterCap; }
void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
int getUniquenessRatio() const { return params.uniquenessRatio; }
void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
int getP1() const { return params.P1; }
void setP1(int P1) { params.P1 = P1; }
int getP2() const { return params.P2; }
void setP2(int P2) { params.P2 = P2; }
int getMode() const { return params.mode; }
void setMode(int mode) { params.mode = mode; }
void write(FileStorage& fs) const
{
fs << "name" << name_
<< "minDisparity" << params.minDisparity
<< "numDisparities" << params.numDisparities
<< "blockSize" << params.SADWindowSize
<< "speckleWindowSize" << params.speckleWindowSize
<< "speckleRange" << params.speckleRange
<< "disp12MaxDiff" << params.disp12MaxDiff
<< "preFilterCap" << params.preFilterCap
<< "uniquenessRatio" << params.uniquenessRatio
<< "P1" << params.P1
<< "P2" << params.P2
<< "mode" << params.mode;
}
void read(const FileNode& fn)
{
FileNode n = fn["name"];
CV_Assert( n.isString() && String(n) == name_ );
params.minDisparity = (int)fn["minDisparity"];
params.numDisparities = (int)fn["numDisparities"];
params.SADWindowSize = (int)fn["blockSize"];
params.speckleWindowSize = (int)fn["speckleWindowSize"];
params.speckleRange = (int)fn["speckleRange"];
params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
params.preFilterCap = (int)fn["preFilterCap"];
params.uniquenessRatio = (int)fn["uniquenessRatio"];
params.P1 = (int)fn["P1"];
params.P2 = (int)fn["P2"];
params.mode = (int)fn["mode"];
}
StereoBinarySGBMParams params;
Mat buffer;
static const char* name_;
};
const char* StereoBinarySGBMImpl::name_ = "StereoMatcher.SGBM";
Ptr<StereoBinarySGBM> StereoBinarySGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
int P1, int P2, int disp12MaxDiff,
int preFilterCap, int uniquenessRatio,
int speckleWindowSize, int speckleRange,
int mode)
{
return Ptr<StereoBinarySGBM>(
new StereoBinarySGBMImpl(minDisparity, numDisparities, SADWindowSize,
P1, P2, disp12MaxDiff,
preFilterCap, uniquenessRatio,
speckleWindowSize, speckleRange,
mode));
}
typedef cv::Point_<short> Point2s;
}
}
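As with the block matcher, a minimal usage sketch of the binary SGBM matcher above may help. It is an editor's illustration, not part of this patch; the file names are hypothetical, and the P1/P2 rule of thumb (8*win^2 and 32*win^2 per channel) is a common convention rather than anything mandated by this code.
#include "opencv2/stereo.hpp"
#include "opencv2/imgcodecs.hpp"

int main()
{
    cv::Mat left  = cv::imread("left.png",  cv::IMREAD_GRAYSCALE);  // hypothetical rectified pair
    cv::Mat right = cv::imread("right.png", cv::IMREAD_GRAYSCALE);
    const int win = 9, ndisp = 64;                                  // ndisp % 16 == 0 is required
    cv::Ptr<cv::stereo::StereoBinarySGBM> sgbm = cv::stereo::StereoBinarySGBM::create(
        0, ndisp, win,
        8 * win * win,    // P1: penalty for +/-1 disparity changes between neighbors
        32 * win * win,   // P2: penalty for larger jumps (P2 > P1 is enforced internally)
        1, 31, 10, 100, 32, cv::stereo::StereoBinarySGBM::MODE_SGBM);
    cv::Mat disp;
    sgbm->compute(left, right, disp);  // CV_16S, 4 fractional bits (DISP_SCALE == 16)
    return 0;
}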

@ -0,0 +1,45 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
CV_TEST_MAIN("cv")

@ -0,0 +1,31 @@
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic ignored "-Wmissing-prototypes"
# pragma GCC diagnostic ignored "-Wextra"
# endif
#endif
#ifndef __OPENCV_TEST_PRECOMP_HPP__
#define __OPENCV_TEST_PRECOMP_HPP__
#include <iostream>
#include "opencv2/ts.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/stereo.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/features2d.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/private.hpp"
#include "opencv2/core/cvdef.h"
#include "opencv2/core.hpp"
#include "opencv2/highgui.hpp"
#include <algorithm>
#include <cmath>
#endif

@ -110,9 +110,22 @@ int main(int argc, char** argv)
tick2 = cv::getTickCount();
cout << endl << "PPF Elapsed Time " <<
(tick2-tick1)/cv::getTickFrequency() << " sec" << endl;
// Get only first N results
int N = 2;
//check results size from match call above
size_t results_size = results.size();
cout << "Number of matching poses: " << results_size;
if (results_size == 0) {
cout << endl << "No matching poses found. Exiting." << endl;
exit(0);
}
// Get only the first N results, but reduce N if fewer matches than requested were found
size_t N = 2;
if (results_size < N) {
cout << endl << "Reducing matching poses to be reported (as specified in code): "
<< N << " to the number of matches found: " << results_size << endl;
N = results_size;
}
vector<Pose3DPtr> resultsSub(results.begin(),results.begin()+N);
// Create an instance of ICP

@ -375,7 +375,7 @@ void PPF3DDetector::clusterPoses(std::vector<Pose3DPtr> poseList, int numPoses,
#pragma omp parallel for
#endif
// uses weighting by the number of votes
for (size_t i=0; i<poseClusters.size(); i++)
for (int i=0; i<static_cast<int>(poseClusters.size()); i++)
{
// Only the quaternions can be averaged meaningfully, so they are used here
double qAvg[4]={0}, tAvg[3]={0};
@ -426,7 +426,7 @@ void PPF3DDetector::clusterPoses(std::vector<Pose3DPtr> poseList, int numPoses,
#if defined _OPENMP
#pragma omp parallel for
#endif
for (size_t i=0; i<poseClusters.size(); i++)
for (int i=0; i<static_cast<int>(poseClusters.size()); i++)
{
// Only the quaternions can be averaged meaningfully, so they are used here
double qAvg[4]={0}, tAvg[3]={0};

@ -240,6 +240,119 @@ types.
*/
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename);
/* OCR BeamSearch Decoder */
/** @brief OCRBeamSearchDecoder class provides an interface for OCR using the Beam Search algorithm.
@note
- (C++) An example on using OCRBeamSearchDecoder recognition combined with scene text detection can
be found at the demo sample:
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/word_recognition.cpp>
*/
class CV_EXPORTS OCRBeamSearchDecoder : public BaseOCR
{
public:
/** @brief The character classifier callback is wrapped in a class.
This way it hides the feature extractor and the classifier itself, so developers can write
their own OCR code.
The default character classifier and feature extractor can be loaded using the utility function
loadOCRBeamSearchClassifierCNN with all its parameters provided in
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRBeamSearch_CNN_model_data.xml.gz>.
*/
class CV_EXPORTS ClassifierCallback
{
public:
virtual ~ClassifierCallback() { }
/** @brief The character classifier must return a (ranked list of) class id(s)
@param image Input image CV_8UC1 or CV_8UC3 with a single letter.
@param recognition_probabilities For each of the N characters found the classifier returns a list with
class probabilities for each class.
@param oversegmentation The classifier returns a list of N+1 character locations' x-coordinates,
including 0 as start-sequence location.
*/
virtual void eval( InputArray image, std::vector< std::vector<double> >& recognition_probabilities, std::vector<int>& oversegmentation );
};
public:
/** @brief Recognize text using Beam Search.
Takes an image as input and returns the recognized text in the output_text parameter. Optionally
also provides the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 with a single text line (or word).
@param output_text Output text. Most likely character sequence found by the HMM decoder.
@param component_rects If provided the method will output a list of Rects for the individual
text elements found (e.g. words).
@param component_texts If provided the method will output a list of text strings for the
recognition of individual text elements found (e.g. words).
@param component_confidences If provided the method will output a list of confidence values
for the recognition of individual text elements found (e.g. words).
@param component_level Only OCR_LEVEL_WORD is supported.
*/
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder.
@param classifier The character classifier with built in feature extractor.
@param vocabulary The language vocabulary (chars when ASCII English text). vocabulary.size()
must be equal to the number of classes of the classifier.
@param transition_probabilities_table Table with transition probabilities between character
pairs. cols == rows == vocabulary.size().
@param emission_probabilities_table Table with observation emission probabilities. cols ==
rows == vocabulary.size().
@param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available for the moment
(<http://en.wikipedia.org/wiki/Viterbi_algorithm>).
@param beam_size Size of the beam in Beam Search algorithm.
*/
static Ptr<OCRBeamSearchDecoder> create(const Ptr<OCRBeamSearchDecoder::ClassifierCallback> classifier,// The character classifier with built in feature extractor
const std::string& vocabulary, // The language vocabulary (chars when ascii english text)
// size() must be equal to the number of classes
InputArray transition_probabilities_table, // Table with transition probabilities between character pairs
// cols == rows == vocabulary.size()
InputArray emission_probabilities_table, // Table with observation emission probabilities
// cols == rows == vocabulary.size()
decoder_mode mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment)
int beam_size = 50); // Size of the beam in Beam Search algorithm
protected:
Ptr<OCRBeamSearchDecoder::ClassifierCallback> classifier;
std::string vocabulary;
Mat transition_p;
Mat emission_p;
decoder_mode mode;
int beam_size;
};
/** @brief Allows loading the default character classifier implicitly when creating an OCRBeamSearchDecoder object.
@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
The default classifier is based on the scene text recognition method proposed by Adam Coates &
Andrew Ng in [Coates11a]. The character classifier consists of a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location.
*/
CV_EXPORTS Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClassifierCNN(const std::string& filename);
//! @}
}

@ -0,0 +1,66 @@
/*
* cropped_word_recognition.cpp
*
* A demo program of Scene Text cropped word Recognition:
* Shows the use of the OCRBeamSearchDecoder class with the Single Layer CNN character classifier described in:
* Coates, Adam, et al.: Text detection and character recognition in scene images with unsupervised feature learning, ICDAR 2011
*
* Created on: Jul 31, 2014
* Author: Lluis Gomez i Bigorda <lgomez AT cvc.uab.es>
*/
#include "opencv2/text.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include <iostream>
using namespace std;
using namespace cv;
using namespace cv::text;
//Perform text recognition in a given cropped word
int main(int argc, char* argv[])
{
cout << endl << argv[0] << endl << endl;
cout << "A demo program of Scene Text cropped word Recognition: " << endl;
cout << "Shows the use of the OCRBeamSearchDecoder class using the Single Layer CNN character classifier described in:" << endl;
cout << "Coates, Adam, et al. \"Text detection and character recognition in scene images with unsupervised feature learning.\" ICDAR 2011." << endl << endl;
Mat image;
if(argc>1)
image = imread(argv[1]);
else
{
cout << " Usage: " << argv[0] << " <input_image>" << endl << endl;
return(0);
}
Mat transition_p;
string filename = "OCRHMM_transitions_table.xml"; // TODO this table was done with a different vocabulary order?
// TODO add a new function in ocr.cpp to create transition tab
// for a given lexicon
FileStorage fs(filename, FileStorage::READ);
fs["transition_probabilities"] >> transition_p;
fs.release();
Mat emission_p = Mat::eye(62,62,CV_64FC1);
string voc = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyx0123456789";
Ptr<OCRBeamSearchDecoder> ocr = OCRBeamSearchDecoder::create(
loadOCRBeamSearchClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
voc, transition_p, emission_p);
double t_r = (double)getTickCount();
string output;
vector<Rect> boxes;
vector<string> words;
vector<float> confidences;
ocr->run(image, output, &boxes, &words, &confidences, OCR_LEVEL_WORD);
cout << "OCR output = \"" << output << "\". Decoded in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl << endl;
return 0;
}

Binary file not shown.

Binary file not shown.

@ -287,7 +287,11 @@ void ERFilterNM::er_tree_extract( InputArray image )
quads[1][3] = (1<<3)|(1<<2)|(1<<1);
quads[2][0] = (1<<2)|(1<<1);
quads[2][1] = (1<<3)|(1);
quads[2][3] = 255;
// quads[2][2] and quads[2][3] are never used so no need to initialize them.
// The four lowest bits in each quads[i][j] correspond to the 2x2 binary patterns
// Q_1, Q_2, Q_3 in the Neumann and Matas CVPR 2012 paper
// (see page 4, at the end of the first column).
// Q_1 and Q_2 have four patterns, while Q_3 has only two.
// masks to know if a pixel is accessible and if it has been already added to some region

@ -0,0 +1,656 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/ml.hpp"
#include <iostream>
#include <fstream>
#include <set>
namespace cv
{
namespace text
{
using namespace std;
using namespace cv::ml;
/* OCR BeamSearch Decoder */
void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector<Rect>* component_rects,
vector<string>* component_texts, vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
output_text.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
}
void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
{
CV_Assert(( image.getMat().type() == CV_8UC3 ) || ( image.getMat().type() == CV_8UC1 ));
if (!recognition_probabilities.empty())
{
for (size_t i=0; i<recognition_probabilities.size(); i++)
recognition_probabilities[i].clear();
}
recognition_probabilities.clear();
oversegmentation.clear();
}
bool beam_sort_function ( pair< double,vector<int> > i, pair< double,vector<int> > j );
bool beam_sort_function ( pair< double,vector<int> > i, pair< double,vector<int> > j )
{
return (i.first > j.first);
}
class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder
{
public:
//Default constructor
OCRBeamSearchDecoderImpl( Ptr<OCRBeamSearchDecoder::ClassifierCallback> _classifier,
const string& _vocabulary,
InputArray transition_probabilities_table,
InputArray emission_probabilities_table,
decoder_mode _mode,
int _beam_size)
{
classifier = _classifier;
transition_p = transition_probabilities_table.getMat();
emission_p = emission_probabilities_table.getMat();
vocabulary = _vocabulary;
mode = _mode;
beam_size = _beam_size;
}
~OCRBeamSearchDecoderImpl()
{
}
void run( Mat& src,
string& out_sequence,
vector<Rect>* component_rects,
vector<string>* component_texts,
vector<float>* component_confidences,
int component_level)
{
CV_Assert( (src.type() == CV_8UC1) || (src.type() == CV_8UC3) );
CV_Assert( (src.cols > 0) && (src.rows > 0) );
CV_Assert( component_level == OCR_LEVEL_WORD );
out_sequence.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
// TODO split a line into words
if(src.type() == CV_8UC3)
{
cvtColor(src,src,COLOR_BGR2GRAY); // imread loads color images as BGR
}
vector< vector<double> > recognition_probabilities;
vector<int> oversegmentation;
classifier->eval(src, recognition_probabilities, oversegmentation);
/* Now run the beam search algorithm to optimize the recognition score */
//convert probabilities to log probabilities
for (size_t i=0; i<recognition_probabilities.size(); i++)
{
for (size_t j=0; j<recognition_probabilities[i].size(); j++)
{
if (recognition_probabilities[i][j] == 0)
recognition_probabilities[i][j] = -DBL_MAX;
else
recognition_probabilities[i][j] = log(recognition_probabilities[i][j]);
}
}
for (int i=0; i<transition_p.rows; i++)
{
for (int j=0; j<transition_p.cols; j++)
{
if (transition_p.at<double>(i,j) == 0)
transition_p.at<double>(i,j) = -DBL_MAX;
else
transition_p.at<double>(i,j) = log(transition_p.at<double>(i,j));
}
}
//TODO it would be interesting to have a hash table with a vector of booleans,
// but this is not feasible when the number of possible segmentations is large.
//vector<bool> visited_nodes(pow(2,oversegmentation.size()),false); // hash table for visited nodes
// An alternative is to use std::set<unsigned long long int> to store only the keys of
// visited nodes, at some cost in time performance.
set<unsigned long long int> visited_nodes; //TODO make it member of class
// it is also possible to reduce the number of seg. points in some way (e.g. use only seg.points
// for which there is a change on the class prediction)
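// For illustration: with oversegmentation {0,3,5,9} the search starts from {0,9};
// generate_childs() then produces {0,3,9} and {0,5,9}, each child adding exactly one
// unused segmentation point.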
vector<int> start_segmentation;
start_segmentation.push_back(oversegmentation[0]);
start_segmentation.push_back(oversegmentation[oversegmentation.size()-1]);
vector< pair< double,vector<int> > > beam;
beam.push_back( pair< double,vector<int> > (score_segmentation(start_segmentation, recognition_probabilities, out_sequence), start_segmentation) );
vector< vector<int> > childs = generate_childs(start_segmentation,oversegmentation, visited_nodes);
if (!childs.empty())
update_beam( beam, childs, recognition_probabilities);
//cout << "beam size " << beam.size() << " best score " << beam[0].first<< endl;
int generated_childs = (int)childs.size();
while (generated_childs != 0)
{
generated_childs = 0;
vector< pair< double,vector<int> > > old_beam = beam;
for (size_t i=0; i<old_beam.size(); i++)
{
childs = generate_childs(old_beam[i].second,oversegmentation, visited_nodes);
if (!childs.empty())
update_beam( beam, childs, recognition_probabilities);
generated_childs += (int)childs.size();
}
//cout << "beam size " << beam.size() << " best score " << beam[0].first << endl;
}
// FINISHED ! Get the best prediction found into out_sequence
score_segmentation(beam[0].second, recognition_probabilities, out_sequence);
// TODO fill other output parameters
return;
}
private:
////////////////////////////////////////////////////////////
// TODO the way we expand nodes makes the recognition score heuristic non-monotonic;
// the search should start from the leftmost node 0 and always grow to the right.
vector< vector<int> > generate_childs(vector<int> &segmentation, vector<int> &oversegmentation, set<unsigned long long int> &visited_nodes)
{
/*cout << " generate childs for [";
for (size_t i = 0 ; i < segmentation .size(); i++)
cout << segmentation[i] << ",";
cout << "] ";*/
vector< vector<int> > childs;
for (size_t i=0; i<oversegmentation.size(); i++)
{
int seg_point = oversegmentation[i];
if (find(segmentation.begin(), segmentation.end(), seg_point) == segmentation.end())
{
//cout << seg_point << " " ;
vector<int> child = segmentation;
child.push_back(seg_point);
sort(child.begin(), child.end());
unsigned long long int key = 0;
for (size_t j=0; j<child.size(); j++)
{
// set the bit at the index of child[j] within oversegmentation (equivalent to summing powers of two)
size_t idx = (size_t)(find(oversegmentation.begin(), oversegmentation.end(), child[j]) - oversegmentation.begin());
key |= 1ULL << idx;
}
//if (!visited_nodes[key])
if (visited_nodes.find(key) == visited_nodes.end())
{
childs.push_back(child);
//visited_nodes[key] = true;
visited_nodes.insert(key);
}
}
}
//cout << endl;
return childs;
}
////////////////////////////////////////////////////////////
//TODO shall the beam itself be a member of the class?
void update_beam (vector< pair< double,vector<int> > > &beam, vector< vector<int> > &childs, vector< vector<double> > &recognition_probabilities)
{
string out_sequence;
double min_score = -DBL_MAX; //min score value to be part of the beam
if ((int)beam.size() == beam_size)
min_score = beam[beam.size()-1].first; //last element has the lowest score
for (size_t i=0; i<childs.size(); i++)
{
double score = score_segmentation(childs[i], recognition_probabilities, out_sequence);
if (score > min_score)
{
beam.push_back(pair< double,vector<int> >(score,childs[i]));
sort(beam.begin(),beam.end(),beam_sort_function);
if ((int)beam.size() > beam_size)
{
beam.pop_back();
min_score = beam[beam.size()-1].first;
}
}
}
}
////////////////////////////////////////////////////////////
// TODO Add heuristics to the score function (see PhotoOCR paper)
// e.g.: in some cases we discard a segmentation because it includes a very large character
// in other cases we do it because the overlapping between two chars is too large
// etc.
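// The scoring below is a standard Viterbi pass in log space (a sketch matching the code):
//   V(0,i) = log start_p(i) + log P(obs_1 | i)
//   V(t,i) = max_j [ V(t-1,j) + log transition_p(j,i) ] + log P(obs_{t+1} | i)
// where P(obs | i) is the classifier probability for character i in the window between
// consecutive segmentation points; the final score is max_i V(T-1,i) / T, with T the
// number of windows.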
double score_segmentation(vector<int> &segmentation, vector< vector<double> > &observations, string& outstring)
{
//TODO This must be extracted from dictionary
vector<double> start_p(vocabulary.size());
for (int i=0; i<(int)vocabulary.size(); i++)
start_p[i] = log(1.0/vocabulary.size());
Mat V = Mat::ones((int)segmentation.size()-1,(int)vocabulary.size(),CV_64FC1);
V = V * -DBL_MAX;
vector<string> path(vocabulary.size());
// Initialize base cases (t == 0)
for (int i=0; i<(int)vocabulary.size(); i++)
{
V.at<double>(0,i) = start_p[i] + observations[segmentation[1]-1][i];
path[i] = vocabulary.at(i);
}
// Run Viterbi for t > 0
for (int t=1; t<(int)segmentation.size()-1; t++)
{
vector<string> newpath(vocabulary.size());
for (int i=0; i<(int)vocabulary.size(); i++)
{
double max_prob = -DBL_MAX;
int best_idx = 0;
for (int j=0; j<(int)vocabulary.size(); j++)
{
double prob = V.at<double>(t-1,j) + transition_p.at<double>(j,i) + observations[segmentation[t+1]-1][i];
if ( prob > max_prob)
{
max_prob = prob;
best_idx = j;
}
}
V.at<double>(t,i) = max_prob;
newpath[i] = path[best_idx] + vocabulary.at(i);
}
// Don't need to remember the old paths
path.swap(newpath);
}
double max_prob = -DBL_MAX;
int best_idx = 0;
for (int i=0; i<(int)vocabulary.size(); i++)
{
double prob = V.at<double>((int)segmentation.size()-2,i);
if ( prob > max_prob)
{
max_prob = prob;
best_idx = i;
}
}
//cout << " score " << max_prob / (segmentation.size()-1) << " " << path[best_idx] << endl;
outstring = path[best_idx];
return max_prob / (segmentation.size()-1);
}
};
Ptr<OCRBeamSearchDecoder> OCRBeamSearchDecoder::create( Ptr<OCRBeamSearchDecoder::ClassifierCallback> _classifier,
const string& _vocabulary,
InputArray transition_p,
InputArray emission_p,
decoder_mode _mode,
int _beam_size)
{
return makePtr<OCRBeamSearchDecoderImpl>(_classifier, _vocabulary, transition_p, emission_p, _mode, _beam_size);
}
class CV_EXPORTS OCRBeamSearchClassifierCNN : public OCRBeamSearchDecoder::ClassifierCallback
{
public:
//constructor
OCRBeamSearchClassifierCNN(const std::string& filename);
// Destructor
~OCRBeamSearchClassifierCNN() {}
void eval( InputArray src, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation );
protected:
void normalizeAndZCA(Mat& patches);
double eval_feature(Mat& feature, double* prob_estimates);
private:
//TODO implement getters/setters for some of these members (where applicable)
int nr_class; // number of classes
int nr_feature; // number of features
Mat feature_min; // scale range
Mat feature_max;
Mat weights; // Logistic Regression weights
Mat kernels; // CNN kernels
Mat M, P; // ZCA Whitening parameters
int step_size; // sliding window step
int window_size; // window size
int quad_size;
int patch_size;
int num_quads; // extract 25 quads (12x12) from each image
int num_tiles; // extract 25 patches (8x8) from each quad
double alpha; // used in non-linear activation function z = max(0, |D*a| - alpha)
};
OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename)
{
if (ifstream(filename.c_str()))
{
FileStorage fs(filename, FileStorage::READ);
// Load the kernels bank and whitening params
fs["kernels"] >> kernels;
fs["M"] >> M;
fs["P"] >> P;
// Load Logistic Regression weights
fs["weights"] >> weights;
// Load feature scaling ranges
fs["feature_min"] >> feature_min;
fs["feature_max"] >> feature_max;
fs.release();
// TODO check all matrix dimensions match correctly and no one is empty
}
else
CV_Error(Error::StsBadArg, "Default classifier data file not found!");
nr_feature = weights.rows;
nr_class = weights.cols;
// TODO some of this can be inferred from the input file (e.g. patch size must be sqrt(kernels.cols))
step_size = 4;
window_size = 32;
quad_size = 12;
patch_size = 8;
num_quads = 25;
num_tiles = 25;
alpha = 0.5;
}
void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
{
CV_Assert(( _src.getMat().type() == CV_8UC3 ) || ( _src.getMat().type() == CV_8UC1 ));
if (!recognition_probabilities.empty())
{
for (size_t i=0; i<recognition_probabilities.size(); i++)
recognition_probabilities[i].clear();
}
recognition_probabilities.clear();
oversegmentation.clear();
Mat src = _src.getMat();
if(src.type() == CV_8UC3)
{
cvtColor(src,src,COLOR_BGR2GRAY); // imread loads color images as BGR
}
// TODO shall we resize the input image or make a copy ?
resize(src,src,Size(window_size*src.cols/src.rows,window_size));
int seg_points = 0;
oversegmentation.push_back(seg_points);
Mat quad;
Mat tmp;
Mat img;
// begin sliding window loop foreach detection window
for (int x_c=0; x_c<=src.cols-window_size; x_c=x_c+step_size)
{
img = src(Rect(Point(x_c,0),Size(window_size,window_size)));
int patch_count = 0;
vector< vector<double> > data_pool(9);
int quad_id = 1;
for (int q_x=0; q_x<=window_size-quad_size; q_x=q_x+(quad_size/2-1))
{
for (int q_y=0; q_y<=window_size-quad_size; q_y=q_y+(quad_size/2-1))
{
Rect quad_rect = Rect(q_x,q_y,quad_size,quad_size);
quad = img(quad_rect);
//start sliding window (8x8) in each tile and store the patch as row in data_pool
for (int w_x=0; w_x<=quad_size-patch_size; w_x++)
{
for (int w_y=0; w_y<=quad_size-patch_size; w_y++)
{
quad(Rect(w_x,w_y,patch_size,patch_size)).copyTo(tmp);
tmp = tmp.reshape(0,1);
tmp.convertTo(tmp, CV_64F);
normalizeAndZCA(tmp);
vector<double> patch;
tmp.copyTo(patch);
if ((quad_id == 1)||(quad_id == 2)||(quad_id == 6)||(quad_id == 7))
data_pool[0].insert(data_pool[0].end(),patch.begin(),patch.end());
if ((quad_id == 2)||(quad_id == 7)||(quad_id == 3)||(quad_id == 8)||(quad_id == 4)||(quad_id == 9))
data_pool[1].insert(data_pool[1].end(),patch.begin(),patch.end());
if ((quad_id == 4)||(quad_id == 9)||(quad_id == 5)||(quad_id == 10))
data_pool[2].insert(data_pool[2].end(),patch.begin(),patch.end());
if ((quad_id == 6)||(quad_id == 11)||(quad_id == 16)||(quad_id == 7)||(quad_id == 12)||(quad_id == 17))
data_pool[3].insert(data_pool[3].end(),patch.begin(),patch.end());
if ((quad_id == 7)||(quad_id == 12)||(quad_id == 17)||(quad_id == 8)||(quad_id == 13)||(quad_id == 18)||(quad_id == 9)||(quad_id == 14)||(quad_id == 19))
data_pool[4].insert(data_pool[4].end(),patch.begin(),patch.end());
if ((quad_id == 9)||(quad_id == 14)||(quad_id == 19)||(quad_id == 10)||(quad_id == 15)||(quad_id == 20))
data_pool[5].insert(data_pool[5].end(),patch.begin(),patch.end());
if ((quad_id == 16)||(quad_id == 21)||(quad_id == 17)||(quad_id == 22))
data_pool[6].insert(data_pool[6].end(),patch.begin(),patch.end());
if ((quad_id == 17)||(quad_id == 22)||(quad_id == 18)||(quad_id == 23)||(quad_id == 19)||(quad_id == 24))
data_pool[7].insert(data_pool[7].end(),patch.begin(),patch.end());
if ((quad_id == 19)||(quad_id == 24)||(quad_id == 20)||(quad_id == 25))
data_pool[8].insert(data_pool[8].end(),patch.begin(),patch.end());
patch_count++;
}
}
quad_id++;
}
}
//compute the dot product of each normalized and whitened patch with the kernel bank;
//the responses are accumulated per pool, yielding a 9xD representation
Mat feature = Mat::zeros(9,kernels.rows,CV_64FC1);
for (int i=0; i<9; i++)
{
Mat pool = Mat(data_pool[i]);
pool = pool.reshape(0,(int)data_pool[i].size()/kernels.cols);
for (int p=0; p<pool.rows; p++)
{
for (int f=0; f<kernels.rows; f++)
{
feature.row(i).at<double>(0,f) = feature.row(i).at<double>(0,f) + max(0.0,std::abs(pool.row(p).dot(kernels.row(f)))-alpha);
}
}
}
feature = feature.reshape(0,1);
// data must be normalized within the range obtained during training
double lower = -1.0;
double upper = 1.0;
for (int k=0; k<feature.cols; k++)
{
feature.at<double>(0,k) = lower + (upper-lower) *
(feature.at<double>(0,k)-feature_min.at<double>(0,k))/
(feature_max.at<double>(0,k)-feature_min.at<double>(0,k));
}
double *p = new double[nr_class];
double predict_label = eval_feature(feature,p);
//cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
if (predict_label < 0) // TODO use cvError
cout << "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()" << endl;
seg_points++;
oversegmentation.push_back(seg_points);
vector<double> recognition_p(p, p+nr_class); // copy exactly nr_class scores
recognition_probabilities.push_back(recognition_p);
delete[] p;
}
}
// normalize for contrast and apply ZCA whitening to a set of image patches
void OCRBeamSearchClassifierCNN::normalizeAndZCA(Mat& patches)
{
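// A sketch of the whitening used here: on the first call, P is built from the
// eigendecomposition of the patch covariance as P = V * diag(1/sqrt(eigval + 0.1)) * V^T;
// every patch is then centered with the mean M and multiplied by P.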
//Normalize for contrast
for (int i=0; i<patches.rows; i++)
{
Scalar row_mean, row_std;
meanStdDev(patches.row(i),row_mean,row_std);
row_std[0] = sqrt(pow(row_std[0],2)*patches.cols/(patches.cols-1)+10);
patches.row(i) = (patches.row(i) - row_mean[0]) / row_std[0];
}
//ZCA whitening
if ((M.dims == 0) || (P.dims == 0))
{
Mat CC;
calcCovarMatrix(patches,CC,M,COVAR_NORMAL|COVAR_ROWS|COVAR_SCALE);
CC = CC * patches.rows / (patches.rows-1);
Mat e_val,e_vec;
eigen(CC.t(),e_val,e_vec);
e_vec = e_vec.t();
sqrt(1./(e_val + 0.1), e_val);
Mat V = Mat::zeros(e_vec.rows, e_vec.cols, CV_64FC1);
Mat D = Mat::eye(e_vec.rows, e_vec.cols, CV_64FC1);
for (int i=0; i<e_vec.cols; i++)
{
e_vec.col(e_vec.cols-i-1).copyTo(V.col(i));
D.col(i) = D.col(i) * e_val.at<double>(0,e_val.rows-i-1);
}
P = V * D * V.t();
}
for (int i=0; i<patches.rows; i++)
patches.row(i) = patches.row(i) - M;
patches = patches * P;
}
double OCRBeamSearchClassifierCNN::eval_feature(Mat& feature, double* prob_estimates)
{
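// One-vs-all logistic regression scoring: compute the linear score w_i . x for each
// class, take the argmax as the predicted label, then squash the scores with a sigmoid
// and normalize them so prob_estimates sums to 1.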
for(int i=0;i<nr_class;i++)
prob_estimates[i] = 0;
for(int idx=0; idx<nr_feature; idx++)
for(int i=0;i<nr_class;i++)
prob_estimates[i] += weights.at<float>(idx,i)*feature.at<double>(0,idx); //TODO use vectorized dot product
int dec_max_idx = 0;
for(int i=1;i<nr_class;i++)
{
if(prob_estimates[i] > prob_estimates[dec_max_idx])
dec_max_idx = i;
}
for(int i=0;i<nr_class;i++)
prob_estimates[i]=1/(1+exp(-prob_estimates[i]));
double sum=0;
for(int i=0; i<nr_class; i++)
sum+=prob_estimates[i];
for(int i=0; i<nr_class; i++)
prob_estimates[i]=prob_estimates[i]/sum;
return dec_max_idx;
}
Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClassifierCNN(const std::string& filename)
{
return makePtr<OCRBeamSearchClassifierCNN>(filename);
}
}
}

@ -127,7 +127,7 @@ public:
int component_level=0)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC1) );
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
#ifdef HAVE_TESSERACT

@ -0,0 +1,135 @@
/*---------------STEP 1---------------------*/
/* modify this file:
* opencv2/tracking/tracker.hpp
* and add a code snippet similar to
* the following:
*/
/*------------------------------------------*/
class CV_EXPORTS_W TrackerKCF : public Tracker
{
public:
struct CV_EXPORTS Params
{
Params();
void read( const FileNode& /*fn*/ );
void write( FileStorage& /*fs*/ ) const;
};
/** @brief Constructor
@param parameters KCF parameters TrackerKCF::Params
*/
BOILERPLATE_CODE("KCF",TrackerKCF);
};
/*---------------STEP 2---------------------*/
/* modify this file
* src/tracker.cpp
* add one line in function
* Ptr<Tracker> Tracker::create( const String& trackerType )
*/
/*------------------------------------------*/
Ptr<Tracker> Tracker::create( const String& trackerType )
{
BOILERPLATE_CODE("MIL",TrackerMIL);
BOILERPLATE_CODE("BOOSTING",TrackerBoosting);
BOILERPLATE_CODE("MEDIANFLOW",TrackerMedianFlow);
BOILERPLATE_CODE("TLD",TrackerTLD);
BOILERPLATE_CODE("KCF",TrackerKCF); // add this line!
return Ptr<Tracker>();
}
/*---------------STEP 3---------------------*/
/* create a new file and paste the snippet below,
* then modify it according to your needs.
* Also make sure to include the LICENSE header.
* src/trackerKCF.cpp
*/
/*------------------------------------------*/
/*---------------------------
| TrackerKCFModel
|---------------------------*/
namespace cv{
/**
* \brief Implementation of TrackerModel for the KCF algorithm
*/
class TrackerKCFModel : public TrackerModel{
public:
TrackerKCFModel(TrackerKCF::Params /*params*/){}
~TrackerKCFModel(){}
protected:
void modelEstimationImpl( const std::vector<Mat>& responses ){}
void modelUpdateImpl(){}
};
} /* namespace cv */
/*---------------------------
| TrackerKCF
|---------------------------*/
namespace cv{
/*
* Prototype
*/
class TrackerKCFImpl : public TrackerKCF{
public:
TrackerKCFImpl( const TrackerKCF::Params &parameters = TrackerKCF::Params() );
void read( const FileNode& fn );
void write( FileStorage& fs ) const;
protected:
bool initImpl( const Mat& image, const Rect2d& boundingBox );
bool updateImpl( const Mat& image, Rect2d& boundingBox );
TrackerKCF::Params params;
};
/*
* Constructor
*/
Ptr<TrackerKCF> TrackerKCF::createTracker(const TrackerKCF::Params &parameters){
return Ptr<TrackerKCFImpl>(new TrackerKCFImpl(parameters));
}
TrackerKCFImpl::TrackerKCFImpl( const TrackerKCF::Params &parameters ) :
params( parameters )
{
isInit = false;
}
void TrackerKCFImpl::read( const cv::FileNode& fn ){
params.read( fn );
}
void TrackerKCFImpl::write( cv::FileStorage& fs ) const{
params.write( fs );
}
bool TrackerKCFImpl::initImpl( const Mat& image, const Rect2d& boundingBox ){
model=Ptr<TrackerKCFModel>(new TrackerKCFModel(params));
return true;
}
bool TrackerKCFImpl::updateImpl( const Mat& image, Rect2d& boundingBox ){return true;}
/*
* Parameters
*/
TrackerKCF::Params::Params(){
}
void TrackerKCF::Params::read( const cv::FileNode& fn ){
}
void TrackerKCF::Params::write( cv::FileStorage& fs ) const{
}
} /* namespace cv */
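/*---------------USAGE----------------------*/
/* after STEPS 1-3 the new tracker can be
* created and driven like any other tracker.
* A minimal sketch (see also samples/kcf.cpp;
* firstFrame, frame and roi are placeholder
* variables):
*/
/*------------------------------------------*/
Ptr<Tracker> tracker = Tracker::create( "KCF" );
tracker->init( firstFrame, roi ); // roi: initial bounding box of the target
for (;;)
{
// grab the next frame into 'frame', then:
tracker->update( frame, roi ); // roi now holds the estimated target location
}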

@ -67,3 +67,29 @@
year={2013},
organization={IEEE}
}
@article{KCF,
title = {High-Speed Tracking with Kernelized Correlation Filters},
journal = {Pattern Analysis and Machine Intelligence, IEEE Transactions on},
author = {Henriques, J. F. and Caseiro, R. and Martins, P. and Batista, J.},
year = {2015},
doi = {10.1109/TPAMI.2014.2345390},
}
@inproceedings{KCF_ECCV,
title = {Exploiting the Circulant Structure of Tracking-by-detection with Kernels},
author = {Henriques, J. F. and Caseiro, R. and Martins, P. and Batista, J.},
booktitle = {Proceedings of the European Conference on Computer Vision},
year = {2012},
}
@INPROCEEDINGS{KCF_CN,
author={Danelljan, M. and Khan, F.S. and Felsberg, M. and van de Weijer, J.},
booktitle={Computer Vision and Pattern Recognition (CVPR), 2014 IEEE Conference on},
title={Adaptive Color Attributes for Real-Time Visual Tracking},
year={2014},
month={June},
pages={1090-1097},
keywords={computer vision;feature extraction;image colour analysis;image representation;image sequences;adaptive color attributes;benchmark color sequences;color features;color representations;computer vision;image description;real-time visual tracking;tracking-by-detection framework;Color;Computational modeling;Covariance matrices;Image color analysis;Kernel;Target tracking;Visualization;Adaptive Dimensionality Reduction;Appearance Model;Color Features;Visual Tracking},
doi={10.1109/CVPR.2014.143},
}

@ -1189,6 +1189,60 @@ class CV_EXPORTS_W TrackerTLD : public Tracker
BOILERPLATE_CODE("TLD",TrackerTLD);
};
/** @brief KCF is a novel tracking framework that utilizes the properties of circulant matrices to enhance processing speed.
* This tracking method is an implementation of @cite KCF_ECCV, extended to KCF with color-names features (@cite KCF_CN).
* The original KCF paper, as well as the Matlab implementation, is available at <http://home.isr.uc.pt/~henriques/circulant/index.html>.
* For more information about KCF with color-names features, please refer to
* <http://www.cvl.isy.liu.se/research/objrec/visualtracking/colvistrack/index.html>.
*/
class CV_EXPORTS_W TrackerKCF : public Tracker
{
public:
/**
* \brief Feature type to be used in tracking: grayscale, color-names, or compressed color-names.
* The modes available now:
- "GRAY" -- Use grayscale values as the feature
- "CN" -- Color-names feature
*/
enum MODE {GRAY, CN, CN2};
struct CV_EXPORTS Params
{
/**
* \brief Constructor
*/
Params();
/**
* \brief Read parameters from file, currently unused
*/
void read( const FileNode& /*fn*/ );
/**
* \brief Write parameters to file, currently unused
*/
void write( FileStorage& /*fs*/ ) const;
double sigma; //!< gaussian kernel bandwidth
double lambda; //!< regularization
double interp_factor; //!< linear interpolation factor for adaptation
double output_sigma_factor; //!< spatial bandwidth (proportional to target)
double pca_learning_rate; //!< compression learning rate
bool resize; //!< activate the resize feature to improve the processing speed
bool split_coeff; //!< split the training coefficients into two matrices
bool wrap_kernel; //!< wrap around the kernel values
bool compress_feature; //!< activate the pca method to compress the features
int max_patch_size; //!< threshold for the ROI size
int compressed_size; //!< feature size after compression
MODE descriptor; //!< descriptor type
};
/** @brief Constructor
@param parameters KCF parameters TrackerKCF::Params
*/
BOILERPLATE_CODE("KCF",TrackerKCF);
};
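// For reference, a sketch of the core identity from @cite KCF: training amounts to kernel
// ridge regression solved in the Fourier domain, alpha_hat = y_hat / (k_hat^xx + lambda),
// where k_hat^xx is the kernel autocorrelation of the training patch; sigma and lambda
// above parametrize the gaussian kernel and the regularizer.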
//! @}
} /* namespace cv */

@ -0,0 +1,194 @@
/*----------------------------------------------
* Usage:
* example_tracking_kcf <video_name>
*
* example:
* example_tracking_kcf Bolt/img/%04d.jpg
* example_tracking_kcf faceocc2.webm
*--------------------------------------------------*/
#include <opencv2/core/utility.hpp>
#include <opencv2/tracking.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/highgui.hpp>
#include <iostream>
#include <cstring>
using namespace std;
using namespace cv;
class BoxExtractor {
public:
Rect2d extract(Mat img);
Rect2d extract(const std::string& windowName, Mat img, bool showCrosshair = true);
struct handlerT{
bool isDrawing;
Rect2d box;
Mat image;
// initializer list
handlerT(): isDrawing(false) {};
}params;
private:
static void mouseHandler(int event, int x, int y, int flags, void *param);
void opencv_mouse_callback( int event, int x, int y, int , void *param );
};
int main( int argc, char** argv ){
// show help
if(argc<2){
cout<<
" Usage: example_tracking_kcf <video_name>\n"
" examples:\n"
" example_tracking_kcf Bolt/img/%04.jpg\n"
" example_tracking_kcf faceocc2.webm\n"
<< endl;
return 0;
}
// ROI selector
BoxExtractor box;
// create the tracker
Ptr<Tracker> tracker = Tracker::create( "KCF" );
// set input video
std::string video = argv[1];
VideoCapture cap(video);
Mat frame;
// get bounding box
cap >> frame;
Rect2d roi=box.extract("tracker",frame);
//quit if ROI was not selected
if(roi.width==0 || roi.height==0)
return 0;
// initialize the tracker
tracker->init(frame,roi);
// do the tracking
printf("Start the tracking process, press ESC to quit.\n");
for ( ;; ){
// get frame from the video
cap >> frame;
// stop the program if no more images
if(frame.rows==0 || frame.cols==0)
break;
// update the tracking result
tracker->update(frame,roi);
// draw the tracked object
rectangle( frame, roi, Scalar( 255, 0, 0 ), 2, 1 );
// show image with the tracked object
imshow("tracker",frame);
//quit on ESC button
if(waitKey(1)==27)break;
}
}
void BoxExtractor::mouseHandler(int event, int x, int y, int flags, void *param){
BoxExtractor *self = static_cast<BoxExtractor*>(param);
self->opencv_mouse_callback(event,x,y,flags,&self->params);
}
void BoxExtractor::opencv_mouse_callback( int event, int x, int y, int , void *param ){
handlerT * data = (handlerT*)param;
switch( event ){
// update the selected bounding box
case EVENT_MOUSEMOVE:
if( data->isDrawing ){
data->box.width = x-data->box.x;
data->box.height = y-data->box.y;
}
break;
// start to select the bounding box
case EVENT_LBUTTONDOWN:
data->isDrawing = true;
data->box = Rect2d( x, y, 0, 0 );
break;
// cleaning up the selected bounding box
case EVENT_LBUTTONUP:
data->isDrawing = false;
if( data->box.width < 0 ){
data->box.x += data->box.width;
data->box.width *= -1;
}
if( data->box.height < 0 ){
data->box.y += data->box.height;
data->box.height *= -1;
}
break;
}
}
Rect2d BoxExtractor::extract(Mat img){
return extract("Bounding Box Extractor", img);
}
Rect2d BoxExtractor::extract(const std::string& windowName, Mat img, bool showCrosshair){
int key=0;
// show the image and give feedback to user
imshow(windowName,img);
printf("Select an object to track and then press SPACE/BACKSPACE/ENTER button!\n");
// copy the data, rectangle should be drawn in the fresh image
params.image=img.clone();
// select the object
setMouseCallback( windowName, mouseHandler, (void *)this );
// end selection process on SPACE (32), ESC (27) or ENTER (13)
while(!(key==32 || key==27 || key==13)){
// draw the selected object
rectangle(
params.image,
params.box,
Scalar(255,0,0),2,1
);
// draw a crosshair in the middle of the bounding box
if(showCrosshair){
// horizontal line
line(
params.image,
Point((int)params.box.x,(int)(params.box.y+params.box.height/2)),
Point((int)(params.box.x+params.box.width),(int)(params.box.y+params.box.height/2)),
Scalar(255,0,0),2,1
);
// vertical line
line(
params.image,
Point((int)(params.box.x+params.box.width/2),(int)params.box.y),
Point((int)(params.box.x+params.box.width/2),(int)(params.box.y+params.box.height)),
Scalar(255,0,0),2,1
);
}
// show the image with the bounding box
imshow(windowName,params.image);
// reset the image
params.image=img.clone();
//get keyboard event
key=waitKey(1);
}
return params.box;
}

@ -48,8 +48,8 @@
using namespace std;
using namespace cv;
#define NUM_TEST_FRAMES 500
#define TEST_VIDEO_INDEX 1 //TLD Dataset Video Index from 1-10
#define NUM_TEST_FRAMES 100
#define TEST_VIDEO_INDEX 7 //TLD Dataset Video Index from 1-10
//#define RECORD_VIDEO_FLG
static Mat image;

File diff suppressed because it is too large

@ -0,0 +1,133 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
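// Both kernels below compute the normalized cross-correlation between 15x15 (N = 225)
// patches and the stored model samples:
//   NCC(a,b) = (sum(a*b) - sum(a)*sum(b)/N) / (sqrt(sum(a^2)-sum(a)^2/N) * sqrt(sum(b^2)-sum(b)^2/N))
// Work-items [0,500) handle positive samples and [500,1000) negative ones (500 appears
// to mirror MAX_EXAMPLES_IN_MODEL on the host side).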
__kernel void NCC(__global const uchar *patch,
__global const uchar *positiveSamples,
__global const uchar *negativeSamples,
__global float *ncc,
int posNum,
int negNum)
{
int id = get_global_id(0);
if (id >= 1000) return;
bool posFlg;
if (id < 500)
posFlg = true;
if (id >= 500)
{
//Negative index
id = id - 500;
posFlg = false;
}
//Variables
int s1 = 0, s2 = 0, n1 = 0, n2 = 0, prod = 0;
float sq1 = 0, sq2 = 0, ares = 0;
int N = 225;
//NCC with positive sample
if (posFlg && id < posNum)
{
for (int i = 0; i < N; i++)
{
s1 += positiveSamples[id * N + i];
s2 += patch[i];
n1 += positiveSamples[id * N + i] * positiveSamples[id * N + i];
n2 += patch[i] * patch[i];
prod += positiveSamples[id * N + i] * patch[i];
}
sq1 = sqrt(max(0.0, n1 - 1.0 * s1 * s1 / N));
sq2 = sqrt(max(0.0, n2 - 1.0 * s2 * s2 / N));
ares = (sq2 == 0) ? sq1 / fabs(sq1) : (prod - s1 * s2 / N) / sq1 / sq2;
ncc[id] = ares;
}
//NCC with negative sample
if (!posFlg && id < negNum)
{
for (int i = 0; i < N; i++)
{
s1 += negativeSamples[id * N + i];
s2 += patch[i];
n1 += negativeSamples[id * N + i] * negativeSamples[id * N + i];
n2 += patch[i] * patch[i];
prod += negativeSamples[id * N + i] * patch[i];
}
sq1 = sqrt(max(0.0, n1 - 1.0 * s1 * s1 / N));
sq2 = sqrt(max(0.0, n2 - 1.0 * s2 * s2 / N));
ares = (sq2 == 0) ? sq1 / fabs(sq1) : (prod - s1 * s2 / N) / sq1 / sq2;
ncc[id+500] = ares;
}
}
__kernel void batchNCC(__global const uchar *patches,
__global const uchar *positiveSamples,
__global const uchar *negativeSamples,
__global float *posNcc,
__global float *negNcc,
int posNum,
int negNum,
int patchNum)
{
int id = get_global_id(0);
bool posFlg;
if (id < 500*patchNum)
posFlg = true;
if (id >= 500*patchNum)
{
//Negative index
id = id - 500*patchNum;
posFlg = false;
}
int modelSampleID = id % 500;
int patchID = id / 500;
//Variables
int s1 = 0, s2 = 0, n1 = 0, n2 = 0, prod = 0;
float sq1 = 0, sq2 = 0, ares = 0;
int N = 225;
//NCC with positive sample
if (posFlg && modelSampleID < posNum)
{
for (int i = 0; i < N; i++)
{
s1 += positiveSamples[modelSampleID * N + i];
s2 += patches[patchID*N + i];
n1 += positiveSamples[modelSampleID * N + i] * positiveSamples[modelSampleID * N + i];
n2 += patches[patchID*N + i] * patches[patchID*N + i];
prod += positiveSamples[modelSampleID * N + i] * patches[patchID*N + i];
}
sq1 = sqrt(max(0.0, n1 - 1.0 * s1 * s1 / N));
sq2 = sqrt(max(0.0, n2 - 1.0 * s2 * s2 / N));
ares = (sq2 == 0) ? sq1 / fabs(sq1) : (prod - s1 * s2 / N) / sq1 / sq2;
posNcc[id] = ares;
}
//NCC with negative sample
if (!posFlg && modelSampleID < negNum)
{
for (int i = 0; i < N; i++)
{
s1 += negativeSamples[modelSampleID * N + i];
s2 += patches[patchID*N + i];
n1 += negativeSamples[modelSampleID * N + i] * negativeSamples[modelSampleID * N + i];
n2 += patches[patchID*N + i] * patches[patchID*N + i];
prod += negativeSamples[modelSampleID * N + i] * patches[patchID*N + i];
}
sq1 = sqrt(max(0.0, n1 - 1.0 * s1 * s1 / N));
sq2 = sqrt(max(0.0, n2 - 1.0 * s2 * s2 / N));
ares = (sq2 == 0) ? sq1 / fabs(sq1) : (prod - s1 * s2 / N) / sq1 / sq2;
negNcc[id] = ares;
}
}

@ -44,5 +44,11 @@
#include "opencv2/tracking.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/ocl.hpp"
namespace cv
{
extern const double ColorNames[][10];
}
#endif

@ -65,19 +65,232 @@ namespace cv
// Calculate Relative similarity of the patch (NN-Model)
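// A sketch of the similarity used below: S+ and S- are the maximum patch similarities
// 0.5*(NCC+1) over the positive and negative examples, and Sr = S+ / (S+ + S-).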
double TLDDetector::Sr(const Mat_<uchar>& patch)
{
/*
int64 e1, e2;
float t;
e1 = getTickCount();
double splus = 0.0, sminus = 0.0;
for (int i = 0; i < (int)(*positiveExamples).size(); i++)
splus = std::max(splus, 0.5 * (NCC((*positiveExamples)[i], patch) + 1.0));
for (int i = 0; i < (int)(*negativeExamples).size(); i++)
sminus = std::max(sminus, 0.5 * (NCC((*negativeExamples)[i], patch) + 1.0));
e2 = getTickCount();
t = (e2 - e1) / getTickFrequency()*1000.0;
printf("Sr: %f\n", t);
if (splus + sminus == 0.0)
return 0.0;
return splus / (sminus + splus);
*/
//int64 e1, e2;
//float t;
//e1 = getTickCount();
double splus = 0.0, sminus = 0.0;
Mat_<uchar> modelSample(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
for (int i = 0; i < *posNum; i++)
{
modelSample.data = &(posExp->data[i * 225]);
splus = std::max(splus, 0.5 * (NCC(modelSample, patch) + 1.0));
}
for (int i = 0; i < *negNum; i++)
{
modelSample.data = &(negExp->data[i * 225]);
sminus = std::max(sminus, 0.5 * (NCC(modelSample, patch) + 1.0));
}
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency()*1000.0;
//printf("Sr CPU: %f\n", t);
if (splus + sminus == 0.0)
return 0.0;
return splus / (sminus + splus);
}
double TLDDetector::ocl_Sr(const Mat_<uchar>& patch)
{
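// GPU variant of Sr(): upload the patch and the flat example buffers, run the "NCC"
// kernel over 1000 work-items (500 positive + 500 negative slots), read the
// correlations back, and do the max-reduction on the CPU.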
//int64 e1, e2, e3, e4;
//double t;
//e1 = getTickCount();
//e3 = getTickCount();
double splus = 0.0, sminus = 0.0;
UMat devPatch = patch.getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
UMat devPositiveSamples = posExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
UMat devNegativeSamples = negExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
UMat devNCC(1, 2*MAX_EXAMPLES_IN_MODEL, CV_32FC1, ACCESS_RW, USAGE_ALLOCATE_DEVICE_MEMORY);
ocl::Kernel k;
ocl::ProgramSource src = ocl::tracking::tldDetector_oclsrc;
String error;
ocl::Program prog(src, NULL, error);
k.create("NCC", prog);
if (k.empty())
printf("Kernel create failed!!!\n");
k.args(
ocl::KernelArg::PtrReadOnly(devPatch),
ocl::KernelArg::PtrReadOnly(devPositiveSamples),
ocl::KernelArg::PtrReadOnly(devNegativeSamples),
ocl::KernelArg::PtrWriteOnly(devNCC),
posNum,
negNum);
//e4 = getTickCount();
//t = (e4 - e3) / getTickFrequency()*1000.0;
//printf("Mem Cpy GPU: %f\n", t);
size_t globSize = 1000;
//e3 = getTickCount();
if (!k.run(1, &globSize, NULL, false))
printf("Kernel Run Error!!!");
//e4 = getTickCount();
//t = (e4 - e3) / getTickFrequency()*1000.0;
//printf("Kernel Run GPU: %f\n", t);
//e3 = getTickCount();
Mat resNCC = devNCC.getMat(ACCESS_READ);
//e4 = getTickCount();
//t = (e4 - e3) / getTickFrequency()*1000.0;
//printf("Read Mem GPU: %f\n", t);
////Compare
//Mat_<uchar> modelSample(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
//for (int i = 0; i < 200; i+=17)
//{
// modelSample.data = &(posExp->data[i * 225]);
// printf("%f\t%f\n\n", resNCC.at<float>(i), NCC(modelSample, patch));
//}
//for (int i = 0; i < 200; i+=23)
//{
// modelSample.data = &(negExp->data[i * 225]);
// printf("%f\t%f\n", resNCC.at<float>(500+i), NCC(modelSample, patch));
//}
for (int i = 0; i < *posNum; i++)
splus = std::max(splus, 0.5 * (resNCC.at<float>(i) + 1.0));
for (int i = 0; i < *negNum; i++)
sminus = std::max(sminus, 0.5 * (resNCC.at<float>(i+500) +1.0));
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency()*1000.0;
//printf("Sr GPU: %f\n\n", t);
if (splus + sminus == 0.0)
return 0.0;
return splus / (sminus + splus);
}
void TLDDetector::ocl_batchSrSc(const Mat_<uchar>& patches, double *resultSr, double *resultSc, int numOfPatches)
{
//int64 e1, e2, e3, e4;
//double t;
//e1 = getTickCount();
//e3 = getTickCount();
UMat devPatches = patches.getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
UMat devPositiveSamples = posExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
UMat devNegativeSamples = negExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
UMat devPosNCC(MAX_EXAMPLES_IN_MODEL, numOfPatches, CV_32FC1, ACCESS_RW, USAGE_ALLOCATE_DEVICE_MEMORY);
UMat devNegNCC(MAX_EXAMPLES_IN_MODEL, numOfPatches, CV_32FC1, ACCESS_RW, USAGE_ALLOCATE_DEVICE_MEMORY);
ocl::Kernel k;
ocl::ProgramSource src = ocl::tracking::tldDetector_oclsrc;
String error;
ocl::Program prog(src, NULL, error);
k.create("batchNCC", prog);
if (k.empty())
printf("Kernel create failed!!!\n");
k.args(
ocl::KernelArg::PtrReadOnly(devPatches),
ocl::KernelArg::PtrReadOnly(devPositiveSamples),
ocl::KernelArg::PtrReadOnly(devNegativeSamples),
ocl::KernelArg::PtrWriteOnly(devPosNCC),
ocl::KernelArg::PtrWriteOnly(devNegNCC),
posNum,
negNum,
numOfPatches);
//e4 = getTickCount();
//t = (e4 - e3) / getTickFrequency()*1000.0;
//printf("Mem Cpy GPU: %f\n", t);
// 2 -> Pos&Neg
size_t globSize = 2 * numOfPatches*MAX_EXAMPLES_IN_MODEL;
//e3 = getTickCount();
if (!k.run(1, &globSize, NULL, false))
printf("Kernel Run Error!!!");
//e4 = getTickCount();
//t = (e4 - e3) / getTickFrequency()*1000.0;
//printf("Kernel Run GPU: %f\n", t);
//e3 = getTickCount();
Mat posNCC = devPosNCC.getMat(ACCESS_READ);
Mat negNCC = devNegNCC.getMat(ACCESS_READ);
//e4 = getTickCount();
//t = (e4 - e3) / getTickFrequency()*1000.0;
//printf("Read Mem GPU: %f\n", t);
//Calculate Srs
for (int id = 0; id < numOfPatches; id++)
{
double spr = 0.0, smr = 0.0, spc = 0.0, smc = 0;
int med = getMedian((*timeStampsPositive));
for (int i = 0; i < *posNum; i++)
{
spr = std::max(spr, 0.5 * (posNCC.at<float>(id * 500 + i) + 1.0));
if ((int)(*timeStampsPositive)[i] <= med)
spc = std::max(spc, 0.5 * (posNCC.at<float>(id * 500 + i) + 1.0));
}
for (int i = 0; i < *negNum; i++)
smc = smr = std::max(smr, 0.5 * (negNCC.at<float>(id * 500 + i) + 1.0));
if (spr + smr == 0.0)
resultSr[id] = 0.0;
else
resultSr[id] = spr / (smr + spr);
if (spc + smc == 0.0)
resultSc[id] = 0.0;
else
resultSc[id] = spc / (smc + spc);
}
////Compare positive NCCs
/*Mat_<uchar> modelSample(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
Mat_<uchar> patch(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
for (int j = 0; j < numOfPatches; j++)
{
for (int i = 0; i < 1; i++)
{
modelSample.data = &(posExp->data[i * 225]);
patch.data = &(patches.data[j * 225]);
printf("%f\t%f\n", resultSr[j], Sr(patch));
printf("%f\t%f\n", resultSc[j], Sc(patch));
}
}*/
//for (int i = 0; i < 200; i+=23)
//{
// modelSample.data = &(negExp->data[i * 225]);
// printf("%f\t%f\n", resNCC.at<float>(500+i), NCC(modelSample, patch));
//}
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency()*1000.0;
//printf("Sr GPU: %f\n\n", t);
}
// Calculate Conservative similarity of the patch (NN-Model)
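// Conservative similarity: the same ratio as Sr, but S+ only considers positive examples
// whose timestamp is at most the median, i.e. the earlier half of the model.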
double TLDDetector::Sc(const Mat_<uchar>& patch)
{
/*
int64 e1, e2;
float t;
e1 = getTickCount();
double splus = 0.0, sminus = 0.0;
int med = getMedian((*timeStampsPositive));
for (int i = 0; i < (int)(*positiveExamples).size(); i++)
@ -87,6 +300,117 @@ namespace cv
}
for (int i = 0; i < (int)(*negativeExamples).size(); i++)
sminus = std::max(sminus, 0.5 * (NCC((*negativeExamples)[i], patch) + 1.0));
e2 = getTickCount();
t = (e2 - e1) / getTickFrequency()*1000.0;
printf("Sc: %f\n", t);
if (splus + sminus == 0.0)
return 0.0;
return splus / (sminus + splus);
*/
//int64 e1, e2;
//double t;
//e1 = getTickCount();
double splus = 0.0, sminus = 0.0;
Mat_<uchar> modelSample(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
int med = getMedian((*timeStampsPositive));
for (int i = 0; i < *posNum; i++)
{
if ((int)(*timeStampsPositive)[i] <= med)
{
modelSample.data = &(posExp->data[i * 225]);
splus = std::max(splus, 0.5 * (NCC(modelSample, patch) + 1.0));
}
}
for (int i = 0; i < *negNum; i++)
{
modelSample.data = &(negExp->data[i * 225]);
sminus = std::max(sminus, 0.5 * (NCC(modelSample, patch) + 1.0));
}
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency()*1000.0;
//printf("Sc: %f\n", t);
if (splus + sminus == 0.0)
return 0.0;
return splus / (sminus + splus);
}
double TLDDetector::ocl_Sc(const Mat_<uchar>& patch)
{
//int64 e1, e2, e3, e4;
//float t;
//e1 = getTickCount();
double splus = 0.0, sminus = 0.0;
//e3 = getTickCount();
UMat devPatch = patch.getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
UMat devPositiveSamples = posExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
UMat devNegativeSamples = negExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
UMat devNCC(1, 2 * MAX_EXAMPLES_IN_MODEL, CV_32FC1, ACCESS_RW, USAGE_ALLOCATE_DEVICE_MEMORY);
ocl::Kernel k;
ocl::ProgramSource src = ocl::tracking::tldDetector_oclsrc;
String error;
ocl::Program prog(src, NULL, error);
k.create("NCC", prog);
if (k.empty())
printf("Kernel create failed!!!\n");
k.args(
ocl::KernelArg::PtrReadOnly(devPatch),
ocl::KernelArg::PtrReadOnly(devPositiveSamples),
ocl::KernelArg::PtrReadOnly(devNegativeSamples),
ocl::KernelArg::PtrWriteOnly(devNCC),
posNum,
negNum);
//e4 = getTickCount();
//t = (e4 - e3) / getTickFrequency()*1000.0;
//printf("Mem Cpy GPU: %f\n", t);
size_t globSize = 1000;
//e3 = getTickCount();
if (!k.run(1, &globSize, NULL, false))
printf("Kernel Run Error!!!");
//e4 = getTickCount();
//t = (e4 - e3) / getTickFrequency()*1000.0;
//printf("Kernel Run GPU: %f\n", t);
//e3 = getTickCount();
Mat resNCC = devNCC.getMat(ACCESS_READ);
//e4 = getTickCount();
//t = (e4 - e3) / getTickFrequency()*1000.0;
//printf("Read Mem GPU: %f\n", t);
////Compare
//Mat_<uchar> modelSample(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
//for (int i = 0; i < 200; i+=17)
//{
// modelSample.data = &(posExp->data[i * 225]);
// printf("%f\t%f\n\n", resNCC.at<float>(i), NCC(modelSample, patch));
//}
//for (int i = 0; i < 200; i+=23)
//{
// modelSample.data = &(negExp->data[i * 225]);
// printf("%f\t%f\n", resNCC.at<float>(500+i), NCC(modelSample, patch));
//}
int med = getMedian((*timeStampsPositive));
for (int i = 0; i < *posNum; i++)
if ((int)(*timeStampsPositive)[i] <= med)
splus = std::max(splus, 0.5 * (resNCC.at<float>(i) +1.0));
for (int i = 0; i < *negNum; i++)
sminus = std::max(sminus, 0.5 * (resNCC.at<float>(i + 500) + 1.0));
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency()*1000.0;
//printf("Sc GPU: %f\n\n", t);
if (splus + sminus == 0.0)
return 0.0;
return splus / (sminus + splus);
@ -129,76 +453,243 @@ namespace cv
}
//Detection - returns most probable new target location (Max Sc)
bool TLDDetector::detect(const Mat& img, const Mat& imgBlurred, Rect2d& res, std::vector<LabeledPatch>& patches, Size initSize)
{
patches.clear();
Mat resized_img, blurred_img;
Mat_<uchar> standardPatch(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
img.copyTo(resized_img);
imgBlurred.copyTo(blurred_img);
Mat tmp;
int dx = initSize.width / 10, dy = initSize.height / 10;
Size2d size = img.size();
double scale = 1.0;
int total = 0, pass = 0;
int npos = 0, nneg = 0;
double tmp = 0, maxSc = -5.0;
double maxSc = -5.0;
Rect2d maxScRect;
int scaleID;
std::vector <Mat> resized_imgs, blurred_imgs;
std::vector <Point> varBuffer, ensBuffer;
std::vector <int> varScaleIDs, ensScaleIDs;
//int64 e1, e2;
//double t;
//e1 = getTickCount();
//Detection part
//Generate windows and filter by variance
scaleID = 0;
resized_imgs.push_back(img);
blurred_imgs.push_back(imgBlurred);
do
{
Mat_<double> intImgP, intImgP2;
computeIntegralImages(resized_img, intImgP, intImgP2);
prepareClassifiers((int)blurred_img.step[0]);
for (int i = 0, imax = cvFloor((0.0 + resized_img.cols - initSize.width) / dx); i < imax; i++)
computeIntegralImages(resized_imgs[scaleID], intImgP, intImgP2);
for (int i = 0, imax = cvFloor((0.0 + resized_imgs[scaleID].cols - initSize.width) / dx); i < imax; i++)
{
for (int j = 0, jmax = cvFloor((0.0 + resized_img.rows - initSize.height) / dy); j < jmax; j++)
for (int j = 0, jmax = cvFloor((0.0 + resized_imgs[scaleID].rows - initSize.height) / dy); j < jmax; j++)
{
LabeledPatch labPatch;
total++;
if (!patchVariance(intImgP, intImgP2, originalVariancePtr, Point(dx * i, dy * j), initSize))
continue;
if (ensembleClassifierNum(&blurred_img.at<uchar>(dy * j, dx * i)) <= ENSEMBLE_THRESHOLD)
continue;
pass++;
varBuffer.push_back(Point(dx * i, dy * j));
varScaleIDs.push_back(scaleID);
}
}
scaleID++;
size.width /= SCALE_STEP;
size.height /= SCALE_STEP;
scale *= SCALE_STEP;
resize(img, tmp, size, 0, 0, DOWNSCALE_MODE);
resized_imgs.push_back(tmp);
GaussianBlur(resized_imgs[scaleID], tmp, GaussBlurKernelSize, 0.0f);
blurred_imgs.push_back(tmp);
} while (size.width >= initSize.width && size.height >= initSize.height);
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency()*1000.0;
//printf("Variance: %d\t%f\n", varBuffer.size(), t);
//Ensemble classification
//e1 = getTickCount();
for (int i = 0; i < (int)varBuffer.size(); i++)
{
prepareClassifiers(static_cast<int> (blurred_imgs[varScaleIDs[i]].step[0]));
if (ensembleClassifierNum(&blurred_imgs[varScaleIDs[i]].at<uchar>(varBuffer[i].y, varBuffer[i].x)) <= ENSEMBLE_THRESHOLD)
continue;
ensBuffer.push_back(varBuffer[i]);
ensScaleIDs.push_back(varScaleIDs[i]);
}
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency()*1000.0;
//printf("Ensemble: %d\t%f\n", ensBuffer.size(), t);
labPatch.rect = Rect2d(dx * i * scale, dy * j * scale, initSize.width * scale, initSize.height * scale);
resample(resized_img, Rect2d(Point(dx * i, dy * j), initSize), standardPatch);
tmp = Sr(standardPatch);
//NN classification
//e1 = getTickCount();
for (int i = 0; i < (int)ensBuffer.size(); i++)
{
LabeledPatch labPatch;
double curScale = pow(SCALE_STEP, ensScaleIDs[i]);
labPatch.rect = Rect2d(ensBuffer[i].x*curScale, ensBuffer[i].y*curScale, initSize.width * curScale, initSize.height * curScale);
resample(resized_imgs[ensScaleIDs[i]], Rect2d(ensBuffer[i], initSize), standardPatch);
////To fix: Check the paper, probably this cause wrong learning
//
labPatch.isObject = tmp > THETA_NN;
labPatch.shouldBeIntegrated = abs(tmp - THETA_NN) < 0.1;
patches.push_back(labPatch);
//
double srValue, scValue;
srValue = Sr(standardPatch);
if (!labPatch.isObject)
{
nneg++;
////To fix: Check the paper, probably this cause wrong learning
//
labPatch.isObject = srValue > THETA_NN;
labPatch.shouldBeIntegrated = abs(srValue - THETA_NN) < 0.1;
patches.push_back(labPatch);
//
if (!labPatch.isObject)
{
nneg++;
continue;
}
else
{
npos++;
}
scValue = Sc(standardPatch);
if (scValue > maxSc)
{
maxSc = scValue;
maxScRect = labPatch.rect;
}
}
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency()*1000.0;
//printf("NN: %d\t%f\n", patches.size(), t);
if (maxSc < 0)
return false;
res = maxScRect;
return true;
}
bool TLDDetector::ocl_detect(const Mat& img, const Mat& imgBlurred, Rect2d& res, std::vector<LabeledPatch>& patches, Size initSize)
{
patches.clear();
Mat_<uchar> standardPatch(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
Mat tmp;
int dx = initSize.width / 10, dy = initSize.height / 10;
Size2d size = img.size();
double scale = 1.0;
int npos = 0, nneg = 0;
double maxSc = -5.0;
Rect2d maxScRect;
int scaleID;
std::vector <Mat> resized_imgs, blurred_imgs;
std::vector <Point> varBuffer, ensBuffer;
std::vector <int> varScaleIDs, ensScaleIDs;
//int64 e1, e2;
//double t;
//e1 = getTickCount();
//Detection part
//Generate windows and filter by variance
scaleID = 0;
resized_imgs.push_back(img);
blurred_imgs.push_back(imgBlurred);
do
{
Mat_<double> intImgP, intImgP2;
computeIntegralImages(resized_imgs[scaleID], intImgP, intImgP2);
for (int i = 0, imax = cvFloor((0.0 + resized_imgs[scaleID].cols - initSize.width) / dx); i < imax; i++)
{
for (int j = 0, jmax = cvFloor((0.0 + resized_imgs[scaleID].rows - initSize.height) / dy); j < jmax; j++)
{
if (!patchVariance(intImgP, intImgP2, originalVariancePtr, Point(dx * i, dy * j), initSize))
continue;
}
else
{
npos++;
}
tmp = Sc(standardPatch);
if (tmp > maxSc)
{
maxSc = tmp;
maxScRect = labPatch.rect;
}
varBuffer.push_back(Point(dx * i, dy * j));
varScaleIDs.push_back(scaleID);
}
}
scaleID++;
size.width /= SCALE_STEP;
size.height /= SCALE_STEP;
scale *= SCALE_STEP;
resize(img, resized_img, size, 0, 0, DOWNSCALE_MODE);
GaussianBlur(resized_img, blurred_img, GaussBlurKernelSize, 0.0f);
resize(img, tmp, size, 0, 0, DOWNSCALE_MODE);
resized_imgs.push_back(tmp);
GaussianBlur(resized_imgs[scaleID], tmp, GaussBlurKernelSize, 0.0f);
blurred_imgs.push_back(tmp);
} while (size.width >= initSize.width && size.height >= initSize.height);
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency()*1000.0;
//printf("Variance: %d\t%f\n", varBuffer.size(), t);
//Ensemble classification
//e1 = getTickCount();
for (int i = 0; i < (int)varBuffer.size(); i++)
{
prepareClassifiers((int)blurred_imgs[varScaleIDs[i]].step[0]);
if (ensembleClassifierNum(&blurred_imgs[varScaleIDs[i]].at<uchar>(varBuffer[i].y, varBuffer[i].x)) <= ENSEMBLE_THRESHOLD)
continue;
ensBuffer.push_back(varBuffer[i]);
ensScaleIDs.push_back(varScaleIDs[i]);
}
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency()*1000.0;
//printf("Ensemble: %d\t%f\n", ensBuffer.size(), t);
//NN classification
//e1 = getTickCount();
//Prepare batch of patches
int numOfPatches = (int)ensBuffer.size();
Mat_<uchar> stdPatches(numOfPatches, 225);
double *resultSr = new double[numOfPatches];
double *resultSc = new double[numOfPatches];
uchar *patchesData = stdPatches.data;
for (int i = 0; i < (int)ensBuffer.size(); i++)
{
resample(resized_imgs[ensScaleIDs[i]], Rect2d(ensBuffer[i], initSize), standardPatch);
uchar *stdPatchData = standardPatch.data;
for (int j = 0; j < 225; j++)
patchesData[225*i+j] = stdPatchData[j];
}
//Calculate Sr and Sc batches
ocl_batchSrSc(stdPatches, resultSr, resultSc, numOfPatches);
for (int i = 0; i < (int)ensBuffer.size(); i++)
{
LabeledPatch labPatch;
standardPatch.data = &stdPatches.data[225 * i];
double curScale = pow(SCALE_STEP, ensScaleIDs[i]);
labPatch.rect = Rect2d(ensBuffer[i].x*curScale, ensBuffer[i].y*curScale, initSize.width * curScale, initSize.height * curScale);
double srValue, scValue;
srValue = resultSr[i];
//srValue = Sr(standardPatch);
//printf("%f\t%f\t\n", srValue, resultSr[i]);
////To fix: Check the paper, probably this cause wrong learning
//
labPatch.isObject = srValue > THETA_NN;
labPatch.shouldBeIntegrated = abs(srValue - THETA_NN) < 0.1;
patches.push_back(labPatch);
//
if (!labPatch.isObject)
{
nneg++;
continue;
}
else
{
npos++;
}
scValue = resultSc[i];
if (scValue > maxSc)
{
maxSc = scValue;
maxScRect = labPatch.rect;
}
}
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency()*1000.0;
//printf("NN: %d\t%f\n", patches.size(), t);
if (maxSc < 0)
return false;

@ -43,6 +43,7 @@
#define OPENCV_TLD_DETECTOR
#include "precomp.hpp"
#include "opencl_kernels_tracking.hpp"
#include "tldEnsembleClassifier.hpp"
#include "tldUtils.hpp"
@ -73,9 +74,14 @@ namespace cv
inline double ensembleClassifierNum(const uchar* data);
inline void prepareClassifiers(int rowstep);
double Sr(const Mat_<uchar>& patch);
double ocl_Sr(const Mat_<uchar>& patch);
double Sc(const Mat_<uchar>& patch);
double ocl_Sc(const Mat_<uchar>& patch);
void ocl_batchSrSc(const Mat_<uchar>& patches, double *resultSr, double *resultSc, int numOfPatches);
std::vector<TLDEnsembleClassifier> classifiers;
Mat *posExp, *negExp;
int *posNum, *negNum;
std::vector<Mat_<uchar> > *positiveExamples, *negativeExamples;
std::vector<int> *timeStampsPositive, *timeStampsNegative;
double *originalVariancePtr;
@ -87,6 +93,7 @@ namespace cv
bool isObject, shouldBeIntegrated;
};
bool detect(const Mat& img, const Mat& imgBlurred, Rect2d& res, std::vector<LabeledPatch>& patches, Size initSize);
bool ocl_detect(const Mat& img, const Mat& imgBlurred, Rect2d& res, std::vector<LabeledPatch>& patches, Size initSize);
protected:

@ -58,11 +58,11 @@ namespace cv
// Calculate measure locations from 15x15 grid on minSize patches
void TLDEnsembleClassifier::stepPrefSuff(std::vector<Vec4b>& arr, int pos, int len, int gridSize)
{
#if 0
#if 0
int step = len / (gridSize - 1), pref = (len - step * (gridSize - 1)) / 2;
for (int i = 0; i < (int)(sizeof(x1) / sizeof(x1[0])); i++)
arr[i] = pref + arr[i] * step;
#else
#else
int total = len - gridSize;
int quo = total / (gridSize - 1), rem = total % (gridSize - 1);
int smallStep = quo, bigStep = quo + 1;

@ -64,6 +64,5 @@ namespace cv
std::vector<Point2i> offset;
int lastStep_;
};
}
}

@ -56,6 +56,15 @@ namespace cv
detector = Ptr<TLDDetector>(new TLDDetector());
//Propagate data to Detector
posNum = 0;
negNum = 0;
posExp = Mat(Size(225, 500), CV_8UC1);
negExp = Mat(Size(225, 500), CV_8UC1);
detector->posNum = &posNum;
detector->negNum = &negNum;
detector->posExp = &posExp;
detector->negExp = &negExp;
detector->positiveExamples = &positiveExamples;
detector->negativeExamples = &negativeExamples;
detector->timeStampsPositive = &timeStampsPositive;
@ -69,14 +78,13 @@ namespace cv
scaledImg, blurredImg, GaussBlurKernelSize, SCALE_STEP);
GaussianBlur(image, image_blurred, GaussBlurKernelSize, 0.0);
TLDDetector::generateScanGrid(image.rows, image.cols, minSize_, scanGrid);
getClosestN(scanGrid, Rect2d(boundingBox.x / scale, boundingBox.y / scale, boundingBox.width / scale,
boundingBox.height / scale), 10, closest);
getClosestN(scanGrid, Rect2d(boundingBox.x / scale, boundingBox.y / scale, boundingBox.width / scale, boundingBox.height / scale), 10, closest);
Mat_<uchar> blurredPatch(minSize);
TLDEnsembleClassifier::makeClassifiers(minSize, MEASURES_PER_CLASSIFIER, GRIDSIZE, detector->classifiers);
//Generate initial positive samples and put them to the model
positiveExamples.reserve(200);
for (int i = 0; i < (int)closest.size(); i++)
{
for (int j = 0; j < 20; j++)
@ -188,6 +196,11 @@ namespace cv
void TrackerTLDModel::integrateAdditional(const std::vector<Mat_<uchar> >& eForModel, const std::vector<Mat_<uchar> >& eForEnsemble, bool isPositive)
{
int positiveIntoModel = 0, negativeIntoModel = 0, positiveIntoEnsemble = 0, negativeIntoEnsemble = 0;
if ((int)eForModel.size() == 0) return;
//int64 e1, e2;
//double t;
//e1 = getTickCount();
for (int k = 0; k < (int)eForModel.size(); k++)
{
double sr = detector->Sr(eForModel[k]);
@ -218,6 +231,79 @@ namespace cv
detector->classifiers[i].integrate(eForEnsemble[k], isPositive);
}
}
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency() * 1000;
//printf("Integrate Additional: %fms\n", t);
/*
if( negativeIntoModel > 0 )
dfprintf((stdout, "negativeIntoModel = %d ", negativeIntoModel));
if( positiveIntoModel > 0 )
dfprintf((stdout, "positiveIntoModel = %d ", positiveIntoModel));
if( negativeIntoEnsemble > 0 )
dfprintf((stdout, "negativeIntoEnsemble = %d ", negativeIntoEnsemble));
if( positiveIntoEnsemble > 0 )
dfprintf((stdout, "positiveIntoEnsemble = %d ", positiveIntoEnsemble));
dfprintf((stdout, "\n"));*/
}
void TrackerTLDModel::ocl_integrateAdditional(const std::vector<Mat_<uchar> >& eForModel, const std::vector<Mat_<uchar> >& eForEnsemble, bool isPositive)
{
int positiveIntoModel = 0, negativeIntoModel = 0, positiveIntoEnsemble = 0, negativeIntoEnsemble = 0;
if ((int)eForModel.size() == 0) return;
//int64 e1, e2;
//double t;
//e1 = getTickCount();
//Prepare batch of patches
int numOfPatches = (int)eForModel.size();
Mat_<uchar> stdPatches(numOfPatches, 225);
double *resultSr = new double[numOfPatches];
double *resultSc = new double[numOfPatches];
uchar *patchesData = stdPatches.data;
for (int i = 0; i < numOfPatches; i++)
{
uchar *stdPatchData = eForModel[i].data;
for (int j = 0; j < 225; j++)
patchesData[225 * i + j] = stdPatchData[j];
}
//Calculate Sr and Sc batches
detector->ocl_batchSrSc(stdPatches, resultSr, resultSc, numOfPatches);
for (int k = 0; k < (int)eForModel.size(); k++)
{
double sr = resultSr[k];
if ((sr > THETA_NN) != isPositive)
{
if (isPositive)
{
positiveIntoModel++;
pushIntoModel(eForModel[k], true);
}
else
{
negativeIntoModel++;
pushIntoModel(eForModel[k], false);
}
}
double p = 0;
for (int i = 0; i < (int)detector->classifiers.size(); i++)
p += detector->classifiers[i].posteriorProbability(eForEnsemble[k].data, (int)eForEnsemble[k].step[0]);
p /= detector->classifiers.size();
if ((p > ENSEMBLE_THRESHOLD) != isPositive)
{
if (isPositive)
positiveIntoEnsemble++;
else
negativeIntoEnsemble++;
for (int i = 0; i < (int)detector->classifiers.size(); i++)
detector->classifiers[i].integrate(eForEnsemble[k], isPositive);
}
}
//e2 = getTickCount();
//t = (e2 - e1) / getTickFrequency() * 1000;
//printf("Integrate Additional OCL: %fms\n", t);
/*
if( negativeIntoModel > 0 )
dfprintf((stdout, "negativeIntoModel = %d ", negativeIntoModel));
@ -238,12 +324,30 @@ namespace cv
std::vector<int>* proxyT;
if (positive)
{
if (posNum < 500)
{
uchar *patchPtr = example.data;
uchar *modelPtr = posExp.data;
for (int i = 0; i < STANDARD_PATCH_SIZE*STANDARD_PATCH_SIZE; i++)
modelPtr[posNum*STANDARD_PATCH_SIZE*STANDARD_PATCH_SIZE + i] = patchPtr[i];
posNum++;
}
proxyV = &positiveExamples;
proxyN = &timeStampPositiveNext;
proxyT = &timeStampsPositive;
}
else
{
if (negNum < 500)
{
uchar *patchPtr = example.data;
uchar *modelPtr = negExp.data;
for (int i = 0; i < STANDARD_PATCH_SIZE*STANDARD_PATCH_SIZE; i++)
modelPtr[negNum*STANDARD_PATCH_SIZE*STANDARD_PATCH_SIZE + i] = patchPtr[i];
negNum++;
}
proxyV = &negativeExamples;
proxyN = &timeStampNegativeNext;
proxyT = &timeStampsNegative;
@ -268,9 +372,5 @@ namespace cv
dfprintf((port, "\tpositiveExamples.size() = %d\n", (int)positiveExamples.size()));
dfprintf((port, "\tnegativeExamples.size() = %d\n", (int)negativeExamples.size()));
}
}
}

@ -50,9 +50,6 @@ namespace cv
{
namespace tld
{
class TrackerTLDModel : public TrackerModel
{
public:
@ -61,11 +58,14 @@ namespace cv
void setBoudingBox(Rect2d boundingBox){ boundingBox_ = boundingBox; }
void integrateRelabeled(Mat& img, Mat& imgBlurred, const std::vector<TLDDetector::LabeledPatch>& patches);
void integrateAdditional(const std::vector<Mat_<uchar> >& eForModel, const std::vector<Mat_<uchar> >& eForEnsemble, bool isPositive);
void ocl_integrateAdditional(const std::vector<Mat_<uchar> >& eForModel, const std::vector<Mat_<uchar> >& eForEnsemble, bool isPositive);
Size getMinSize(){ return minSize_; }
void printme(FILE* port = stdout);
Ptr<TLDDetector> detector;
std::vector<Mat_<uchar> > positiveExamples, negativeExamples;
Mat posExp, negExp;
int posNum, negNum;
std::vector<int> timeStampsPositive, timeStampsNegative;
int timeStampPositiveNext, timeStampNegativeNext;
double originalVariance_;
@ -80,7 +80,6 @@ namespace cv
void modelUpdateImpl(){}
Rect2d boundingBox_;
RNG rng;
};
}

@ -116,11 +116,20 @@ bool TrackerTLDImpl::updateImpl(const Mat& image, Rect2d& boundingBox)
std::vector<Rect2d> candidates;
std::vector<double> candidatesRes;
bool trackerNeedsReInit = false;
bool DETECT_FLG = false;
for( int i = 0; i < 2; i++ )
{
Rect2d tmpCandid = boundingBox;
if( ( (i == 0) && !data->failedLastTime && trackerProxy->update(image, tmpCandid) ) ||
((i == 1) && (tldModel->detector->detect(imageForDetector, image_blurred, tmpCandid, detectorResults, tldModel->getMinSize()))))
if (i == 1)
{
if (ocl::haveOpenCL())
DETECT_FLG = tldModel->detector->ocl_detect(imageForDetector, image_blurred, tmpCandid, detectorResults, tldModel->getMinSize());
else
DETECT_FLG = tldModel->detector->detect(imageForDetector, image_blurred, tmpCandid, detectorResults, tldModel->getMinSize());
}
if( ( (i == 0) && !data->failedLastTime && trackerProxy->update(image, tmpCandid) ) || ( DETECT_FLG))
{
candidates.push_back(tmpCandid);
if( i == 0 )
@ -202,10 +211,17 @@ bool TrackerTLDImpl::updateImpl(const Mat& image, Rect2d& boundingBox)
tldModel->integrateRelabeled(imageForDetector, image_blurred, detectorResults);
//dprintf(("%d relabeled by nExpert\n", negRelabeled));
pExpert.additionalExamples(examplesForModel, examplesForEnsemble);
tldModel->integrateAdditional(examplesForModel, examplesForEnsemble, true);
if (ocl::haveOpenCL())
tldModel->ocl_integrateAdditional(examplesForModel, examplesForEnsemble, true);
else
tldModel->integrateAdditional(examplesForModel, examplesForEnsemble, true);
examplesForModel.clear(); examplesForEnsemble.clear();
nExpert.additionalExamples(examplesForModel, examplesForEnsemble);
tldModel->integrateAdditional(examplesForModel, examplesForEnsemble, false);
if (ocl::haveOpenCL())
tldModel->ocl_integrateAdditional(examplesForModel, examplesForEnsemble, false);
else
tldModel->integrateAdditional(examplesForModel, examplesForEnsemble, false);
}
else
{

@ -60,7 +60,6 @@ void TrackerTLD::Params::write(cv::FileStorage& /*fs*/) const {}
namespace tld
{
class TrackerProxy
{
public:

@ -110,6 +110,7 @@ Ptr<Tracker> Tracker::create( const String& trackerType )
BOILERPLATE_CODE("BOOSTING",TrackerBoosting);
BOILERPLATE_CODE("MEDIANFLOW",TrackerMedianFlow);
BOILERPLATE_CODE("TLD",TrackerTLD);
BOILERPLATE_CODE("KCF",TrackerKCF);
return Ptr<Tracker>();
}

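With KCF now registered in the factory above, it can be created by name like the other trackers. A minimal usage sketch (the video source and initial bounding box are placeholders, not part of this commit):

#include "opencv2/tracking.hpp"
#include "opencv2/videoio.hpp"
#include "opencv2/highgui.hpp"
using namespace cv;

int main()
{
    Ptr<Tracker> tracker = Tracker::create("KCF"); // resolved by the BOILERPLATE_CODE entry above
    VideoCapture cap("video.avi");                 // hypothetical input
    Mat frame;
    cap >> frame;
    Rect2d roi(100, 100, 80, 80);                  // hypothetical initial bounding box
    tracker->init(frame, roi);
    while (cap.read(frame))
    {
        if (tracker->update(frame, roi))           // roi is updated in place
            rectangle(frame, roi, Scalar(0, 255, 0), 2);
        imshow("tracking", frame);
        if (waitKey(1) == 27) break;               // ESC to quit
    }
    return 0;
}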
@ -0,0 +1,686 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include <complex>
/*---------------------------
| TrackerKCFModel
|---------------------------*/
namespace cv{
/**
* \brief Implementation of TrackerModel for the KCF algorithm
*/
class TrackerKCFModel : public TrackerModel{
public:
TrackerKCFModel(TrackerKCF::Params /*params*/){}
~TrackerKCFModel(){}
protected:
void modelEstimationImpl( const std::vector<Mat>& /*responses*/ ){}
void modelUpdateImpl(){}
};
} /* namespace cv */
/*---------------------------
| TrackerKCF
|---------------------------*/
namespace cv{
/*
* Prototype
*/
class TrackerKCFImpl : public TrackerKCF {
public:
TrackerKCFImpl( const TrackerKCF::Params &parameters = TrackerKCF::Params() );
void read( const FileNode& /*fn*/ );
void write( FileStorage& /*fs*/ ) const;
protected:
/*
* basic functions and vars
*/
bool initImpl( const Mat& /*image*/, const Rect2d& boundingBox );
bool updateImpl( const Mat& image, Rect2d& boundingBox );
TrackerKCF::Params params;
/*
* KCF functions and vars
*/
void createHanningWindow(OutputArray _dst, const cv::Size winSize, const int type) const;
void inline fft2(const Mat src, std::vector<Mat> & dest) const;
void inline fft2(const Mat src, Mat & dest) const;
void inline ifft2(const Mat src, Mat & dest) const;
void inline pixelWiseMult(const std::vector<Mat> src1, const std::vector<Mat> src2, std::vector<Mat> & dest, const int flags, const bool conjB=false) const;
void inline sumChannels(std::vector<Mat> src, Mat & dest) const;
void inline updateProjectionMatrix(const Mat src, Mat & old_cov,Mat & _proj_mtx,double pca_rate, int compressed_sz) const;
void inline compress(const Mat _proj_mtx, const Mat src, Mat & dest) const;
bool getSubWindow(const Mat img, const Rect roi, Mat& patch) const;
void extractCN(Mat _patch, Mat & cnFeatures) const;
void denseGaussKernel(const double sigma, const Mat _x, const Mat _y, Mat & _k) const;
void calcResponse(const Mat _alphaf, const Mat _k, Mat & _response) const;
void calcResponse(const Mat _alphaf, const Mat _alphaf_den, const Mat _k, Mat & _response) const;
void shiftRows(Mat& mat) const;
void shiftRows(Mat& mat, int n) const;
void shiftCols(Mat& mat, int n) const;
private:
double output_sigma;
Rect2d roi;
Mat hann; //hann window filter
Mat y,yf; // training response and its FFT
Mat x,xf; // observation and its FFT
Mat k,kf; // dense gaussian kernel and its FFT
Mat kf_lambda; // kf+lambda
Mat new_alphaf, alphaf; // training coefficients
Mat new_alphaf_den, alphaf_den; // for split training coefficients
Mat z, new_z; // model
Mat response; // detection result
Mat old_cov_mtx, proj_mtx; // for feature compression
bool resizeImage; // resize the image whenever the patch size is too large
int frame;
};
/*
* Constructor
*/
Ptr<TrackerKCF> TrackerKCF::createTracker(const TrackerKCF::Params &parameters){
return Ptr<TrackerKCFImpl>(new TrackerKCFImpl(parameters));
}
TrackerKCFImpl::TrackerKCFImpl( const TrackerKCF::Params &parameters ) :
params( parameters )
{
isInit = false;
resizeImage = false;
CV_Assert(params.descriptor == GRAY || params.descriptor == CN /*|| params.descriptor == CN2*/);
}
void TrackerKCFImpl::read( const cv::FileNode& fn ){
params.read( fn );
}
void TrackerKCFImpl::write( cv::FileStorage& fs ) const {
params.write( fs );
}
/*
* Initialization:
* - creating hann window filter
* - ROI padding
* - creating a gaussian response for the training ground-truth
* - perform FFT on the gaussian response
*/
bool TrackerKCFImpl::initImpl( const Mat& /*image*/, const Rect2d& boundingBox ){
frame=0;
roi = boundingBox;
//calculate output sigma
output_sigma=sqrt(roi.width*roi.height)*params.output_sigma_factor;
output_sigma=-0.5/(output_sigma*output_sigma);
//resize the ROI whenever needed
if(params.resize && roi.width*roi.height>params.max_patch_size){
resizeImage=true;
roi.x/=2.0;
roi.y/=2.0;
roi.width/=2.0;
roi.height/=2.0;
}
// add padding to the roi
roi.x-=roi.width/2;
roi.y-=roi.height/2;
roi.width*=2;
roi.height*=2;
// initialize the hann window filter
createHanningWindow(hann, roi.size(), CV_64F);
if(params.descriptor==CN){
Mat layers[] = {hann, hann, hann, hann, hann, hann, hann, hann, hann, hann};
merge(layers, 10, hann);
}
// create gaussian response
y=Mat::zeros((int)roi.height,(int)roi.width,CV_64F);
for(unsigned i=0;i<roi.height;i++){
for(unsigned j=0;j<roi.width;j++){
y.at<double>(i,j)=(i-roi.height/2+1)*(i-roi.height/2+1)+(j-roi.width/2+1)*(j-roi.width/2+1);
}
}
y*=(double)output_sigma;
cv::exp(y,y);
// perform fourier transform on the gaussian response
fft2(y,yf);
model=Ptr<TrackerKCFModel>(new TrackerKCFModel(params));
// TODO: return true only if roi is inside the image
return true;
}
/*
* Main part of the KCF algorithm
*/
bool TrackerKCFImpl::updateImpl( const Mat& image, Rect2d& boundingBox ){
double minVal, maxVal; // min-max response
Point minLoc,maxLoc; // min-max location
Mat zc;
Mat img=image.clone();
// check the channels of the input image, grayscale is preferred
CV_Assert(image.channels() == 1 || image.channels() == 3);
// resize the image whenever needed
if(resizeImage)resize(img,img,Size(img.cols/2,img.rows/2));
// extract and pre-process the patch
if(!getSubWindow(img,roi, x))return false;
// detection part
if(frame>0){
//compute the gaussian kernel
if(params.compress_feature){
compress(proj_mtx,x,x);
compress(proj_mtx,z,zc);
denseGaussKernel(params.sigma,x,zc,k);
}else
denseGaussKernel(params.sigma,x,z,k);
// calculate filter response
if(params.split_coeff)
calcResponse(alphaf,alphaf_den,k,response);
else
calcResponse(alphaf,k,response);
// extract the maximum response
minMaxLoc( response, &minVal, &maxVal, &minLoc, &maxLoc );
roi.x+=(maxLoc.x-roi.width/2+1);
roi.y+=(maxLoc.y-roi.height/2+1);
// update the bounding box
boundingBox.x=(resizeImage?roi.x*2:roi.x)+boundingBox.width/2;
boundingBox.y=(resizeImage?roi.y*2:roi.y)+boundingBox.height/2;
}
// extract the patch for learning purpose
if(!getSubWindow(img,roi, x))return false;
//update the training data
new_z=x.clone();
if(frame==0)
z=x.clone();
else
z=(1.0-params.interp_factor)*z+params.interp_factor*new_z;
if(params.compress_feature){
// feature compression
updateProjectionMatrix(z,old_cov_mtx,proj_mtx,params.pca_learning_rate,params.compressed_size);
compress(proj_mtx,x,x);
}
// Kernel Regularized Least-Squares, calculate alphas
denseGaussKernel(params.sigma,x,x,k);
fft2(k,kf);
kf_lambda=kf+params.lambda;
/* TODO: optimize this element-wise division
* new_alphaf = yf ./ kf_lambda
* (a+bi)/(c+di) = [(ac+bd)+i(bc-ad)]/(c^2+d^2)
*/
new_alphaf=Mat_<Vec2d >(yf.rows, yf.cols);
std::complex<double> temp;
if(params.split_coeff){
mulSpectrums(yf,kf,new_alphaf,0);
mulSpectrums(kf,kf_lambda,new_alphaf_den,0);
}else{
for(int i=0;i<yf.rows;i++){
for(int j=0;j<yf.cols;j++){
temp=std::complex<double>(yf.at<Vec2d>(i,j)[0],yf.at<Vec2d>(i,j)[1])/(std::complex<double>(kf_lambda.at<Vec2d>(i,j)[0],kf_lambda.at<Vec2d>(i,j)[1])/*+std::complex<double>(0.0000000001,0.0000000001)*/);
new_alphaf.at<Vec2d >(i,j)[0]=temp.real();
new_alphaf.at<Vec2d >(i,j)[1]=temp.imag();
}
}
}
// update the RLS model
if(frame==0){
alphaf=new_alphaf.clone();
if(params.split_coeff)alphaf_den=new_alphaf_den.clone();
}else{
alphaf=(1.0-params.interp_factor)*alphaf+params.interp_factor*new_alphaf;
if(params.split_coeff)alphaf_den=(1.0-params.interp_factor)*alphaf_den+params.interp_factor*new_alphaf_den;
}
frame++;
return true;
}
/*-------------------------------------
| implementation of the KCF functions
|-------------------------------------*/
/*
* hann window filter
*/
void TrackerKCFImpl::createHanningWindow(OutputArray _dst, const cv::Size winSize, const int type) const {
CV_Assert( type == CV_32FC1 || type == CV_64FC1 );
_dst.create(winSize, type);
Mat dst = _dst.getMat();
int rows = dst.rows, cols = dst.cols;
AutoBuffer<double> _wc(cols);
double * const wc = (double *)_wc;
double coeff0 = 2.0 * CV_PI / (double)(cols - 1), coeff1 = 2.0 * CV_PI / (double)(rows - 1);
for(int j = 0; j < cols; j++)
wc[j] = 0.5 * (1.0 - cos(coeff0 * j));
if(dst.depth() == CV_32F){
for(int i = 0; i < rows; i++){
float* dstData = dst.ptr<float>(i);
double wr = 0.5 * (1.0 - cos(coeff1 * i));
for(int j = 0; j < cols; j++)
dstData[j] = (float)(wr * wc[j]);
}
}else{
for(int i = 0; i < rows; i++){
double* dstData = dst.ptr<double>(i);
double wr = 0.5 * (1.0 - cos(coeff1 * i));
for(int j = 0; j < cols; j++)
dstData[j] = wr * wc[j];
}
}
// perform batch sqrt for SSE performance gains
//cv::sqrt(dst, dst); // matlab does not use the square-rooted version
}
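// For reference: the separable window computed above is the outer product of
// two 1-D Hann windows,
//   w(i,j) = 0.25 * (1 - cos(2*pi*i/(rows-1))) * (1 - cos(2*pi*j/(cols-1)))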
/*
* simplification of the Fourier transform function in OpenCV
*/
void inline TrackerKCFImpl::fft2(const Mat src, Mat & dest) const {
std::vector<Mat> layers(src.channels());
std::vector<Mat> outputs(src.channels());
split(src, layers);
for(int i=0;i<src.channels();i++){
dft(layers[i],outputs[i],DFT_COMPLEX_OUTPUT);
}
merge(outputs,dest);
}
void inline TrackerKCFImpl::fft2(const Mat src, std::vector<Mat> & dest) const {
std::vector<Mat> layers(src.channels());
dest.clear();
dest.resize(src.channels());
split(src, layers);
for(int i=0;i<src.channels();i++){
dft(layers[i],dest[i],DFT_COMPLEX_OUTPUT);
}
}
/*
* simplification of the inverse Fourier transform function in OpenCV
*/
void inline TrackerKCFImpl::ifft2(const Mat src, Mat & dest) const {
idft(src,dest,DFT_SCALE+DFT_REAL_OUTPUT);
}
/*
* Point-wise multiplication of two multichannel Mat data
*/
void inline TrackerKCFImpl::pixelWiseMult(const std::vector<Mat> src1, const std::vector<Mat> src2, std::vector<Mat> & dest, const int flags, const bool conjB) const {
dest.clear();
dest.resize(src1.size());
for(unsigned i=0;i<src1.size();i++){
mulSpectrums(src1[i], src2[i], dest[i],flags,conjB);
}
}
/*
* Combines all channels of a multi-channel Mat into a single channel
*/
void inline TrackerKCFImpl::sumChannels(std::vector<Mat> src, Mat & dest) const {
dest=src[0].clone();
for(unsigned i=1;i<src.size();i++){
dest+=src[i];
}
}
/*
* obtains the projection matrix using PCA
*/
void inline TrackerKCFImpl::updateProjectionMatrix(const Mat src, Mat & old_cov,Mat & _proj_mtx, double pca_rate, int compressed_sz) const {
CV_Assert(compressed_sz<=src.channels());
// compute average
std::vector<Mat> layers(src.channels());
std::vector<Scalar> average(src.channels());
split(src,layers);
for (int i=0;i<src.channels();i++){
average[i]=mean(layers[i]);
layers[i]-=average[i];
}
// calc covariance matrix
Mat data,new_cov;
merge(layers,data);
data=data.reshape(1,src.rows*src.cols);
new_cov=1.0/(double)(src.rows*src.cols-1)*(data.t()*data);
if(old_cov.rows==0)old_cov=new_cov.clone();
// calc PCA
Mat w, u, vt;
SVD::compute((1.0-pca_rate)*old_cov+pca_rate*new_cov, w, u, vt);
// extract the projection matrix
_proj_mtx=u(Rect(0,0,compressed_sz,src.channels())).clone();
Mat proj_vars=Mat::eye(compressed_sz,compressed_sz,_proj_mtx.type());
for(int i=0;i<compressed_sz;i++){
proj_vars.at<double>(i,i)=w.at<double>(i);
}
// update the covariance matrix
old_cov=(1.0-pca_rate)*old_cov+pca_rate*_proj_mtx*proj_vars*_proj_mtx.t();
}
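// Written out, with alpha = pca_rate and m = compressed_sz, the update above is:
//   U*W*V^T = SVD((1-alpha)*old_cov + alpha*new_cov),   proj = U(:,1..m)
//   old_cov <- (1-alpha)*old_cov + alpha * proj * diag(w_1..w_m) * proj^T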
/*
* compress the features
*/
void inline TrackerKCFImpl::compress(const Mat _proj_mtx, const Mat src, Mat & dest) const {
Mat data=src.reshape(1,src.rows*src.cols);
Mat compressed=data*_proj_mtx;
dest=compressed.reshape(_proj_mtx.cols,src.rows).clone();
}
/*
* obtain the patch and apply hann window filter to it
*/
bool TrackerKCFImpl::getSubWindow(const Mat img, const Rect _roi, Mat& patch) const {
Rect region=_roi;
// return false if roi is outside the image
if((_roi.x+_roi.width<0)
||(_roi.y+_roi.height<0)
||(_roi.x>=img.cols)
||(_roi.y>=img.rows)
)return false;
// extract patch inside the image
if(_roi.x<0){region.x=0;region.width+=_roi.x;}
if(_roi.y<0){region.y=0;region.height+=_roi.y;}
if(_roi.x+_roi.width>img.cols)region.width=img.cols-_roi.x;
if(_roi.y+_roi.height>img.rows)region.height=img.rows-_roi.y;
if(region.width>img.cols)region.width=img.cols;
if(region.height>img.rows)region.height=img.rows;
patch=img(region).clone();
// add some padding to compensate when the patch is outside image border
int addTop,addBottom, addLeft, addRight;
addTop=region.y-_roi.y;
addBottom=(_roi.height+_roi.y>img.rows?_roi.height+_roi.y-img.rows:0);
addLeft=region.x-_roi.x;
addRight=(_roi.width+_roi.x>img.cols?_roi.width+_roi.x-img.cols:0);
copyMakeBorder(patch,patch,addTop,addBottom,addLeft,addRight,BORDER_REPLICATE);
if(patch.rows==0 || patch.cols==0)return false;
// extract the desired descriptors
switch(params.descriptor){
case GRAY:
if(img.channels()>1)cvtColor(patch,patch, CV_BGR2GRAY);
patch.convertTo(patch,CV_64F);
patch=patch/255.0-0.5; // normalize to range -0.5 .. 0.5
break;
case CN:
CV_Assert(img.channels() == 3);
extractCN(patch,patch);
break;
case CN2:
if(patch.channels()>1)cvtColor(patch,patch, CV_BGR2GRAY);
break;
}
patch=patch.mul(hann); // hann window filter
return true;
}
/* Convert BGR to ColorNames
*/
void TrackerKCFImpl::extractCN(Mat _patch, Mat & cnFeatures) const {
Vec3b & pixel = _patch.at<Vec3b>(0,0);
unsigned index;
Mat temp = Mat::zeros(_patch.rows,_patch.cols,CV_64FC(10));
for(int i=0;i<_patch.rows;i++){
for(int j=0;j<_patch.cols;j++){
pixel=_patch.at<Vec3b>(i,j);
index=(unsigned)(floor(pixel[2]/8)+32*floor(pixel[1]/8)+32*32*floor(pixel[0]/8));
//copy the values
for(int _k=0;_k<10;_k++){
temp.at<Vec<double,10> >(i,j)[_k]=ColorNames[index][_k];
}
}
}
cnFeatures=temp.clone();
}
/*
* dense gauss kernel function
*/
void TrackerKCFImpl::denseGaussKernel(const double sigma, const Mat _x, const Mat _y, Mat & _k) const {
std::vector<Mat> _xf,_yf,xyf_v;
Mat xy,xyf;
double normX, normY;
fft2(_x,_xf);
fft2(_y,_yf);
normX=norm(_x);
normX*=normX;
normY=norm(_y);
normY*=normY;
pixelWiseMult(_xf,_yf,xyf_v,0,true);
sumChannels(xyf_v,xyf);
ifft2(xyf,xyf);
if(params.wrap_kernel){
shiftRows(xyf, _x.rows/2);
shiftCols(xyf, _x.cols/2);
}
//(xx + yy - 2 * xy) / numel(x)
xy=(normX+normY-2*xyf)/(_x.rows*_x.cols*_x.channels());
// TODO: check whether we really need thresholding or not
//threshold(xy,xy,0.0,0.0,THRESH_TOZERO);//max(0, (xx + yy - 2 * xy) / numel(x))
for(int i=0;i<xy.rows;i++){
for(int j=0;j<xy.cols;j++){
if(xy.at<double>(i,j)<0.0)xy.at<double>(i,j)=0.0;
}
}
double sig=-1.0/(sigma*sigma);
xy=sig*xy;
exp(xy,_k);
}
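// In formula form, with N = rows*cols*channels and ^ denoting the per-channel DFT,
// the code above evaluates
//   k = exp( -1/sigma^2 * max(0, (||x||^2 + ||y||^2 - 2*ifft(sum_c x^_c .* conj(y^_c))) / N) )
// (the optional wrap_kernel circular shift is omitted from the formula)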
/* CIRCULAR SHIFT Function
* http://stackoverflow.com/questions/10420454/shift-like-matlab-function-rows-or-columns-of-a-matrix-in-opencv
*/
// circular shift one row from up to down
void TrackerKCFImpl::shiftRows(Mat& mat) const {
Mat temp;
Mat m;
int _k = (mat.rows-1);
mat.row(_k).copyTo(temp);
for(; _k > 0 ; _k-- ) {
m = mat.row(_k);
mat.row(_k-1).copyTo(m);
}
m = mat.row(0);
temp.copyTo(m);
}
// circular shift n rows from up to down if n > 0, -n rows from down to up if n < 0
void TrackerKCFImpl::shiftRows(Mat& mat, int n) const {
if( n < 0 ) {
n = -n;
flip(mat,mat,0);
for(int _k=0; _k < n;_k++) {
shiftRows(mat);
}
flip(mat,mat,0);
}else{
for(int _k=0; _k < n;_k++) {
shiftRows(mat);
}
}
}
//circular shift n columns from left to right if n > 0, -n columns from right to left if n < 0
void TrackerKCFImpl::shiftCols(Mat& mat, int n) const {
if(n < 0){
n = -n;
flip(mat,mat,1);
transpose(mat,mat);
shiftRows(mat,n);
transpose(mat,mat);
flip(mat,mat,1);
}else{
transpose(mat,mat);
shiftRows(mat,n);
transpose(mat,mat);
}
}
/*
* calculate the detection response
*/
void TrackerKCFImpl::calcResponse(const Mat _alphaf, const Mat _k, Mat & _response) const {
//alpha f--> 2channels ; k --> 1 channel;
Mat _kf;
fft2(_k,_kf);
Mat spec;
mulSpectrums(_alphaf,_kf,spec,0,false);
ifft2(spec,_response);
}
/*
* calculate the detection response for the split form
*/
void TrackerKCFImpl::calcResponse(const Mat _alphaf, const Mat _alphaf_den, const Mat _k, Mat & _response) const {
Mat _kf;
fft2(_k,_kf);
Mat spec;
Mat spec2=Mat_<Vec2d >(_k.rows, _k.cols);
std::complex<double> temp;
mulSpectrums(_alphaf,_kf,spec,0,false);
for(int i=0;i<_k.rows;i++){
for(int j=0;j<_k.cols;j++){
temp=std::complex<double>(spec.at<Vec2d>(i,j)[0],spec.at<Vec2d>(i,j)[1])/(std::complex<double>(_alphaf_den.at<Vec2d>(i,j)[0],_alphaf_den.at<Vec2d>(i,j)[1])/*+std::complex<double>(0.0000000001,0.0000000001)*/);
spec2.at<Vec2d >(i,j)[0]=temp.real();
spec2.at<Vec2d >(i,j)[1]=temp.imag();
}
}
ifft2(spec2,_response);
}
/*----------------------------------------------------------------------*/
/*
* Parameters
*/
TrackerKCF::Params::Params(){
sigma=0.2;
lambda=0.01;
interp_factor=0.075;
output_sigma_factor=1.0/16.0;
resize=true;
max_patch_size=80*80;
descriptor=CN;
split_coeff=true;
wrap_kernel=false;
//feature compression
compress_feature=true;
compressed_size=2;
pca_learning_rate=0.15;
}
void TrackerKCF::Params::read( const cv::FileNode& /*fn*/ ){}
void TrackerKCF::Params::write( cv::FileStorage& /*fs*/ ) const{}
} /* namespace cv */

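For completeness, a short sketch of overriding the defaults above before instantiation; it assumes the GRAY/CN mode enum declared next to Params in tracker.hpp (the same values the constructor asserts on):

#include "opencv2/tracking.hpp"
using namespace cv;

Ptr<TrackerKCF> makeGrayKCF()
{
    TrackerKCF::Params p;                // starts from the defaults listed above
    p.descriptor = TrackerKCF::GRAY;     // grayscale features instead of ColorNames
    p.compress_feature = false;          // skip the PCA-based feature compression
    p.interp_factor = 0.05;              // slower model adaptation (illustrative value)
    return TrackerKCF::createTracker(p);
}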
@ -165,10 +165,10 @@ Note: the descriptor can be coupled with any keypoint extractor. The only demand
Note: a complete example can be found under /samples/cpp/tutorial_code/xfeatures2D/latch_match.cpp
*/
class CV_EXPORTS LATCH : public DescriptorExtractor
class CV_EXPORTS_W LATCH : public Feature2D
{
public:
static Ptr<LATCH> create(int bytes = 32, bool rotationInvariance = true, int half_ssd_size=3);
CV_WRAP static Ptr<LATCH> create(int bytes = 32, bool rotationInvariance = true, int half_ssd_size=3);
};
/** @brief Class implementing DAISY descriptor, described in @cite Tola10
@ -187,14 +187,14 @@ DAISY::NRM_SIFT mean that descriptors are normalized for L2 norm equal to 1.0 bu
@param use_orientation sample patterns using keypoints orientation, disabled by default.
*/
class CV_EXPORTS DAISY : public DescriptorExtractor
class CV_EXPORTS_W DAISY : public Feature2D
{
public:
enum
{
NRM_NONE = 100, NRM_PARTIAL = 101, NRM_FULL = 102, NRM_SIFT = 103,
};
static Ptr<DAISY> create( float radius = 15, int q_radius = 3, int q_theta = 8,
CV_WRAP static Ptr<DAISY> create( float radius = 15, int q_radius = 3, int q_theta = 8,
int q_hist = 8, int norm = DAISY::NRM_NONE, InputArray H = noArray(),
bool interpolation = true, bool use_orientation = false );

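Since LATCH and DAISY now derive from Feature2D, they plug into the standard detect/compute pipeline. A minimal sketch (the detector choice and input image are illustrative, not part of this commit):

#include "opencv2/features2d.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/xfeatures2d.hpp"
using namespace cv;

int main()
{
    Mat img = imread("scene.png", IMREAD_GRAYSCALE);          // hypothetical input
    std::vector<KeyPoint> kpts;
    ORB::create()->detect(img, kpts);                         // any keypoint detector works
    Mat descs;
    xfeatures2d::DAISY::create()->compute(img, kpts, descs);  // via the Feature2D interface
    return 0;
}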
@ -1267,7 +1267,7 @@ struct RoundingInvoker : ParallelLoopBody
{
for (int c = range.start; c < range.end; ++c)
{
scale_map->at<float>(r,c) = (float) round( scale_map->at<float>(r,c) );
scale_map->at<float>(r,c) = (float) cvRound( scale_map->at<float>(r,c) );
}
}
int r;

@ -55,3 +55,25 @@
year={2013},
organization={IEEE}
}
@article{Min2014,
title={Fast global image smoothing based on weighted least squares},
author={Min, Dongbo and Choi, Sunghwan and Lu, Jiangbo and Ham, Bumsub and Sohn, Kwanghoon and Do, Minh N},
journal={Image Processing, IEEE Transactions on},
volume={23},
number={12},
pages={5638--5653},
year={2014},
publisher={IEEE}
}
@inproceedings{Farbman2008,
title={Edge-preserving decompositions for multi-scale tone and detail manipulation},
author={Farbman, Zeev and Fattal, Raanan and Lischinski, Dani and Szeliski, Richard},
booktitle={ACM Transactions on Graphics (TOG)},
volume={27},
number={3},
pages={67},
year={2008},
organization={ACM}
}

@ -38,6 +38,7 @@
#define __OPENCV_XIMGPROC_HPP__
#include "ximgproc/edge_filter.hpp"
#include "ximgproc/disparity_filter.hpp"
#include "ximgproc/structured_edge_detection.hpp"
#include "ximgproc/seeds.hpp"

@ -43,49 +43,147 @@
namespace cv {
namespace ximgproc {
//! @addtogroup ximgproc_filters
//! @{
/** @brief Main interface for all disparity map filters.
*/
class CV_EXPORTS_W DisparityFilter : public Algorithm
{
public:
CV_WRAP virtual void filter(InputArray disparity_map, InputArray left_view, OutputArray filtered_disparity_map,Rect ROI) = 0;
/** @brief Apply filtering to the disparity map.
@param disparity_map_left disparity map of the left view, 1 channel, CV_16S type. Implicitly assumes that disparity
values are scaled by 16 (one-pixel disparity corresponds to the value of 16 in the disparity map). Disparity map
can have any resolution, it will be automatically resized to fit left_view resolution.
@param left_view left view of the original stereo-pair to guide the filtering process, 8-bit single-channel
or three-channel image.
@param filtered_disparity_map output disparity map.
@param ROI region of the disparity map to filter.
@param disparity_map_right optional argument, some implementations might also use the disparity map
of the right view to compute confidence maps, for instance.
@param right_view optional argument, some implementations might also use the right view of the original
stereo-pair.
*/
CV_WRAP virtual void filter(InputArray disparity_map_left, InputArray left_view, OutputArray filtered_disparity_map, Rect ROI, InputArray disparity_map_right = Mat(), InputArray right_view = Mat()) = 0;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
class CV_EXPORTS_W DisparityDTFilter : public DisparityFilter
{
public:
CV_WRAP virtual double getSigmaSpatial() = 0;
CV_WRAP virtual void setSigmaSpatial(double _sigmaSpatial) = 0;
CV_WRAP virtual double getSigmaColor() = 0;
CV_WRAP virtual void setSigmaColor(double _sigmaColor) = 0;
};
CV_EXPORTS_W
Ptr<DisparityDTFilter> createDisparityDTFilter();
class CV_EXPORTS_W DisparityGuidedFilter : public DisparityFilter
{
public:
CV_WRAP virtual double getEps() = 0;
CV_WRAP virtual void setEps(double _eps) = 0;
CV_WRAP virtual int getRadius() = 0;
CV_WRAP virtual void setRadius(int _radius) = 0;
};
CV_EXPORTS_W
Ptr<DisparityGuidedFilter> createDisparityGuidedFilter();
/** @brief Disparity map filter based on Weighted Least Squares filter (in form of Fast Global Smoother that
is a lot faster than traditional Weighted Least Squares filter implementations) and optional use of
left-right-consistency-based confidence to refine the results in half-occlusions and uniform areas.
*/
class CV_EXPORTS_W DisparityWLSFilter : public DisparityFilter
{
public:
/** filter parameters */
/** @brief Lambda is a parameter defining the amount of regularization during filtering. Larger values force
filtered disparity map edges to adhere more to source image edges. Typical value is 8000.
*/
CV_WRAP virtual double getLambda() = 0;
/** @see getLambda */
CV_WRAP virtual void setLambda(double _lambda) = 0;
/** @brief SigmaColor is a parameter defining how sensitive the filtering process is to source image edges.
Large values can lead to disparity leakage through low-contrast edges. Small values can make the filter too
sensitive to noise and textures in the source image. Typical values range from 0.8 to 2.0.
*/
CV_WRAP virtual double getSigmaColor() = 0;
/** @see getSigmaColor */
CV_WRAP virtual void setSigmaColor(double _sigma_color) = 0;
/** confidence-related parameters */
/** @brief LRCthresh is a threshold of disparity difference used in left-right-consistency check during
confidence map computation. The default value of 24 (1.5 pixels) is virtually always good enough.
*/
CV_WRAP virtual int getLRCthresh() = 0;
/** @see getLRCthresh */
CV_WRAP virtual void setLRCthresh(int _LRC_thresh) = 0;
/** @brief DepthDiscontinuityRadius is a parameter used in confidence computation. It defines the size of
low-confidence regions around depth discontinuities. For typical window sizes used in stereo matching the
optimal value is around 5.
*/
CV_WRAP virtual int getDepthDiscontinuityRadius() = 0;
/** @see getDepthDiscontinuityRadius */
CV_WRAP virtual void setDepthDiscontinuityRadius(int _disc_radius) = 0;
/** @brief Get the confidence map that was used in the last filter call. It is a CV_32F one-channel image
with values ranging from 0.0 (totally untrusted regions of the raw disparity map) to 255.0 (regions containing
correct disparity values with a high degree of confidence).
*/
CV_WRAP virtual Mat getConfidenceMap() = 0;
};
/** @brief Factory method, create instance of DisparityWLSFilter and execute the initialization routines.
@param use_confidence filtering with confidence requires two disparity maps (for the left and right views) and is
approximately two times slower. However, quality is typically significantly better.
*/
CV_EXPORTS_W
Ptr<DisparityWLSFilter> createDisparityWLSFilter();
Ptr<DisparityWLSFilter> createDisparityWLSFilter(bool use_confidence);
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
/** @brief Function for reading ground truth disparity maps. Supports basic Middlebury
and MPI-Sintel formats. Note that the resulting disparity map is scaled by 16.
@param src_path path to the image, containing ground-truth disparity map
@param dst output disparity map, CV_16S depth
@result returns zero if successfully read the ground truth
*/
CV_EXPORTS
int readGT(String src_path,OutputArray dst);
/** @brief Function for computing mean square error for disparity maps
@param GT ground truth disparity map
@param src disparity map to evaluate
@param ROI region of interest
@result returns mean square error between GT and src
*/
CV_EXPORTS
double computeMSE(InputArray GT, InputArray src, Rect ROI);
/** @brief Function for computing the percent of "bad" pixels in the disparity map
(pixels where error is higher than a specified threshold)
@param GT ground truth disparity map
@param src disparity map to evaluate
@param ROI region of interest
@param thresh threshold used to determine "bad" pixels
@result returns percent of "bad" pixels in src relative to GT
*/
CV_EXPORTS
double computeBadPixelPercent(InputArray GT, InputArray src, Rect ROI, int thresh=24/*1.5 pixels*/);
/** @brief Function for creating a disparity map visualization (clamped CV_8U image)
@param src input disparity map (CV_16S depth)
@param dst output visualization
@param scale disparity map will be multiplied by this value for visualization
*/
CV_EXPORTS
void getDisparityVis(InputArray src,OutputArray dst,double scale=1.0);
//! @}
}
}
#endif

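A minimal sketch of the new interface (the sample later in this commit shows the full confidence-based setup; here the ROI is simply the whole image and the rectified inputs are assumed to be loaded already):

#include "opencv2/calib3d.hpp"
#include "opencv2/ximgproc/disparity_filter.hpp"
using namespace cv;
using namespace cv::ximgproc;

void filterDisparity(const Mat& left, const Mat& left_gray, const Mat& right_gray, Mat& filtered_disp)
{
    Ptr<StereoBM> matcher = StereoBM::create(160, 19);             // max_disparity, window_size
    Mat raw_disp;
    matcher->compute(left_gray, right_gray, raw_disp);
    Ptr<DisparityWLSFilter> wls = createDisparityWLSFilter(false); // no confidence
    wls->setLambda(8000.0);                                        // typical value, see getLambda above
    wls->setSigmaColor(1.5);
    wls->filter(raw_disp, left, filtered_disp, Rect(0, 0, left.cols, left.rows));
}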
@ -316,19 +316,69 @@ proportional to sigmaSpace .
CV_EXPORTS_W
void jointBilateralFilter(InputArray joint, InputArray src, OutputArray dst, int d, double sigmaColor, double sigmaSpace, int borderType = BORDER_DEFAULT);
//! @}
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
class CV_EXPORTS_W WeightedLeastSquaresFilter : public Algorithm
{
public:
CV_WRAP virtual void filter(InputArray src, OutputArray dst) = 0;
virtual ~WeightedLeastSquaresFilter();
};
CV_EXPORTS_W Ptr<WeightedLeastSquaresFilter> createWeightedLeastSquaresFilter(InputArray guide, double lambda, double sigma_color, int num_iter=3);
CV_EXPORTS_W void weightedLeastSquaresFilter(InputArray guide, InputArray src, OutputArray dst, double lambda, double sigma_color, int num_iter=3);
/** @brief Interface for implementations of Fast Global Smoother filter.
For more details about this filter see @cite Min2014 and @cite Farbman2008 .
*/
class CV_EXPORTS_W FastGlobalSmootherFilter : public Algorithm
{
public:
/** @brief Apply smoothing operation to the source image.
@param src source image for filtering with unsigned 8-bit or signed 16-bit or floating-point 32-bit depth and up to 4 channels.
@param dst destination image.
*/
CV_WRAP virtual void filter(InputArray src, OutputArray dst) = 0;
};
/** @brief Factory method, create instance of FastGlobalSmootherFilter and execute the initialization routines.
@param guide image serving as guide for filtering. It should have 8-bit depth and either 1 or 3 channels.
@param lambda parameter defining the amount of regularization
@param sigma_color parameter, that is similar to color space sigma in bilateralFilter.
@param lambda_attenuation internal parameter, defining how much lambda decreases after each iteration. Normally,
it should be 0.25. Setting it to 1.0 may lead to streaking artifacts.
@param num_iter number of iterations used for filtering, 3 is usually enough.
For more details about Fast Global Smoother parameters, see the original paper @cite Min2014. However, please note that
there are several differences. Lambda attenuation described in the paper is implemented a bit differently so do not
expect the results to be identical to those from the paper; sigma_color values from the paper should be multiplied by 255.0 to
achieve the same effect. Also, in case of image filtering where source and guide image are the same, authors
propose to dynamically update the guide image after each iteration. To maximize the performance this feature
was not implemented here.
*/
CV_EXPORTS_W Ptr<FastGlobalSmootherFilter> createFastGlobalSmootherFilter(InputArray guide, double lambda, double sigma_color, double lambda_attenuation=0.25, int num_iter=3);
/** @brief Simple one-line Fast Global Smoother filter call. If you have multiple images to filter with the same
guide then use FastGlobalSmootherFilter interface to avoid extra computations.
@param guide image serving as guide for filtering. It should have 8-bit depth and either 1 or 3 channels.
@param src source image for filtering with unsigned 8-bit or signed 16-bit or floating-point 32-bit depth and up to 4 channels.
@param dst destination image.
@param lambda parameter defining the amount of regularization
@param sigma_color parameter, that is similar to color space sigma in bilateralFilter.
@param lambda_attenuation internal parameter, defining how much lambda decreases after each iteration. Normally,
it should be 0.25. Setting it to 1.0 may lead to streaking artifacts.
@param num_iter number of iterations used for filtering, 3 is usually enough.
*/
CV_EXPORTS_W void fastGlobalSmootherFilter(InputArray guide, InputArray src, OutputArray dst, double lambda, double sigma_color, double lambda_attenuation=0.25, int num_iter=3);
//! @}
}
}
#endif

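A short sketch of the two calling styles declared above; file names and parameter values are illustrative only:

#include "opencv2/imgcodecs.hpp"
#include "opencv2/ximgproc/edge_filter.hpp"
using namespace cv;
using namespace cv::ximgproc;

int main()
{
    Mat guide = imread("guide.png");   // 8-bit, 1 or 3 channels
    Mat src   = imread("src.png");     // image to be smoothed
    Mat dst;
    fastGlobalSmootherFilter(guide, src, dst, 8000.0, 1.5);  // one-off call
    Ptr<FastGlobalSmootherFilter> fgs = createFastGlobalSmootherFilter(guide, 8000.0, 1.5);
    fgs->filter(src, dst);             // reuses the guide preprocessing across images
    return 0;
}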
@ -0,0 +1,169 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3 - clause BSD License)
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met :
*
* *Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and / or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort(including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "perf_precomp.hpp"
#include "opencv2/ximgproc/disparity_filter.hpp"
namespace cvtest
{
using std::tr1::tuple;
using std::tr1::get;
using namespace perf;
using namespace testing;
using namespace cv;
using namespace cv::ximgproc;
void MakeArtificialExample(RNG rng, Mat& dst_left_view, Mat& dst_left_disparity_map, Mat& dst_right_disparity_map, Rect& dst_ROI);
CV_ENUM(GuideTypes, CV_8UC3);
CV_ENUM(SrcTypes, CV_16S);
typedef tuple<GuideTypes, SrcTypes, Size, bool, bool> DisparityWLSParams;
typedef TestBaseWithParam<DisparityWLSParams> DisparityWLSFilterPerfTest;
PERF_TEST_P( DisparityWLSFilterPerfTest, perf, Combine(GuideTypes::all(), SrcTypes::all(), Values(sz720p), Values(true,false), Values(true,false)) )
{
RNG rng(0);
DisparityWLSParams params = GetParam();
int guideType = get<0>(params);
int srcType = get<1>(params);
Size sz = get<2>(params);
bool use_conf = get<3>(params);
bool use_downscale = get<4>(params);
Mat guide(sz, guideType);
Mat disp_left(sz, srcType);
Mat disp_right(sz, srcType);
Mat dst(sz, srcType);
Rect ROI;
MakeArtificialExample(rng,guide,disp_left,disp_right,ROI);
if(use_downscale)
{
resize(disp_left,disp_left,Size(),0.5,0.5);
disp_left/=2;
resize(disp_right,disp_right,Size(),0.5,0.5);
disp_right/=2;
ROI = Rect(ROI.x/2,ROI.y/2,ROI.width/2,ROI.height/2);
}
cv::setNumThreads(cv::getNumberOfCPUs());
TEST_CYCLE_N(10)
{
Ptr<DisparityWLSFilter> wls_filter = createDisparityWLSFilter(use_conf);
wls_filter->filter(disp_left,guide,dst,ROI,disp_right);
}
SANITY_CHECK(dst);
}
void MakeArtificialExample(RNG rng, Mat& dst_left_view, Mat& dst_left_disparity_map, Mat& dst_right_disparity_map, Rect& dst_ROI)
{
int w = dst_left_view.cols;
int h = dst_left_view.rows;
//params:
unsigned char bg_level = (unsigned char)rng.uniform(0.0,255.0);
unsigned char fg_level = (unsigned char)rng.uniform(0.0,255.0);
int rect_width = (int)rng.uniform(w/16,w/2);
int rect_height = (int)rng.uniform(h/16,h/2);
int rect_disparity = (int)(0.15*w); //typical maximum disparity value
double sigma = 6.0;
int rect_x_offset = (w-rect_width) /2;
int rect_y_offset = (h-rect_height)/2;
if(dst_left_view.channels()==3)
dst_left_view = Scalar(Vec3b(bg_level,bg_level,bg_level));
else
dst_left_view = Scalar(bg_level);
dst_left_disparity_map = Scalar(0);
dst_right_disparity_map = Scalar(0);
Mat dst_left_view_rect = Mat(dst_left_view, Rect(rect_x_offset,rect_y_offset,rect_width,rect_height));
Mat dst_left_disparity_map_rect = Mat(dst_left_disparity_map,Rect(rect_x_offset,rect_y_offset,rect_width,rect_height));
if(dst_left_view.channels()==3)
dst_left_view_rect = Scalar(Vec3b(fg_level,fg_level,fg_level));
else
dst_left_view_rect = Scalar(fg_level);
dst_left_disparity_map_rect = Scalar(16*rect_disparity);
rect_x_offset-=rect_disparity;
Mat dst_right_disparity_map_rect = Mat(dst_right_disparity_map,Rect(rect_x_offset,rect_y_offset,rect_width,rect_height));
dst_right_disparity_map_rect = Scalar(-16*rect_disparity);
//add some gaussian noise:
unsigned char *l;
short *ldisp, *rdisp;
for(int i=0;i<h;i++)
{
l = dst_left_view.ptr(i);
ldisp = (short*)dst_left_disparity_map.ptr(i);
rdisp = (short*)dst_right_disparity_map.ptr(i);
if(dst_left_view.channels()==3)
{
for(int j=0;j<w;j++)
{
l[0] = saturate_cast<unsigned char>(l[0] + rng.gaussian(sigma));
l[1] = saturate_cast<unsigned char>(l[1] + rng.gaussian(sigma));
l[2] = saturate_cast<unsigned char>(l[2] + rng.gaussian(sigma));
l+=3;
ldisp[0] = saturate_cast<short>(ldisp[0] + rng.gaussian(sigma));
ldisp++;
rdisp[0] = saturate_cast<short>(rdisp[0] + rng.gaussian(sigma));
rdisp++;
}
}
else
{
for(int j=0;j<w;j++)
{
l[0] = saturate_cast<unsigned char>(l[0] + rng.gaussian(sigma));
l++;
ldisp[0] = saturate_cast<short>(ldisp[0] + rng.gaussian(sigma));
ldisp++;
rdisp[0] = saturate_cast<short>(rdisp[0] + rng.gaussian(sigma));
rdisp++;
}
}
}
dst_ROI = Rect(rect_disparity,0,w-rect_disparity,h);
}
}

@ -0,0 +1,81 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3 - clause BSD License)
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met :
*
* *Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and / or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort(including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "perf_precomp.hpp"
namespace cvtest
{
using std::tr1::tuple;
using std::tr1::get;
using namespace perf;
using namespace testing;
using namespace cv;
using namespace cv::ximgproc;
CV_ENUM(GuideTypes, CV_8UC1, CV_8UC3);
CV_ENUM(SrcTypes, CV_8UC1, CV_8UC3, CV_16SC1, CV_16SC3, CV_32FC1, CV_32FC3);
typedef tuple<GuideTypes, SrcTypes, Size> FGSParams;
typedef TestBaseWithParam<FGSParams> FGSFilterPerfTest;
PERF_TEST_P( FGSFilterPerfTest, perf, Combine(GuideTypes::all(), SrcTypes::all(), Values(sz720p)) )
{
RNG rng(0);
FGSParams params = GetParam();
int guideType = get<0>(params);
int srcType = get<1>(params);
Size sz = get<2>(params);
Mat guide(sz, guideType);
Mat src(sz, srcType);
Mat dst(sz, srcType);
declare.in(guide, src, WARMUP_RNG).out(dst).tbb_threads(cv::getNumberOfCPUs());
cv::setNumThreads(cv::getNumberOfCPUs());
TEST_CYCLE_N(10)
{
double lambda = rng.uniform(500.0, 10000.0);
double sigma = rng.uniform(1.0, 100.0);
fastGlobalSmootherFilter(guide,src,dst,lambda,sigma);
}
SANITY_CHECK(dst);
}
}

@ -4,215 +4,306 @@
#include "opencv2/highgui.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/ximgproc/disparity_filter.hpp"
#include <stdio.h>
#include <iostream>
#include <string>
#include <vector>
#include <map>
#if defined(_WIN32)
#include <direct.h>
#else
#include <sys/stat.h>
#endif
using namespace cv;
using namespace cv::ximgproc;
using namespace std;
#define UNKNOWN_DISPARITY 16320
static void print_help()
{
printf("\nDemo for disparity filtering, evaluating speed and performance of different filters\n");
printf("\nUsage: disparity_filtering.exe <path_to_dataset_folder> <path_to_results_folder>\n");
}
Rect computeROI(Size2i src_sz, Ptr<StereoMatcher> matcher_instance);
const String keys =
"{help h usage ? | | print this message }"
"{@left |../data/aloeL.jpg | left view of the stereopair }"
"{@right |../data/aloeR.jpg | right view of the stereopair }"
"{GT |../data/aloeGT.png| optional ground-truth disparity (MPI-Sintel or Middlebury format) }"
"{dst_path |None | optional path to save the resulting filtered disparity map }"
"{dst_raw_path |None | optional path to save raw disparity map before filtering }"
"{algorithm |bm | stereo matching method (bm or sgbm) }"
"{filter |wls_conf | used post-filtering (wls_conf or wls_no_conf) }"
"{no-display | | don't display results }"
"{no-downscale | | prevent stereo matching on downscaled views }"
"{dst_conf_path |None | optional path to save the confidence map used in filtering }"
"{vis_mult |1.0 | coefficient used to scale disparity map visualizations }"
"{max_disparity |160 | parameter of stereo matching }"
"{window_size |19 | parameter of stereo matching }"
"{wls_lambda |8000.0 | parameter of post-filtering }"
"{wls_sigma |1.0 | parameter of post-filtering }"
;
struct dataset_entry
int main(int argc, char** argv)
{
string name;
string dataset_folder;
string left_file,right_file,GT_file;
dataset_entry(string _dataset_folder): dataset_folder(_dataset_folder){}
void readEntry(Mat& dst_left,Mat& dst_right,Mat& dst_GT)
CommandLineParser parser(argc,argv,keys);
parser.about("Disparity Filtering Demo");
if (parser.has("help"))
{
dst_left = imread(dataset_folder+"/"+left_file, IMREAD_COLOR);
dst_right = imread(dataset_folder+"/"+right_file, IMREAD_COLOR);
Mat raw_disp = imread(dataset_folder+"/"+GT_file, IMREAD_COLOR);
dst_GT = Mat(raw_disp.rows,raw_disp.cols,CV_16S);
for(int i=0;i<raw_disp.rows;i++)
for(int j=0;j<raw_disp.cols;j++)
{
Vec3b bgrPixel = raw_disp.at<Vec3b>(i, j);
dst_GT.at<short>(i,j) = 64*bgrPixel.val[2]+bgrPixel.val[1]/4; //16-multiplied disparity
}
parser.printMessage();
return 0;
}
};
struct config
{
Ptr<StereoMatcher> matcher_instance;
Ptr<DisparityFilter> filter_instance;
config(Ptr<StereoMatcher> _matcher_instance,Ptr<DisparityFilter> _filter_instance)
String left_im = parser.get<String>(0);
String right_im = parser.get<String>(1);
String GT_path = parser.get<String>("GT");
String dst_path = parser.get<String>("dst_path");
String dst_raw_path = parser.get<String>("dst_raw_path");
String dst_conf_path = parser.get<String>("dst_conf_path");
String algo = parser.get<String>("algorithm");
String filter = parser.get<String>("filter");
bool no_display = parser.has("no-display");
bool no_downscale = parser.has("no-downscale");
int max_disp = parser.get<int>("max_disparity");
int wsize = parser.get<int>("window_size");
double lambda = parser.get<double>("wls_lambda");
double sigma = parser.get<double>("wls_sigma");
double vis_mult = parser.get<double>("vis_mult");
if (!parser.check())
{
matcher_instance = _matcher_instance;
filter_instance = _filter_instance;
parser.printErrors();
return -1;
}
config() {}
};
void operator>>(const FileNode& node,dataset_entry& entry);
double computeMSE(Mat& GT, Mat& src, Rect ROI);
double computeBadPixelPercent(Mat& GT, Mat& src, Rect ROI, int thresh=24/*1.5 pixels*/);
void getDisparityVis(Mat& disparity_map,Mat& dst);
Rect computeROI(Size2i src_sz, Ptr<StereoMatcher> matcher_instance);
void setConfigsForTesting(map<string,config>& cfgs);
void CreateDir(string path);
int main(int argc, char** argv)
{
if(argc < 3)
Mat left = imread(left_im ,IMREAD_COLOR);
if ( left.empty() )
{
print_help();
return 0;
cout<<"Cannot read image file: "<<left_im;
return -1;
}
string dataset_folder(argv[1]);
string res_folder(argv[2]);
map<string,config> configs_for_testing;
setConfigsForTesting(configs_for_testing);
CreateDir(res_folder);
Mat right = imread(right_im,IMREAD_COLOR);
if ( right.empty() )
{
cout<<"Cannot read image file: "<<right_im;
return -1;
}
for (map<string,config>::iterator cfg = configs_for_testing.begin(); cfg != configs_for_testing.end(); cfg++)
bool noGT;
Mat GT_disp;
if (GT_path=="../data/aloeGT.png" && left_im!="../data/aloeL.jpg")
noGT=true;
else
{
string vis_folder = res_folder+"/vis_"+cfg->first;
CreateDir(vis_folder);
string cfg_file_name = res_folder+"/"+cfg->first+"_res.csv";
FILE* cur_cfg_res_file = fopen(cfg_file_name.c_str(),"w");
fprintf(cur_cfg_res_file,"Name,MSE,MSE after postfiltering,Percent bad,Percent bad after postfiltering,Matcher Execution Time(s),Filter Execution Time(s)\n");
printf("Processing configuration: %s\n",cfg->first.c_str());
FileStorage fs(dataset_folder + "/_dataset.xml", FileStorage::READ);
FileNode n = fs["data_set"];
double MSE_pre,percent_pre,MSE_post,percent_post,matching_time,filtering_time;
double average_MSE_pre=0,average_percent_pre=0,average_MSE_post=0,
average_percent_post=0,average_matching_time=0,average_filtering_time=0;
int cnt = 0;
for (FileNodeIterator it = n.begin(); it != n.end(); it++)
noGT=false;
if(readGT(GT_path,GT_disp)!=0)
{
dataset_entry entry(dataset_folder);
(*it)>>entry;
printf("%s ",entry.name.c_str());
Mat left,right,GT;
entry.readEntry(left,right,GT);
Mat raw_disp;
Mat left_gray; cvtColor(left, left_gray, COLOR_BGR2GRAY );
Mat right_gray; cvtColor(right, right_gray, COLOR_BGR2GRAY );
matching_time = (double)getTickCount();
cfg->second.matcher_instance->compute(left_gray,right_gray,raw_disp);
matching_time = ((double)getTickCount() - matching_time)/getTickFrequency();
Rect ROI = computeROI(left.size(),cfg->second.matcher_instance);
Mat filtered_disp;
filtering_time = (double)getTickCount();
cfg->second.filter_instance->filter(raw_disp,left,filtered_disp,ROI);
filtering_time = ((double)getTickCount() - filtering_time)/getTickFrequency();
cout<<"Cannot read ground truth image file: "<<GT_path<<endl;
return -1;
}
}
Mat left_for_matcher, right_for_matcher;
Mat left_disp,right_disp;
Mat filtered_disp;
Mat conf_map = Mat(left.rows,left.cols,CV_8U);
conf_map = Scalar(255);
Rect ROI;
double matching_time, filtering_time;
if(max_disp<=0 || max_disp%16!=0)
{
cout<<"Incorrect max_disparity value: it should be positive and divisible by 16";
return -1;
}
if(wsize<=0 || wsize%2!=1)
{
cout<<"Incorrect window_size value: it should be positive and odd";
return -1;
}
if(filter=="wls_conf")
{
if(!no_downscale)
{
wsize = wsize/2;
if(wsize%2==0) wsize++;
max_disp/=2;
if(max_disp%16!=0)
max_disp += 16-(max_disp%16);
resize(left ,left_for_matcher ,Size(),0.5,0.5);
resize(right,right_for_matcher,Size(),0.5,0.5);
}
else
{
left_for_matcher = left.clone();
right_for_matcher = right.clone();
}
MSE_pre = computeMSE(GT,raw_disp,ROI);
percent_pre = computeBadPixelPercent(GT,raw_disp,ROI);
if(algo=="bm")
{
    Ptr<StereoBM> left_matcher = StereoBM::create(max_disp,wsize);
    left_matcher->setMinDisparity(0);
    Ptr<StereoBM> right_matcher = StereoBM::create(max_disp,wsize);
    right_matcher->setMinDisparity(-max_disp+1);
    left_matcher->setTextureThreshold(0);
    left_matcher->setUniquenessRatio(0);
    right_matcher->setTextureThreshold(0);
    right_matcher->setUniquenessRatio(0);
    cvtColor(left_for_matcher, left_for_matcher, COLOR_BGR2GRAY);
    cvtColor(right_for_matcher, right_for_matcher, COLOR_BGR2GRAY);
    ROI = computeROI(left_for_matcher.size(),left_matcher);

    matching_time = (double)getTickCount();
    left_matcher-> compute(left_for_matcher, right_for_matcher,left_disp);
    right_matcher->compute(right_for_matcher,left_for_matcher, right_disp);
    matching_time = ((double)getTickCount() - matching_time)/getTickFrequency();
}
else if(algo=="sgbm")
{
    Ptr<StereoSGBM> left_matcher = StereoSGBM::create(0,max_disp,wsize);
    left_matcher->setMinDisparity(0);
    Ptr<StereoSGBM> right_matcher = StereoSGBM::create(-max_disp+1,max_disp,wsize);
    left_matcher->setUniquenessRatio(0);
    left_matcher->setDisp12MaxDiff(1000000);
    left_matcher->setSpeckleWindowSize(0);
    right_matcher->setUniquenessRatio(0);
    right_matcher->setDisp12MaxDiff(1000000);
    right_matcher->setSpeckleWindowSize(0);
    ROI = computeROI(left_for_matcher.size(),left_matcher);

    matching_time = (double)getTickCount();
    left_matcher-> compute(left_for_matcher, right_for_matcher,left_disp);
    right_matcher->compute(right_for_matcher,left_for_matcher, right_disp);
    matching_time = ((double)getTickCount() - matching_time)/getTickFrequency();
}
else
{
    cout<<"Unsupported algorithm";
    return -1;
}

Ptr<DisparityWLSFilter> wls_filter = createDisparityWLSFilter(true);
wls_filter->setLambda(lambda);
wls_filter->setSigmaColor(sigma);
filtering_time = (double)getTickCount();
wls_filter->filter(left_disp,left,filtered_disp,ROI,right_disp);
filtering_time = ((double)getTickCount() - filtering_time)/getTickFrequency();
conf_map = wls_filter->getConfidenceMap();

if(!no_downscale)
{
    // upscale raw disparity and ROI back for a proper comparison:
    resize(left_disp,left_disp,Size(),2.0,2.0);
    left_disp = left_disp*2.0;
    ROI = Rect(ROI.x*2,ROI.y*2,ROI.width*2,ROI.height*2);
}
}
else if(filter=="wls_no_conf")
{
    left_for_matcher  = left.clone();
    right_for_matcher = right.clone();
    if(algo=="bm")
    {
        Ptr<StereoBM> matcher = StereoBM::create(max_disp,wsize);
        matcher->setTextureThreshold(0);
        matcher->setUniquenessRatio(0);
        cvtColor(left_for_matcher, left_for_matcher, COLOR_BGR2GRAY);
        cvtColor(right_for_matcher, right_for_matcher, COLOR_BGR2GRAY);
        ROI = computeROI(left_for_matcher.size(),matcher);

        matching_time = (double)getTickCount();
        matcher->compute(left_for_matcher,right_for_matcher,left_disp);
        matching_time = ((double)getTickCount() - matching_time)/getTickFrequency();
    }
    else if(algo=="sgbm")
    {
        Ptr<StereoSGBM> matcher = StereoSGBM::create(0,max_disp,wsize);
        matcher->setUniquenessRatio(0);
        matcher->setDisp12MaxDiff(1000000);
        matcher->setSpeckleWindowSize(0);
        ROI = computeROI(left_for_matcher.size(),matcher);

        matching_time = (double)getTickCount();
        matcher->compute(left_for_matcher,right_for_matcher,left_disp);
        matching_time = ((double)getTickCount() - matching_time)/getTickFrequency();
    }
    else
    {
        cout<<"Unsupported algorithm";
        return -1;
    }

    Ptr<DisparityWLSFilter> wls_filter = createDisparityWLSFilter(false);
    wls_filter->setLambda(lambda);
    wls_filter->setSigmaColor(sigma);
    filtering_time = (double)getTickCount();
    wls_filter->filter(left_disp,left,filtered_disp,ROI);
    filtering_time = ((double)getTickCount() - filtering_time)/getTickFrequency();
}
else
{
    cout<<"Unsupported filter";
    return -1;
}

//collect and print all the stats:
cout.precision(2);
cout<<"Matching time: "<<matching_time<<"s"<<endl;
cout<<"Filtering time: "<<filtering_time<<"s"<<endl;
cout<<endl;
double MSE_before,percent_bad_before,MSE_after,percent_bad_after;
if(!noGT)
{
    MSE_before = computeMSE(GT_disp,left_disp,ROI);
    percent_bad_before = computeBadPixelPercent(GT_disp,left_disp,ROI);
    MSE_after = computeMSE(GT_disp,filtered_disp,ROI);
    percent_bad_after = computeBadPixelPercent(GT_disp,filtered_disp,ROI);
    cout.precision(5);
    cout<<"MSE before filtering: "<<MSE_before<<endl;
    cout<<"MSE after filtering: "<<MSE_after<<endl;
    cout<<endl;
    cout.precision(3);
    cout<<"Percent of bad pixels before filtering: "<<percent_bad_before<<endl;
    cout<<"Percent of bad pixels after filtering: "<<percent_bad_after<<endl;
}
if(dst_path!="None")
{
    Mat filtered_disp_vis;
    getDisparityVis(filtered_disp,filtered_disp_vis,vis_mult);
    imwrite(dst_path,filtered_disp_vis);
}
if(dst_raw_path!="None")
{
    Mat raw_disp_vis;
    getDisparityVis(left_disp,raw_disp_vis,vis_mult);
    imwrite(dst_raw_path,raw_disp_vis);
}
if(dst_conf_path!="None")
{
    imwrite(dst_conf_path,conf_map);
}
if(!no_display)
{
    namedWindow("left", WINDOW_AUTOSIZE);
    imshow("left", left);
    namedWindow("right", WINDOW_AUTOSIZE);
    imshow("right", right);
    if(!noGT)
    {
        Mat GT_disp_vis;
        getDisparityVis(GT_disp,GT_disp_vis,vis_mult);
        namedWindow("ground-truth disparity", WINDOW_AUTOSIZE);
        imshow("ground-truth disparity", GT_disp_vis);
    }
    Mat raw_disp_vis;
    getDisparityVis(left_disp,raw_disp_vis,vis_mult);
    namedWindow("raw disparity", WINDOW_AUTOSIZE);
    imshow("raw disparity", raw_disp_vis);
    Mat filtered_disp_vis;
    getDisparityVis(filtered_disp,filtered_disp_vis,vis_mult);
    namedWindow("filtered disparity", WINDOW_AUTOSIZE);
    imshow("filtered disparity", filtered_disp_vis);
    waitKey();
}
return 0;
}

// ---- removed by this patch (old evaluation-loop version of the sample) ----
printf("- Done\n");
MSE_post = computeMSE(GT,filtered_disp,ROI);
percent_post = computeBadPixelPercent(GT,filtered_disp,ROI);
fprintf(cur_cfg_res_file,"%s,%.1f,%.1f,%.1f,%.1f,%.3f,%.3f\n",entry.name.c_str(),MSE_pre,MSE_post,
        percent_pre,percent_post,matching_time,filtering_time);
average_matching_time+=matching_time; average_filtering_time+=filtering_time;
average_MSE_pre+=MSE_pre; average_percent_pre+=percent_pre;
average_MSE_post+=MSE_post; average_percent_post+=percent_post;
cnt++;

// dump visualizations:
imwrite(vis_folder + "/" + entry.name + "_left.png",left);
Mat GT_vis,raw_disp_vis,filtered_disp_vis;
getDisparityVis(GT,GT_vis);
getDisparityVis(raw_disp,raw_disp_vis);
getDisparityVis(filtered_disp,filtered_disp_vis);
imwrite(vis_folder + "/" + entry.name + "_disparity_GT.png",GT_vis);
imwrite(vis_folder + "/" + entry.name + "_disparity_raw.png",raw_disp_vis);
imwrite(vis_folder + "/" + entry.name + "_disparity_filtered.png",filtered_disp_vis);
}
fprintf(cur_cfg_res_file,"%s,%.1f,%.1f,%.1f,%.1f,%.3f,%.3f\n","average",average_MSE_pre/cnt,
        average_MSE_post/cnt,average_percent_pre/cnt,average_percent_post/cnt,
        average_matching_time/cnt,average_filtering_time/cnt);
fclose(cur_cfg_res_file);
}
return 0;
}
void operator>>(const FileNode& node,dataset_entry& entry)
{
    node["name"] >> entry.name;
    node["left_file"] >> entry.left_file;
    node["right_file"] >> entry.right_file;
    node["GT_file"] >> entry.GT_file;
}
double computeMSE(Mat& GT, Mat& src, Rect ROI)
{
    double res = 0;
    Mat GT_ROI(GT,ROI);
    Mat src_ROI(src,ROI);
    int cnt=0;
    for(int i=0;i<src_ROI.rows;i++)
        for(int j=0;j<src_ROI.cols;j++)
        {
            if(GT_ROI.at<short>(i,j)!=UNKNOWN_DISPARITY)
            {
                res += (GT_ROI.at<short>(i,j) - src_ROI.at<short>(i,j))*(GT_ROI.at<short>(i,j) - src_ROI.at<short>(i,j));
                cnt++;
            }
        }
    res /= cnt*256;
    return res;
}
double computeBadPixelPercent(Mat& GT, Mat& src, Rect ROI, int thresh)
{
    int bad_pixel_num = 0;
    Mat GT_ROI(GT,ROI);
    Mat src_ROI(src,ROI);
    int cnt=0;
    for(int i=0;i<src_ROI.rows;i++)
        for(int j=0;j<src_ROI.cols;j++)
        {
            if(GT_ROI.at<short>(i,j)!=UNKNOWN_DISPARITY)
            {
                if( abs(GT_ROI.at<short>(i,j) - src_ROI.at<short>(i,j))>=thresh )
                    bad_pixel_num++;
                cnt++;
            }
        }
    return (100.0*bad_pixel_num)/cnt;
}
void getDisparityVis(Mat& disparity_map,Mat& dst)
{
    dst = Mat(disparity_map.rows,disparity_map.cols,CV_8UC3);
    for(int i=0;i<dst.rows;i++)
        for(int j=0;j<dst.cols;j++)
        {
            if(disparity_map.at<short>(i,j)==UNKNOWN_DISPARITY)
                dst.at<Vec3b>(i,j) = Vec3b(0,0,0);
            else
                dst.at<Vec3b>(i,j) = Vec3b(saturate_cast<unsigned char>(disparity_map.at<short>(i,j)/8),
                                           saturate_cast<unsigned char>(disparity_map.at<short>(i,j)/8),
                                           saturate_cast<unsigned char>(disparity_map.at<short>(i,j)/8));
        }
}
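For reference, the dataset_entry reader above expects records shaped roughly like the following in OpenCV FileStorage YAML (the top-level node name and file paths here are illustrative, not taken from this patch):

%YAML:1.0
datasets:
   - { name:"cones", left_file:"cones/im2.png", right_file:"cones/im6.png", GT_file:"cones/disp2.png" }
   - { name:"teddy", left_file:"teddy/im2.png", right_file:"teddy/im6.png", GT_file:"teddy/disp2.png" }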
@ -225,34 +316,10 @@ Rect computeROI(Size2i src_sz, Ptr<StereoMatcher> matcher_instance)
int minD = min_disparity, maxD = min_disparity + num_disparities - 1;
int xmin = maxD + bs2;
int xmax = src_sz.width + minD - bs2; // was: src_sz.width - minD - bs2 (wrong when minDisparity is negative)
int ymin = bs2;
int ymax = src_sz.height - bs2;
Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
return r;
}
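A quick numeric check of this ROI logic (values are illustrative, not from the patch): for a 640x480 view with max_disp = 128, wsize = 21 and the left matcher (minDisparity = 0), we get bs2 = 10, minD = 0, maxD = 127, hence xmin = 137 and xmax = 640 + 0 - 10 = 630, so the valid region is Rect(137, 10, 493, 460). The wide left margin excludes pixels whose match would fall outside the right image.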
void setConfigsForTesting(map<string,config>& cfgs)
{
Ptr<StereoBM> stereobm_matcher = StereoBM::create(128,21);
stereobm_matcher->setTextureThreshold(0);
stereobm_matcher->setUniquenessRatio(0);
Ptr<DisparityFilter> wls_filter = createDisparityWLSFilter();
Ptr<DisparityFilter> dt_filter = createDisparityDTFilter();
Ptr<DisparityFilter> guided_filter = createDisparityGuidedFilter();
cfgs["stereobm_wls"] = config(stereobm_matcher,wls_filter);
cfgs["stereobm_dtf"] = config(stereobm_matcher,dt_filter);
cfgs["stereobm_gf"] = config(stereobm_matcher,guided_filter);
}
void CreateDir(string path)
{
#if defined(_WIN32)
_mkdir(path.c_str());
#else
mkdir(path.c_str(), 0777);
#endif
}

@ -36,149 +36,429 @@
#include "precomp.hpp"
#include "opencv2/ximgproc/disparity_filter.hpp"
#include "opencv2/highgui.hpp"
#include <math.h>
#include <vector>
namespace cv {
namespace ximgproc {
using std::vector;
#define EPS 1e-43f

// ---- removed (old implementation) ----
void setROI(InputArray disparity_map, InputArray left_view, OutputArray filtered_disparity_map,
            Mat& disp,Mat& src,Mat& dst,Rect ROI);

class DisparityDTFilterImpl : public DisparityDTFilter
{
protected:
    double sigmaSpatial,sigmaColor;
    void init(double _sigmaSpatial, double _sigmaColor)
    {
        sigmaColor = _sigmaColor;
        sigmaSpatial = _sigmaSpatial;
    }
public:
    double getSigmaSpatial() {return sigmaSpatial;}
    void setSigmaSpatial(double _sigmaSpatial) {sigmaSpatial = _sigmaSpatial;}
    double getSigmaColor() {return sigmaColor;}
    void setSigmaColor(double _sigmaColor) {sigmaColor = _sigmaColor;}
    static Ptr<DisparityDTFilterImpl> create()
    {
        DisparityDTFilterImpl *dtf = new DisparityDTFilterImpl();
        dtf->init(25.0,60.0);
        return Ptr<DisparityDTFilterImpl>(dtf);
    }
    void filter(InputArray disparity_map, InputArray left_view, OutputArray filtered_disparity_map,Rect ROI)
    {
        Mat disp,src,dst;
        setROI(disparity_map,left_view,filtered_disparity_map,disp,src,dst,ROI);
        Mat disp_32F,dst_32F;
        disp.convertTo(disp_32F,CV_32F);
        dtFilter(src,disp_32F,dst_32F,sigmaSpatial,sigmaColor);
        dst_32F.convertTo(dst,CV_16S);
    }
};
CV_EXPORTS_W
Ptr<DisparityDTFilter> createDisparityDTFilter()
{
    return Ptr<DisparityDTFilter>(DisparityDTFilterImpl::create());
}

// ---- added (confidence-aware DisparityWLSFilter) ----
class DisparityWLSFilterImpl : public DisparityWLSFilter
{
protected:
    double lambda,sigma_color;
    bool use_confidence;
    Mat confidence_map;
    int LRC_thresh,depth_discontinuity_radius;
    float depth_discontinuity_roll_off_factor;
    float resize_factor;
    int num_stripes;
    void init(double _lambda, double _sigma_color, bool _use_confidence);
    void computeDepthDiscontinuityMaps(Mat& left_disp, Mat& right_disp, Mat& left_dst, Mat& right_dst, Rect ROI);
    void computeConfidenceMap(InputArray left_disp, InputArray right_disp, Rect ROI);
protected:
    struct ComputeDiscontinuityAwareLRC_ParBody : public ParallelLoopBody
    {
        DisparityWLSFilterImpl* wls;
        Mat *left_disp, *right_disp;
        Mat *left_disc, *right_disc, *dst;
        Rect left_ROI, right_ROI;
        int nstripes, stripe_sz;
        ComputeDiscontinuityAwareLRC_ParBody(DisparityWLSFilterImpl& _wls, Mat& _left_disp, Mat& _right_disp, Mat& _left_disc, Mat& _right_disc, Mat& _dst, Rect _left_ROI, Rect _right_ROI, int _nstripes);
        void operator () (const Range& range) const;
    };
    struct ComputeDepthDisc_ParBody : public ParallelLoopBody
    {
        DisparityWLSFilterImpl* wls;
        Mat *disp,*disp_squares,*dst;
        int nstripes, stripe_sz;
        ComputeDepthDisc_ParBody(DisparityWLSFilterImpl& _wls, Mat& _disp, Mat& _disp_squares, Mat& _dst, int _nstripes);
        void operator () (const Range& range) const;
    };
    typedef void (DisparityWLSFilterImpl::*MatOp)(Mat& src, Mat& dst);
    struct ParallelMatOp_ParBody : public ParallelLoopBody
    {
        DisparityWLSFilterImpl* wls;
        vector<MatOp> ops;
        vector<Mat*> src;
        vector<Mat*> dst;
        ParallelMatOp_ParBody(DisparityWLSFilterImpl& _wls, vector<MatOp> _ops, vector<Mat*>& _src, vector<Mat*>& _dst);
        void operator () (const Range& range) const;
    };
    void boxFilterOp(Mat& src,Mat& dst)
    {
        int rad = (int)ceil(resize_factor*depth_discontinuity_radius);
        boxFilter(src,dst,CV_32F,Size(2*rad+1,2*rad+1),Point(-1,-1));
    }
    void sqrBoxFilterOp(Mat& src,Mat& dst)
    {
        int rad = (int)ceil(resize_factor*depth_discontinuity_radius);
        sqrBoxFilter(src,dst,CV_32F,Size(2*rad+1,2*rad+1),Point(-1,-1));
    }
    void copyToOp(Mat& src,Mat& dst)
    {
        src.copyTo(dst);
    }
public:
    static Ptr<DisparityWLSFilterImpl> create(bool _use_confidence);
    void filter(InputArray disparity_map_left, InputArray left_view, OutputArray filtered_disparity_map, Rect ROI, InputArray disparity_map_right, InputArray);
    double getLambda() {return lambda;}
    void setLambda(double _lambda) {lambda = _lambda;}
    double getSigmaColor() {return sigma_color;}
    void setSigmaColor(double _sigma_color) {sigma_color = _sigma_color;}
    Mat getConfidenceMap() {return confidence_map;}
    int getLRCthresh() {return LRC_thresh;}
    void setLRCthresh(int _LRC_thresh) {LRC_thresh = _LRC_thresh;}
    int getDepthDiscontinuityRadius() {return depth_discontinuity_radius;}
    void setDepthDiscontinuityRadius(int _disc_radius) {depth_discontinuity_radius = _disc_radius;}
};
void DisparityWLSFilterImpl::init(double _lambda, double _sigma_color, bool _use_confidence)
{
    lambda = _lambda;
    sigma_color = _sigma_color;
    use_confidence = _use_confidence;
    confidence_map = Mat();
    LRC_thresh = 24;
    depth_discontinuity_radius = 5;
    depth_discontinuity_roll_off_factor = 0.001f;
    resize_factor = 1.0;
    num_stripes = getNumThreads();
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
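In terms of the members declared above, the confidence map that this implementation builds combines a left-right-consistency (LRC) check with a local-variance penalty around depth discontinuities (this is a sketch of the logic read off the code below, not a formula quoted from the patch):

\[
\mathrm{disc}(p) = \max\bigl(1 - \gamma\,\operatorname{Var}_{W(p)}(d),\; 0\bigr), \qquad
\mathrm{conf}(p) =
\begin{cases}
\min\bigl(\mathrm{disc}_L(p), \mathrm{disc}_R(p')\bigr) & \text{if } |d_L(p) + d_R(p')| < \tau_{LRC},\\
0 & \text{otherwise,}
\end{cases}
\]

where \(p' = p - d_L(p)\) is the matching pixel in the right view, \(\operatorname{Var}_{W(p)}(d)\) is the disparity variance over a box window of radius depth_discontinuity_radius (computed via the boxFilter/sqrBoxFilter ops), \(\gamma\) is depth_discontinuity_roll_off_factor and \(\tau_{LRC}\) is LRC_thresh; the result is finally scaled by 255.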
// ---- removed (old implementation) ----
class DisparityGuidedFilterImpl : public DisparityGuidedFilter
{
protected:
    int radius;
    double eps;
    void init(int _radius, double _eps)
    {
        radius = _radius;
        eps = _eps;
    }
public:
    double getEps() {return eps;}
    void setEps(double _eps) {eps = _eps;}
    int getRadius() {return radius;}
    void setRadius(int _radius) {radius = _radius;}
    static Ptr<DisparityGuidedFilterImpl> create()
    {
        DisparityGuidedFilterImpl *gf = new DisparityGuidedFilterImpl();
        gf->init(20,100.0);
        return Ptr<DisparityGuidedFilterImpl>(gf);
    }
    void filter(InputArray disparity_map, InputArray left_view, OutputArray filtered_disparity_map,Rect ROI)
    {
        Mat disp,src,dst;
        setROI(disparity_map,left_view,filtered_disparity_map,disp,src,dst,ROI);
        Mat disp_32F,dst_32F;
        disp.convertTo(disp_32F,CV_32F);
        guidedFilter(src,disp_32F,dst_32F,radius,eps);
        dst_32F.convertTo(dst,CV_16S);
    }
};
CV_EXPORTS_W
Ptr<DisparityGuidedFilter> createDisparityGuidedFilter()
{
    return Ptr<DisparityGuidedFilter>(DisparityGuidedFilterImpl::create());
}

// ---- added ----
void DisparityWLSFilterImpl::computeDepthDiscontinuityMaps(Mat& left_disp, Mat& right_disp, Mat& left_dst, Mat& right_dst, Rect ROI)
{
    Rect right_ROI(left_disp.cols-(ROI.x+ROI.width),ROI.y,ROI.width,ROI.height);
    Mat left_disp_ROI (left_disp, ROI);
    Mat right_disp_ROI(right_disp,right_ROI);
    Mat ldisp,rdisp,ldisp_squared,rdisp_squared;
    {
        vector<Mat*> _src; _src.push_back(&left_disp_ROI);_src.push_back(&right_disp_ROI);
                           _src.push_back(&left_disp_ROI);_src.push_back(&right_disp_ROI);
        vector<Mat*> _dst; _dst.push_back(&ldisp);        _dst.push_back(&rdisp);
                           _dst.push_back(&ldisp_squared);_dst.push_back(&rdisp_squared);
        vector<MatOp> _ops; _ops.push_back(&DisparityWLSFilterImpl::copyToOp); _ops.push_back(&DisparityWLSFilterImpl::copyToOp);
                            _ops.push_back(&DisparityWLSFilterImpl::copyToOp); _ops.push_back(&DisparityWLSFilterImpl::copyToOp);
        parallel_for_(Range(0,4),ParallelMatOp_ParBody(*this,_ops,_src,_dst));
    }
    {
        vector<Mat*> _src; _src.push_back(&ldisp);        _src.push_back(&rdisp);
                           _src.push_back(&ldisp_squared);_src.push_back(&rdisp_squared);
        vector<Mat*> _dst; _dst.push_back(&ldisp);        _dst.push_back(&rdisp);
                           _dst.push_back(&ldisp_squared);_dst.push_back(&rdisp_squared);
        vector<MatOp> _ops; _ops.push_back(&DisparityWLSFilterImpl::boxFilterOp);   _ops.push_back(&DisparityWLSFilterImpl::boxFilterOp);
                            _ops.push_back(&DisparityWLSFilterImpl::sqrBoxFilterOp);_ops.push_back(&DisparityWLSFilterImpl::sqrBoxFilterOp);
        parallel_for_(Range(0,4),ParallelMatOp_ParBody(*this,_ops,_src,_dst));
    }
    left_dst = Mat::zeros(left_disp.rows,left_disp.cols,CV_32F);
    right_dst = Mat::zeros(right_disp.rows,right_disp.cols,CV_32F);
    Mat left_dst_ROI (left_dst,ROI);
    Mat right_dst_ROI(right_dst,right_ROI);
    parallel_for_(Range(0,num_stripes),ComputeDepthDisc_ParBody(*this,ldisp,ldisp_squared,left_dst_ROI ,num_stripes));
    parallel_for_(Range(0,num_stripes),ComputeDepthDisc_ParBody(*this,rdisp,rdisp_squared,right_dst_ROI,num_stripes));
}
void DisparityWLSFilterImpl::computeConfidenceMap(InputArray left_disp, InputArray right_disp, Rect ROI)
{
    Mat ldisp = left_disp.getMat();
    Mat rdisp = right_disp.getMat();
    Mat depth_discontinuity_map_left,depth_discontinuity_map_right;
    computeDepthDiscontinuityMaps(ldisp,rdisp,depth_discontinuity_map_left,depth_discontinuity_map_right,ROI);
    Rect right_ROI(ldisp.cols-(ROI.x+ROI.width),ROI.y,ROI.width,ROI.height);
    confidence_map = depth_discontinuity_map_left;
    parallel_for_(Range(0,num_stripes),ComputeDiscontinuityAwareLRC_ParBody(*this,ldisp,rdisp, depth_discontinuity_map_left,depth_discontinuity_map_right,confidence_map,ROI,right_ROI,num_stripes));
    confidence_map = 255.0f*confidence_map;
}
Ptr<DisparityWLSFilterImpl> DisparityWLSFilterImpl::create(bool _use_confidence)
{
    DisparityWLSFilterImpl *wls = new DisparityWLSFilterImpl();
    wls->init(8000.0,1.0,_use_confidence);
    return Ptr<DisparityWLSFilterImpl>(wls);
}
void DisparityWLSFilterImpl::filter(InputArray disparity_map_left, InputArray left_view, OutputArray filtered_disparity_map, Rect ROI, InputArray disparity_map_right, InputArray)
{
    CV_Assert( !disparity_map_left.empty() && (disparity_map_left.depth() == CV_16S) && (disparity_map_left.channels() == 1) );
    CV_Assert( !left_view.empty() && (left_view.depth() == CV_8U) && (left_view.channels() == 3 || left_view.channels() == 1) );
    Mat disp,src,dst;
    if(disparity_map_left.size()!=left_view.size())
        resize_factor = disparity_map_left.cols()/(float)left_view.cols();
    else
        resize_factor = 1.0;
    if(!use_confidence)
    {
        Mat disp_full_size = disparity_map_left.getMat();
        Mat src_full_size = left_view.getMat();
        if(disp_full_size.size!=src_full_size.size)
        {
            float x_ratio = src_full_size.cols/(float)disp_full_size.cols;
            float y_ratio = src_full_size.rows/(float)disp_full_size.rows;
            resize(disp_full_size,disp_full_size,src_full_size.size());
            disp_full_size = disp_full_size*x_ratio;
            ROI = Rect((int)(ROI.x*x_ratio),(int)(ROI.y*y_ratio),(int)(ROI.width*x_ratio),(int)(ROI.height*y_ratio));
        }
        disp = Mat(disp_full_size,ROI);
        src = Mat(src_full_size ,ROI);
        filtered_disparity_map.create(disp_full_size.size(), disp_full_size.type());
        Mat& dst_full_size = filtered_disparity_map.getMatRef();
        dst_full_size = Scalar(-16);
        dst = Mat(dst_full_size,ROI);
        Mat filtered_disp;
        fastGlobalSmootherFilter(src,disp,filtered_disp,lambda,sigma_color);
        filtered_disp.copyTo(dst);
    }
    else
    {
        CV_Assert( !disparity_map_right.empty() && (disparity_map_right.depth() == CV_16S) && (disparity_map_right.channels() == 1) );
        CV_Assert( (disparity_map_left.cols() == disparity_map_right.cols()) );
        CV_Assert( (disparity_map_left.rows() == disparity_map_right.rows()) );
        computeConfidenceMap(disparity_map_left,disparity_map_right,ROI);
        Mat disp_full_size = disparity_map_left.getMat();
        Mat src_full_size = left_view.getMat();
        if(disp_full_size.size!=src_full_size.size)
        {
            float x_ratio = src_full_size.cols/(float)disp_full_size.cols;
            float y_ratio = src_full_size.rows/(float)disp_full_size.rows;
            resize(disp_full_size,disp_full_size,src_full_size.size());
            disp_full_size = disp_full_size*x_ratio;
            resize(confidence_map,confidence_map,src_full_size.size());
            ROI = Rect((int)(ROI.x*x_ratio),(int)(ROI.y*y_ratio),(int)(ROI.width*x_ratio),(int)(ROI.height*y_ratio));
        }
        disp = Mat(disp_full_size,ROI);
        src = Mat(src_full_size ,ROI);
        filtered_disparity_map.create(disp_full_size.size(), disp_full_size.type());
        Mat& dst_full_size = filtered_disparity_map.getMatRef();
        dst_full_size = Scalar(-16);
        dst = Mat(dst_full_size,ROI);
        Mat conf(confidence_map,ROI);
        Mat disp_mul_conf;
        disp.convertTo(disp_mul_conf,CV_32F);
        disp_mul_conf = conf.mul(disp_mul_conf);
        Mat conf_filtered;
        Ptr<FastGlobalSmootherFilter> wls = createFastGlobalSmootherFilter(src,lambda,sigma_color);
        wls->filter(disp_mul_conf,disp_mul_conf);
        wls->filter(conf,conf_filtered);
        disp_mul_conf = disp_mul_conf.mul(1/(conf_filtered+EPS));
        disp_mul_conf.convertTo(dst,CV_16S);
    }
}
DisparityWLSFilterImpl::ComputeDiscontinuityAwareLRC_ParBody::ComputeDiscontinuityAwareLRC_ParBody(DisparityWLSFilterImpl& _wls, Mat& _left_disp, Mat& _right_disp, Mat& _left_disc, Mat& _right_disc, Mat& _dst, Rect _left_ROI, Rect _right_ROI, int _nstripes):
wls(&_wls),left_disp(&_left_disp),right_disp(&_right_disp),left_disc(&_left_disc),right_disc(&_right_disc),dst(&_dst),left_ROI(_left_ROI),right_ROI(_right_ROI),nstripes(_nstripes)
{
    stripe_sz = (int)ceil(left_disp->rows/(double)nstripes);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// ---- removed (old implementation) ----
class DisparityWLSFilterImpl : public DisparityWLSFilter
{
protected:
    double lambda,sigma_color;
    void init(double _lambda, double _sigma_color)
    {
        lambda = _lambda;
        sigma_color = _sigma_color;
    }
public:
    double getLambda() {return lambda;}
    void setLambda(double _lambda) {lambda = _lambda;}
    double getSigmaColor() {return sigma_color;}
    void setSigmaColor(double _sigma_color) {sigma_color = _sigma_color;}
    static Ptr<DisparityWLSFilterImpl> create()
    {
        DisparityWLSFilterImpl *wls = new DisparityWLSFilterImpl();
        wls->init(500.0,2.0);
        return Ptr<DisparityWLSFilterImpl>(wls);
    }
    void filter(InputArray disparity_map, InputArray left_view, OutputArray filtered_disparity_map,Rect ROI)
    {
        Mat disp,src,dst;
        setROI(disparity_map,left_view,filtered_disparity_map,disp,src,dst,ROI);
        weightedLeastSquaresFilter(src,disp,dst,lambda,sigma_color);
    }
};
CV_EXPORTS_W
Ptr<DisparityWLSFilter> createDisparityWLSFilter()
{
    return Ptr<DisparityWLSFilter>(DisparityWLSFilterImpl::create());
}
void setROI(InputArray disparity_map, InputArray left_view, OutputArray filtered_disparity_map,
            Mat& disp,Mat& src,Mat& dst,Rect ROI)
{
    Mat disp_full_size = disparity_map.getMat();
    Mat src_full_size = left_view.getMat();
    CV_Assert( (disp_full_size.depth() == CV_16S) && (disp_full_size.channels() == 1) );
    CV_Assert( (src_full_size.depth() == CV_8U) && (src_full_size.channels() <= 4) );
    disp = Mat(disp_full_size,ROI);
    src = Mat(src_full_size ,ROI);
    filtered_disparity_map.create(disp_full_size.size(), disp_full_size.type());
    Mat& dst_full_size = filtered_disparity_map.getMatRef();
    dst = Mat(dst_full_size,ROI);
}

// ---- added ----
void DisparityWLSFilterImpl::ComputeDiscontinuityAwareLRC_ParBody::operator() (const Range& range) const
{
    short* row_left;
    float* row_left_conf;
    short* row_right;
    float* row_right_conf;
    float* row_dst;
    int right_idx;
    int h = left_disp->rows;
    int start = std::min(range.start * stripe_sz, h);
    int end   = std::min(range.end   * stripe_sz, h);
    int thresh = (int)(wls->resize_factor*wls->LRC_thresh);
    for(int i=start;i<end;i++)
    {
        row_left = (short*)left_disp->ptr(i);
        row_left_conf = (float*)left_disc->ptr(i);
        row_right = (short*)right_disp->ptr(i);
        row_right_conf = (float*)right_disc->ptr(i);
        row_dst = (float*)dst->ptr(i);
        int j_start = left_ROI.x;
        int j_end = left_ROI.x+left_ROI.width;
        int right_end = right_ROI.x+right_ROI.width;
        for(int j=j_start;j<j_end;j++)
        {
            right_idx = j-(row_left[j]>>4);
            if( right_idx>=right_ROI.x && right_idx<right_end)
            {
                if(abs(row_left[j] + row_right[right_idx])< thresh)
                    row_dst[j] = min(row_left_conf[j],row_right_conf[right_idx]);
                else
                    row_dst[j] = 0.0f;
            }
        }
    }
}
DisparityWLSFilterImpl::ComputeDepthDisc_ParBody::ComputeDepthDisc_ParBody(DisparityWLSFilterImpl& _wls, Mat& _disp, Mat& _disp_squares, Mat& _dst, int _nstripes):
wls(&_wls),disp(&_disp),disp_squares(&_disp_squares),dst(&_dst),nstripes(_nstripes)
{
    stripe_sz = (int)ceil(disp->rows/(double)nstripes);
}
void DisparityWLSFilterImpl::ComputeDepthDisc_ParBody::operator() (const Range& range) const
{
    float* row_disp;
    float* row_disp_squares;
    float* row_dst;
    float variance;
    int h = disp->rows;
    int w = disp->cols;
    int start = std::min(range.start * stripe_sz, h);
    int end   = std::min(range.end   * stripe_sz, h);
    float roll_off = wls->depth_discontinuity_roll_off_factor/(wls->resize_factor*wls->resize_factor);
    for(int i=start;i<end;i++)
    {
        row_disp = (float*)disp->ptr(i);
        row_disp_squares = (float*)disp_squares->ptr(i);
        row_dst = (float*)dst->ptr(i);
        for(int j=0;j<w;j++)
        {
            variance = row_disp_squares[j] - (row_disp[j])*(row_disp[j]);
            row_dst[j] = max(1.0f - roll_off*variance,0.0f);
        }
    }
}
DisparityWLSFilterImpl::ParallelMatOp_ParBody::ParallelMatOp_ParBody(DisparityWLSFilterImpl& _wls, vector<MatOp> _ops, vector<Mat*>& _src, vector<Mat*>& _dst):
wls(&_wls),ops(_ops),src(_src),dst(_dst)
{}
void DisparityWLSFilterImpl::ParallelMatOp_ParBody::operator() (const Range& range) const
{
    for(int i=range.start;i<range.end;i++)
        (wls->*ops[i])(*src[i],*dst[i]);
}
CV_EXPORTS_W
Ptr<DisparityWLSFilter> createDisparityWLSFilter(bool use_confidence)
{
    return Ptr<DisparityWLSFilter>(DisparityWLSFilterImpl::create(use_confidence));
}
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
#define UNKNOWN_DISPARITY 16320
int readGT(String src_path,OutputArray dst)
{
    Mat src = imread(src_path,IMREAD_UNCHANGED);
    dst.create(src.rows,src.cols,CV_16S);
    Mat& dstMat = dst.getMatRef();
    if(!src.empty() && src.channels()==3 && src.depth()==CV_8U)
    {
        //MPI-Sintel format:
        for(int i=0;i<src.rows;i++)
            for(int j=0;j<src.cols;j++)
            {
                Vec3b bgrPixel = src.at<Vec3b>(i, j);
                dstMat.at<short>(i,j) = 64*bgrPixel.val[2]+bgrPixel.val[1]/4; //16-multiplied disparity
            }
        return 0;
    }
    else if(!src.empty() && src.channels()==1 && src.depth()==CV_8U)
    {
        //Middlebury format:
        for(int i=0;i<src.rows;i++)
            for(int j=0;j<src.cols;j++)
            {
                short src_val = src.at<unsigned char>(i, j);
                if(src_val==0)
                    dstMat.at<short>(i,j) = UNKNOWN_DISPARITY;
                else
                    dstMat.at<short>(i,j) = 16*src_val; //16-multiplied disparity
            }
        return 0;
    }
    else
        return 1;
}
double computeMSE(InputArray GT, InputArray src, Rect ROI)
{
    CV_Assert( !GT.empty() && (GT.depth() == CV_16S) && (GT.channels() == 1) );
    CV_Assert( !src.empty() && (src.depth() == CV_16S) && (src.channels() == 1) );
    CV_Assert( src.rows() == GT.rows() && src.cols() == GT.cols() );
    double res = 0;
    Mat GT_ROI (GT.getMat(), ROI);
    Mat src_ROI(src.getMat(),ROI);
    int cnt=0;
    for(int i=0;i<src_ROI.rows;i++)
        for(int j=0;j<src_ROI.cols;j++)
        {
            if(GT_ROI.at<short>(i,j)!=UNKNOWN_DISPARITY)
            {
                res += (GT_ROI.at<short>(i,j) - src_ROI.at<short>(i,j))*(GT_ROI.at<short>(i,j) - src_ROI.at<short>(i,j));
                cnt++;
            }
        }
    res /= cnt*256;
    return res;
}
double computeBadPixelPercent(InputArray GT, InputArray src, Rect ROI, int thresh)
{
    CV_Assert( !GT.empty() && (GT.depth() == CV_16S) && (GT.channels() == 1) );
    CV_Assert( !src.empty() && (src.depth() == CV_16S) && (src.channels() == 1) );
    CV_Assert( src.rows() == GT.rows() && src.cols() == GT.cols() );
    int bad_pixel_num = 0;
    Mat GT_ROI (GT.getMat(), ROI);
    Mat src_ROI(src.getMat(),ROI);
    int cnt=0;
    for(int i=0;i<src_ROI.rows;i++)
        for(int j=0;j<src_ROI.cols;j++)
        {
            if(GT_ROI.at<short>(i,j)!=UNKNOWN_DISPARITY)
            {
                if( abs(GT_ROI.at<short>(i,j) - src_ROI.at<short>(i,j))>=thresh )
                    bad_pixel_num++;
                cnt++;
            }
        }
    return (100.0*bad_pixel_num)/cnt;
}
void getDisparityVis(InputArray src,OutputArray dst,double scale)
{
    CV_Assert( !src.empty() && (src.depth() == CV_16S) && (src.channels() == 1) );
    Mat srcMat = src.getMat();
    dst.create(srcMat.rows,srcMat.cols,CV_8UC1);
    Mat& dstMat = dst.getMatRef();
    for(int i=0;i<dstMat.rows;i++)
        for(int j=0;j<dstMat.cols;j++)
        {
            if(srcMat.at<short>(i,j)==UNKNOWN_DISPARITY)
                dstMat.at<unsigned char>(i,j) = 0;
            else
                dstMat.at<unsigned char>(i,j) = saturate_cast<unsigned char>(scale*srcMat.at<short>(i,j)/16.0);
        }
}
}
}
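A minimal usage sketch for the confidence-aware path added above (a rectified input pair is assumed; file names, parameter values and the hand-built ROI are illustrative, while the ximgproc calls are the ones introduced in this file and its header):

#include "opencv2/ximgproc/disparity_filter.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/imgcodecs.hpp"
using namespace cv;
using namespace cv::ximgproc;

int main()
{
    Mat left = imread("left.png"), right = imread("right.png"); // rectified pair (illustrative paths)
    Mat left_gray, right_gray;
    cvtColor(left, left_gray, COLOR_BGR2GRAY);
    cvtColor(right, right_gray, COLOR_BGR2GRAY);

    int max_disp = 160, wsize = 15;
    Ptr<StereoBM> left_matcher  = StereoBM::create(max_disp, wsize);
    Ptr<StereoBM> right_matcher = StereoBM::create(max_disp, wsize);
    right_matcher->setMinDisparity(-max_disp + 1); // mirror the disparity range for the right view

    Mat left_disp, right_disp, filtered_disp;
    left_matcher ->compute(left_gray,  right_gray, left_disp);
    right_matcher->compute(right_gray, left_gray,  right_disp);

    // the right disparity map drives the LRC-based confidence:
    Ptr<DisparityWLSFilter> wls_filter = createDisparityWLSFilter(true);
    wls_filter->setLambda(8000.0);
    wls_filter->setSigmaColor(1.5);
    Rect ROI(max_disp, wsize/2, left.cols - max_disp - wsize/2, left.rows - wsize); // crude valid region
    wls_filter->filter(left_disp, left, filtered_disp, ROI, right_disp);
    Mat conf = wls_filter->getConfidenceMap(); // CV_32F, values in [0,255]
    return 0;
}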

@ -0,0 +1,694 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3 - clause BSD License)
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met :
*
* *Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and / or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort(including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "precomp.hpp"
#include "opencv2/hal/intrin.hpp"
#include <vector>
namespace cv {
namespace ximgproc {
using std::vector;
typedef float WorkType;
typedef Vec<WorkType, 1> WorkVec;
typedef WorkType (*get_weight_op)(WorkType*, unsigned char*,unsigned char*);
inline WorkType get_weight_1channel(WorkType* LUT, unsigned char* p1,unsigned char* p2)
{
return LUT[ (p1[0]-p2[0])*(p1[0]-p2[0]) ];
}
inline WorkType get_weight_3channel(WorkType* LUT, unsigned char* p1,unsigned char* p2)
{
return LUT[ (p1[0]-p2[0])*(p1[0]-p2[0])+
(p1[1]-p2[1])*(p1[1]-p2[1])+
(p1[2]-p2[2])*(p1[2]-p2[2]) ];
}
class FastGlobalSmootherFilterImpl : public FastGlobalSmootherFilter
{
public:
static Ptr<FastGlobalSmootherFilterImpl> create(InputArray guide, double lambda, double sigma_color, int num_iter,double lambda_attenuation);
void filter(InputArray src, OutputArray dst);
protected:
int w,h;
int num_stripes;
float sigmaColor,lambda;
float lambda_attenuation;
int num_iter;
Mat weights_LUT;
Mat Chor, Cvert;
Mat interD;
void init(InputArray guide,double _lambda,double _sigmaColor,int _num_iter,double _lambda_attenuation);
void horizontalPass(Mat& cur);
void verticalPass(Mat& cur);
protected:
struct HorizontalPass_ParBody : public ParallelLoopBody
{
FastGlobalSmootherFilterImpl* fgs;
Mat* cur;
int nstripes, stripe_sz;
int h;
HorizontalPass_ParBody(FastGlobalSmootherFilterImpl &_fgs, Mat& _cur, int _nstripes, int _h);
void operator () (const Range& range) const;
};
inline void process_4row_block(Mat* cur,int i);
inline void process_row(Mat* cur,int i);
struct VerticalPass_ParBody : public ParallelLoopBody
{
FastGlobalSmootherFilterImpl* fgs;
Mat* cur;
int nstripes, stripe_sz;
int w;
VerticalPass_ParBody(FastGlobalSmootherFilterImpl &_fgs, Mat& _cur, int _nstripes, int _w);
void operator () (const Range& range) const;
};
template<get_weight_op get_weight, const int num_ch>
struct ComputeHorizontalWeights_ParBody : public ParallelLoopBody
{
FastGlobalSmootherFilterImpl* fgs;
Mat* guide;
int nstripes, stripe_sz;
int h;
ComputeHorizontalWeights_ParBody(FastGlobalSmootherFilterImpl &_fgs, Mat& _guide, int _nstripes, int _h);
void operator () (const Range& range) const;
};
template<get_weight_op get_weight, const int num_ch>
struct ComputeVerticalWeights_ParBody : public ParallelLoopBody
{
FastGlobalSmootherFilterImpl* fgs;
Mat* guide;
int nstripes, stripe_sz;
int w;
ComputeVerticalWeights_ParBody(FastGlobalSmootherFilterImpl &_fgs, Mat& _guide, int _nstripes, int _w);
void operator () (const Range& range) const;
};
struct ComputeLUT_ParBody : public ParallelLoopBody
{
FastGlobalSmootherFilterImpl* fgs;
WorkType* LUT;
int nstripes, stripe_sz;
int sz;
ComputeLUT_ParBody(FastGlobalSmootherFilterImpl &_fgs, WorkType* _LUT, int _nstripes, int _sz);
void operator () (const Range& range) const;
};
};
void FastGlobalSmootherFilterImpl::init(InputArray guide,double _lambda,double _sigmaColor,int _num_iter,double _lambda_attenuation)
{
CV_Assert( !guide.empty() && _lambda >= 0 && _sigmaColor >= 0 && _num_iter >=1 );
CV_Assert( guide.depth() == CV_8U && (guide.channels() == 1 || guide.channels() == 3) );
sigmaColor = (float)_sigmaColor;
lambda = (float)_lambda;
lambda_attenuation = (float)_lambda_attenuation;
num_iter = _num_iter;
num_stripes = getNumThreads();
int num_levels = 3*256*256;
weights_LUT.create(1,num_levels,WorkVec::type);
WorkType* LUT = (WorkType*)weights_LUT.ptr(0);
parallel_for_(Range(0,num_stripes),ComputeLUT_ParBody(*this,LUT,num_stripes,num_levels));
w = guide.cols();
h = guide.rows();
Chor. create(h,w,WorkVec::type);
Cvert. create(h,w,WorkVec::type);
interD.create(h,w,WorkVec::type);
Mat guideMat = guide.getMat();
if(guide.channels() == 1)
{
parallel_for_(Range(0,num_stripes),ComputeHorizontalWeights_ParBody<get_weight_1channel,1>(*this,guideMat,num_stripes,h));
parallel_for_(Range(0,num_stripes),ComputeVerticalWeights_ParBody <get_weight_1channel,1>(*this,guideMat,num_stripes,w));
}
if(guide.channels() == 3)
{
parallel_for_(Range(0,num_stripes),ComputeHorizontalWeights_ParBody<get_weight_3channel,3>(*this,guideMat,num_stripes,h));
parallel_for_(Range(0,num_stripes),ComputeVerticalWeights_ParBody <get_weight_3channel,3>(*this,guideMat,num_stripes,w));
}
}
Ptr<FastGlobalSmootherFilterImpl> FastGlobalSmootherFilterImpl::create(InputArray guide, double lambda, double sigma_color, int num_iter, double lambda_attenuation)
{
FastGlobalSmootherFilterImpl *fgs = new FastGlobalSmootherFilterImpl();
fgs->init(guide,lambda,sigma_color,num_iter,lambda_attenuation);
return Ptr<FastGlobalSmootherFilterImpl>(fgs);
}
void FastGlobalSmootherFilterImpl::filter(InputArray src, OutputArray dst)
{
CV_Assert(!src.empty() && (src.depth() == CV_8U || src.depth() == CV_16S || src.depth() == CV_32F) && src.channels()<=4);
if (src.rows() != h || src.cols() != w)
{
CV_Error(Error::StsBadSize, "Size of the filtered image must be equal to the size of the guide image");
return;
}
vector<Mat> src_channels;
vector<Mat> dst_channels;
if(src.channels()==1)
src_channels.push_back(src.getMat());
else
split(src,src_channels);
float lambda_ref = lambda;
for(int i=0;i<src.channels();i++)
{
lambda = lambda_ref;
Mat cur_res = src_channels[i].clone();
if(src.depth()!=WorkVec::type)
cur_res.convertTo(cur_res,WorkVec::type);
for(int n=0;n<num_iter;n++)
{
horizontalPass(cur_res);
verticalPass(cur_res);
lambda*=lambda_attenuation;
}
Mat dstMat;
if(src.depth()!=WorkVec::type)
cur_res.convertTo(dstMat,src.depth());
else
dstMat = cur_res;
dst_channels.push_back(dstMat);
}
lambda = lambda_ref;
dst.create(src.size(),src.type());
if(src.channels()==1)
{
Mat& dstMat = dst.getMatRef();
dstMat = dst_channels[0];
}
else
merge(dst_channels,dst);
}
void FastGlobalSmootherFilterImpl::horizontalPass(Mat& cur)
{
parallel_for_(Range(0,num_stripes),HorizontalPass_ParBody(*this,cur,num_stripes,h));
}
void FastGlobalSmootherFilterImpl::verticalPass(Mat& cur)
{
parallel_for_(Range(0,num_stripes),VerticalPass_ParBody(*this,cur,num_stripes,w));
}
FastGlobalSmootherFilterImpl::HorizontalPass_ParBody::HorizontalPass_ParBody(FastGlobalSmootherFilterImpl &_fgs, Mat& _cur, int _nstripes, int _h):
fgs(&_fgs),cur(&_cur), nstripes(_nstripes), h(_h)
{
stripe_sz = (int)ceil(h/(double)nstripes);
}
void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i)
{
WorkType denom,denom_next,denom_next2,denom_next3;
WorkType *Chor_row = (WorkType*)Chor.ptr (i);
WorkType *interD_row = (WorkType*)interD.ptr(i);
WorkType *cur_row = (WorkType*)cur->ptr (i);
WorkType *Chor_row_next = (WorkType*)Chor.ptr (i+1);
WorkType *interD_row_next = (WorkType*)interD.ptr(i+1);
WorkType *cur_row_next = (WorkType*)cur->ptr (i+1);
WorkType *Chor_row_next2 = (WorkType*)Chor.ptr (i+2);
WorkType *interD_row_next2 = (WorkType*)interD.ptr(i+2);
WorkType *cur_row_next2 = (WorkType*)cur->ptr (i+2);
WorkType *Chor_row_next3 = (WorkType*)Chor.ptr (i+3);
WorkType *interD_row_next3 = (WorkType*)interD.ptr(i+3);
WorkType *cur_row_next3 = (WorkType*)cur->ptr (i+3);
float coef_cur, coef_prev;
float coef_cur_row_next, coef_prev_row_next;
float coef_cur_row_next2,coef_prev_row_next2;
float coef_cur_row_next3,coef_prev_row_next3;
//forward pass:
coef_prev = lambda*Chor_row[0];
coef_prev_row_next = lambda*Chor_row_next[0];
coef_prev_row_next2 = lambda*Chor_row_next2[0];
coef_prev_row_next3 = lambda*Chor_row_next3[0];
interD_row[0] = coef_prev /(1-coef_prev);
interD_row_next[0] = coef_prev_row_next /(1-coef_prev_row_next);
interD_row_next2[0] = coef_prev_row_next2/(1-coef_prev_row_next2);
interD_row_next3[0] = coef_prev_row_next3/(1-coef_prev_row_next3);
cur_row[0] = cur_row[0] /(1-coef_prev);
cur_row_next[0] = cur_row_next[0] /(1-coef_prev_row_next);
cur_row_next2[0] = cur_row_next2[0]/(1-coef_prev_row_next2);
cur_row_next3[0] = cur_row_next3[0]/(1-coef_prev_row_next3);
int j=1;
#if CV_SIMD128
{
v_float32x4 coef_prev_reg(coef_prev,coef_prev_row_next,coef_prev_row_next2,coef_prev_row_next3);
v_float32x4 interD_prev_reg(interD_row[0],interD_row_next[0],interD_row_next2[0],interD_row_next3[0]);
v_float32x4 cur_prev_reg(cur_row[0],cur_row_next[0],cur_row_next2[0],cur_row_next3[0]);
v_float32x4 lambda_reg(lambda,lambda,lambda,lambda);
v_float32x4 one_reg(1.0f,1.0f,1.0f,1.0f);
v_float32x4 a0,a1,a2,a3;
v_float32x4 b0,b1,b2,b3;
v_float32x4 aux0,aux1,aux2,aux3;
#define PROC4(Chor_in,cur_in,coef_prev_in,interD_prev_in,cur_prev_in,interD_out,cur_out,coef_cur_out)\
coef_cur_out = lambda_reg*Chor_in;\
aux0 = interD_prev_in*coef_prev_in;\
aux1 = coef_cur_out+coef_prev_in;\
aux1 = one_reg-aux1;\
aux0 = aux1-aux0;\
interD_out = coef_cur_out/aux0;\
aux1 = cur_prev_in*coef_prev_in;\
aux1 = cur_in - aux1;\
cur_out = aux1/aux0;
for(;j<w-3;j+=4)
{
// processing a 4x4 block:
aux0 = v_load(Chor_row+j);
aux1 = v_load(Chor_row_next+j);
aux2 = v_load(Chor_row_next2+j);
aux3 = v_load(Chor_row_next3+j);
v_transpose4x4(aux0,aux1,aux2,aux3,a0,a1,a2,a3);
aux0 = v_load(cur_row+j);
aux1 = v_load(cur_row_next+j);
aux2 = v_load(cur_row_next2+j);
aux3 = v_load(cur_row_next3+j);
v_transpose4x4(aux0,aux1,aux2,aux3,b0,b1,b2,b3);
PROC4(a0,b0,coef_prev_reg,interD_prev_reg,cur_prev_reg,a0,b0,aux2);
PROC4(a1,b1,aux2,a0,b0,a1,b1,aux3);
PROC4(a2,b2,aux3,a1,b1,a2,b2,aux2);
PROC4(a3,b3,aux2,a2,b2,a3,b3,aux3);
interD_prev_reg = a3;
cur_prev_reg = b3;
coef_prev_reg = aux3;
v_transpose4x4(a0,a1,a2,a3,aux0,aux1,aux2,aux3);
v_store(interD_row+j,aux0);
v_store(interD_row_next+j,aux1);
v_store(interD_row_next2+j,aux2);
v_store(interD_row_next3+j,aux3);
v_transpose4x4(b0,b1,b2,b3,aux0,aux1,aux2,aux3);
v_store(cur_row+j,aux0);
v_store(cur_row_next+j,aux1);
v_store(cur_row_next2+j,aux2);
v_store(cur_row_next3+j,aux3);
}
#undef PROC4
}
#endif
for(;j<w;j++)
{
coef_prev = lambda*Chor_row[j-1];
coef_prev_row_next = lambda*Chor_row_next[j-1];
coef_prev_row_next2 = lambda*Chor_row_next2[j-1];
coef_prev_row_next3 = lambda*Chor_row_next3[j-1];
coef_cur = lambda*Chor_row[j];
coef_cur_row_next = lambda*Chor_row_next[j];
coef_cur_row_next2 = lambda*Chor_row_next2[j];
coef_cur_row_next3 = lambda*Chor_row_next3[j];
denom = (1-coef_prev -coef_cur) -interD_row[j-1] *coef_prev;
denom_next = (1-coef_prev_row_next -coef_cur_row_next) -interD_row_next[j-1] *coef_prev_row_next;
denom_next2 = (1-coef_prev_row_next2-coef_cur_row_next2)-interD_row_next2[j-1]*coef_prev_row_next2;
denom_next3 = (1-coef_prev_row_next3-coef_cur_row_next3)-interD_row_next3[j-1]*coef_prev_row_next3;
interD_row[j] = coef_cur /denom;
interD_row_next[j] = coef_cur_row_next /denom_next;
interD_row_next2[j] = coef_cur_row_next2/denom_next2;
interD_row_next3[j] = coef_cur_row_next3/denom_next3;
cur_row[j] = (cur_row[j] -cur_row[j-1] *coef_prev) /denom;
cur_row_next[j] = (cur_row_next[j] -cur_row_next[j-1] *coef_prev_row_next) /denom_next;
cur_row_next2[j] = (cur_row_next2[j]-cur_row_next2[j-1]*coef_prev_row_next2)/denom_next2;
cur_row_next3[j] = (cur_row_next3[j]-cur_row_next3[j-1]*coef_prev_row_next3)/denom_next3;
}
//backward pass:
j = w-2;
#if CV_SIMD128
{
v_float32x4 cur_next_reg(cur_row[w-1],cur_row_next[w-1],cur_row_next2[w-1],cur_row_next3[w-1]);
v_float32x4 a0,a1,a2,a3;
v_float32x4 b0,b1,b2,b3;
v_float32x4 aux0,aux1,aux2,aux3;
for(j-=3;j>=0;j-=4)
{
//process 4x4 block:
aux0 = v_load(interD_row+j);
aux1 = v_load(interD_row_next+j);
aux2 = v_load(interD_row_next2+j);
aux3 = v_load(interD_row_next3+j);
v_transpose4x4(aux0,aux1,aux2,aux3,a0,a1,a2,a3);
aux0 = v_load(cur_row+j);
aux1 = v_load(cur_row_next+j);
aux2 = v_load(cur_row_next2+j);
aux3 = v_load(cur_row_next3+j);
v_transpose4x4(aux0,aux1,aux2,aux3,b0,b1,b2,b3);
aux0 = a3*cur_next_reg;
b3 = b3-aux0;
aux0 = a2*b3;
b2 = b2-aux0;
aux0 = a1*b2;
b1 = b1-aux0;
aux0 = a0*b1;
b0 = b0-aux0;
cur_next_reg = b0;
v_transpose4x4(b0,b1,b2,b3,aux0,aux1,aux2,aux3);
v_store(cur_row+j,aux0);
v_store(cur_row_next+j,aux1);
v_store(cur_row_next2+j,aux2);
v_store(cur_row_next3+j,aux3);
}
j+=3;
}
#endif
for(;j>=0;j--)
{
cur_row[j] = cur_row[j] -interD_row[j] *cur_row[j+1];
cur_row_next[j] = cur_row_next[j] -interD_row_next[j] *cur_row_next[j+1];
cur_row_next2[j] = cur_row_next2[j]-interD_row_next2[j]*cur_row_next2[j+1];
cur_row_next3[j] = cur_row_next3[j]-interD_row_next3[j]*cur_row_next3[j+1];
}
}
void FastGlobalSmootherFilterImpl::process_row(Mat* cur,int i)
{
WorkType denom;
WorkType *Chor_row = (WorkType*)Chor.ptr(i);
WorkType *interD_row = (WorkType*)interD.ptr(i);
WorkType *cur_row = (WorkType*)cur->ptr(i);
float coef_cur,coef_prev;
//forward pass:
coef_prev = lambda*Chor_row[0];
interD_row[0] = coef_prev/(1-coef_prev);
cur_row[0] = cur_row[0]/(1-coef_prev);
for(int j=1;j<w;j++)
{
coef_cur = lambda*Chor_row[j];
denom = (1-coef_prev-coef_cur)-interD_row[j-1]*coef_prev;
interD_row[j] = coef_cur/denom;
cur_row[j] = (cur_row[j]-cur_row[j-1]*coef_prev)/denom;
coef_prev = coef_cur;
}
//backward pass:
for(int j=w-2;j>=0;j--)
cur_row[j] = cur_row[j]-interD_row[j]*cur_row[j+1];
}
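// Both passes implement the Thomas algorithm for the per-row tridiagonal system
// (I + lambda*A)u = f produced by the 1D smoothing step: the forward loop computes
// the modified upper diagonal (interD) and right-hand side (cur) in place, and the
// backward loop back-substitutes. The off-diagonal terms are the negated guide
// affinities in Chor, so each pass is O(w) per row; process_4row_block above runs
// the same recurrence interleaved over four rows so the v_float32x4 SIMD lanes stay full.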
void FastGlobalSmootherFilterImpl::HorizontalPass_ParBody::operator()(const Range& range) const
{
int start = std::min(range.start * stripe_sz, h);
int end = std::min(range.end * stripe_sz, h);
int i=start;
for(;i<end-3;i+=4)
fgs->process_4row_block(cur,i);
for(;i<end;i++)
fgs->process_row(cur,i);
}
FastGlobalSmootherFilterImpl::VerticalPass_ParBody::VerticalPass_ParBody(FastGlobalSmootherFilterImpl &_fgs, Mat& _cur, int _nstripes, int _w):
fgs(&_fgs),cur(&_cur), nstripes(_nstripes), w(_w)
{
stripe_sz = (int)ceil(w/(double)nstripes);
}
void FastGlobalSmootherFilterImpl::VerticalPass_ParBody::operator()(const Range& range) const
{
int start = std::min(range.start * stripe_sz, w);
int end = std::min(range.end * stripe_sz, w);
//float lambda = fgs->lambda;
WorkType denom;
WorkType *Cvert_row, *Cvert_row_prev;
WorkType *interD_row, *interD_row_prev, *cur_row, *cur_row_prev, *cur_row_next;
float coef_cur,coef_prev;
Cvert_row = (WorkType*)fgs->Cvert.ptr(0);
interD_row = (WorkType*)fgs->interD.ptr(0);
cur_row = (WorkType*)cur->ptr(0);
//forward pass:
for(int j=start;j<end;j++)
{
coef_cur = fgs->lambda*Cvert_row[j];
interD_row[j] = coef_cur/(1-coef_cur);
cur_row[j] = cur_row[j]/(1-coef_cur);
}
for(int i=1;i<fgs->h;i++)
{
Cvert_row = (WorkType*)fgs->Cvert.ptr(i);
Cvert_row_prev = (WorkType*)fgs->Cvert.ptr(i-1);
interD_row = (WorkType*)fgs->interD.ptr(i);
interD_row_prev = (WorkType*)fgs->interD.ptr(i-1);
cur_row = (WorkType*)cur->ptr(i);
cur_row_prev = (WorkType*)cur->ptr(i-1);
int j = start;
#if CV_SIMD128
v_float32x4 a,b,c,d,coef_cur_reg,coef_prev_reg;
v_float32x4 one_reg(1.0f,1.0f,1.0f,1.0f);
v_float32x4 lambda_reg(fgs->lambda,fgs->lambda,fgs->lambda,fgs->lambda);
int sz4 = 4*((end-start)/4);
int end4 = start+sz4;
for(;j<end4;j+=4)
{
a = v_load(Cvert_row_prev+j);
b = v_load(Cvert_row+j);
coef_prev_reg = lambda_reg*a;
coef_cur_reg = lambda_reg*b;
a = v_load(interD_row_prev+j);
a = a*coef_prev_reg;
b = coef_prev_reg+coef_cur_reg;
b = b+a;
a = one_reg-b; //computed denom
b = coef_cur_reg/a; //computed interD_row
c = v_load(cur_row_prev+j);
c = c*coef_prev_reg;
d = v_load(cur_row+j);
d = d-c;
d = d/a; //computed cur_row
v_store(interD_row+j,b);
v_store(cur_row+j,d);
}
#endif
for(;j<end;j++)
{
coef_prev = fgs->lambda*Cvert_row_prev[j];
coef_cur = fgs->lambda*Cvert_row[j];
denom = (1-coef_prev-coef_cur)-interD_row_prev[j]*coef_prev;
interD_row[j] = coef_cur/denom;
cur_row[j] = (cur_row[j]-cur_row_prev[j]*coef_prev)/denom;
}
}
//backward pass:
for(int i=fgs->h-2;i>=0;i--)
{
interD_row = (WorkType*)fgs->interD.ptr(i);
cur_row = (WorkType*)cur->ptr(i);
cur_row_next = (WorkType*)cur->ptr(i+1);
int j = start;
#if CV_SIMD128
v_float32x4 a,b;
int sz4 = 4*((end-start)/4);
int end4 = start+sz4;
for(;j<end4;j+=4)
{
a = v_load(interD_row+j);
b = v_load(cur_row_next+j);
b = a*b;
a = v_load(cur_row+j);
b = a-b;
v_store(cur_row+j,b);
}
#endif
for(;j<end;j++)
cur_row[j] = cur_row[j]-interD_row[j]*cur_row_next[j];
}
}
template<get_weight_op get_weight, const int num_ch>
FastGlobalSmootherFilterImpl::ComputeHorizontalWeights_ParBody<get_weight,num_ch>::ComputeHorizontalWeights_ParBody(FastGlobalSmootherFilterImpl &_fgs, Mat& _guide, int _nstripes, int _h):
fgs(&_fgs),guide(&_guide), nstripes(_nstripes), h(_h)
{
stripe_sz = (int)ceil(h/(double)nstripes);
}
template<get_weight_op get_weight, const int num_ch>
void FastGlobalSmootherFilterImpl::ComputeHorizontalWeights_ParBody<get_weight,num_ch>::operator()(const Range& range) const
{
int start = std::min(range.start * stripe_sz, h);
int end = std::min(range.end * stripe_sz, h);
WorkType* LUT = (WorkType*)fgs->weights_LUT.ptr(0);
unsigned char *row;
WorkType *Chor_row;
for(int i=start;i<end;i++)
{
row = guide->ptr(i);
Chor_row = (WorkType*)fgs->Chor.ptr(i);
Chor_row[0] = get_weight(LUT,row,row+num_ch);
row+=num_ch;
for(int j=1;j<fgs->w-1;j++)
{
Chor_row[j] = get_weight(LUT,row,row+num_ch);
row+=num_ch;
}
Chor_row[fgs->w-1]=0;
}
}
template<get_weight_op get_weight, const int num_ch>
FastGlobalSmootherFilterImpl::ComputeVerticalWeights_ParBody<get_weight,num_ch>::ComputeVerticalWeights_ParBody(FastGlobalSmootherFilterImpl &_fgs, Mat& _guide, int _nstripes, int _w):
fgs(&_fgs),guide(&_guide), nstripes(_nstripes), w(_w)
{
stripe_sz = (int)ceil(w/(double)nstripes);
}
template<get_weight_op get_weight, const int num_ch>
void FastGlobalSmootherFilterImpl::ComputeVerticalWeights_ParBody<get_weight,num_ch>::operator()(const Range& range) const
{
int start = std::min(range.start * stripe_sz, w);
int end = std::min(range.end * stripe_sz, w);
WorkType* LUT = (WorkType*)fgs->weights_LUT.ptr(0);
unsigned char *row,*row_next;
WorkType *Cvert_row;
Cvert_row = (WorkType*)fgs->Cvert.ptr(0);
row = guide->ptr(0)+start*num_ch;
row_next = guide->ptr(1)+start*num_ch;
for(int j=start;j<end;j++)
{
Cvert_row[j] = get_weight(LUT,row,row_next);
row+=num_ch;
row_next+=num_ch;
}
for(int i=1;i<fgs->h-1;i++)
{
row = guide->ptr(i)+start*num_ch;
row_next = guide->ptr(i+1)+start*num_ch;
Cvert_row = (WorkType*)fgs->Cvert.ptr(i);
for(int j=start;j<end;j++)
{
Cvert_row[j] = get_weight(LUT,row,row_next);
row+=num_ch;
row_next+=num_ch;
}
}
Cvert_row = (WorkType*)fgs->Cvert.ptr(fgs->h-1);
for(int j=start;j<end;j++)
Cvert_row[j] = 0;
}
FastGlobalSmootherFilterImpl::ComputeLUT_ParBody::ComputeLUT_ParBody(FastGlobalSmootherFilterImpl &_fgs, WorkType *_LUT, int _nstripes, int _sz):
fgs(&_fgs), LUT(_LUT), nstripes(_nstripes), sz(_sz)
{
stripe_sz = (int)ceil(sz/(double)nstripes);
}
void FastGlobalSmootherFilterImpl::ComputeLUT_ParBody::operator()(const Range& range) const
{
int start = std::min(range.start * stripe_sz, sz);
int end = std::min(range.end * stripe_sz, sz);
for(int i=start;i<end;i++)
LUT[i] = (WorkType)(-cv::exp(-sqrt((float)i)/fgs->sigmaColor));
}
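// The LUT maps the squared color distance between two neighboring guide pixels
// (the index computed by get_weight_1channel/get_weight_3channel) to the affinity
// -exp(-sqrt(dist2)/sigmaColor); the negation bakes the sign of the off-diagonal
// matrix entries directly into the stored weights.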
////////////////////////////////////////////////////////////////////////////////////////////////
CV_EXPORTS_W
Ptr<FastGlobalSmootherFilter> createFastGlobalSmootherFilter(InputArray guide, double lambda, double sigma_color, double lambda_attenuation, int num_iter)
{
return Ptr<FastGlobalSmootherFilter>(FastGlobalSmootherFilterImpl::create(guide, lambda, sigma_color, num_iter, lambda_attenuation));
}
CV_EXPORTS_W
void fastGlobalSmootherFilter(InputArray guide, InputArray src, OutputArray dst, double lambda, double sigma_color, double lambda_attenuation, int num_iter)
{
Ptr<FastGlobalSmootherFilter> fgs = createFastGlobalSmootherFilter(guide, lambda, sigma_color, lambda_attenuation, num_iter);
fgs->filter(src, dst);
}
}
}
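The exported wrappers above can also be used standalone, outside the disparity pipeline. A short sketch (file names illustrative; the five-argument call relies on default values for lambda_attenuation and num_iter, just as the disparity filter's own call to fastGlobalSmootherFilter does, and assumes the declarations live in edge_filter.hpp, which this PR also touches):

#include "opencv2/ximgproc/edge_filter.hpp"
#include "opencv2/imgcodecs.hpp"
using namespace cv;
using namespace cv::ximgproc;

int main()
{
    Mat guide = imread("guide.png");   // 8-bit, 1 or 3 channels
    Mat src   = imread("noisy.png");   // image to be smoothed, same size as the guide
    Mat dst;
    // lambda controls smoothing strength, sigma_color the edge sensitivity
    fastGlobalSmootherFilter(guide, src, dst, 8000.0, 1.5);
    imwrite("smoothed.png", dst);
    return 0;
}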

@ -273,8 +273,8 @@ void SuperpixelSEEDSImpl::initialize(int num_superpixels, int num_levels)
parent_pre_init.resize(seeds_nr_levels);
nr_wh.resize(2 * seeds_nr_levels);
int level = 0;
int nr_seeds_w = (width / seeds_w);   // was: (int)floor(width / seeds_w) -- floor is redundant for integer division
int nr_seeds_h = (height / seeds_h);  // was: (int)floor(height / seeds_h)
nr_wh[2 * level] = nr_seeds_w;
nr_wh[2 * level + 1] = nr_seeds_h;
parent_mat.push_back(Mat(nr_seeds_h, nr_seeds_w, CV_32SC1));

@ -188,8 +188,8 @@ static void gradientHist(const cv::Mat &src, cv::Mat &magnitude, cv::Mat &histog
magnitude.create( src.size(), cv::DataType<float>::type );
phase.create( src.size(), cv::DataType<float>::type );
histogram.create( cv::Size( cvCeil(src.size().width/float(pSize)),
                            cvCeil(src.size().height/float(pSize)) ), // was: cvRound -- ceil guarantees every patch is covered
CV_MAKETYPE(cv::DataType<float>::type, nBins) );
histogram.setTo(0);

@ -1,284 +0,0 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3 - clause BSD License)
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met :
*
* *Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and / or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort(including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "precomp.hpp"
namespace cv {
namespace ximgproc {
class WeightedLeastSquaresFilterImpl : public WeightedLeastSquaresFilter
{
public:
static Ptr<WeightedLeastSquaresFilterImpl> create(InputArray guide, double lambda, double sigma_color, int num_iter);
void filter(InputArray src, OutputArray dst);
~WeightedLeastSquaresFilterImpl();
protected:
int w,h;
double sigmaColor,lambda;
int num_iter;
double *weights_LUT;
double *Ahor, *Bhor, *Chor,
*Avert, *Bvert, *Cvert;
double *interD,*interE,*cur_res;
void init(InputArray guide,double _lambda,double _sigmaColor,int _num_iter);
void buildCoefMatrices(Mat& guide);
void horizontalPass(double* cur);
void verticalPass(double* cur);
};
void WeightedLeastSquaresFilterImpl::init(InputArray guide,double _lambda,double _sigmaColor,int _num_iter)
{
//currently support only 3 channel 8bit images as guides
CV_Assert( !guide.empty() && _lambda >= 0 && _sigmaColor >= 0 && _num_iter >=1 );
CV_Assert( guide.depth() == CV_8U && guide.channels() == 3 );
sigmaColor = _sigmaColor;
lambda = _lambda;
num_iter = _num_iter;
int num_levels = 3*256*256;
weights_LUT = new double[num_levels];
for(int i=0;i<num_levels;i++)
weights_LUT[i] = exp(-sqrt((double)i)/sigmaColor);
w = guide.cols();
h = guide.rows();
int sz = w*h;
Ahor = new double[sz];Bhor = new double[sz];Chor = new double[sz];
Avert = new double[sz];Bvert = new double[sz];Cvert = new double[sz];
interD = new double[sz];interE = new double[sz];cur_res = new double[sz];
Mat guideMat = guide.getMat();
buildCoefMatrices(guideMat);
}
Ptr<WeightedLeastSquaresFilterImpl> WeightedLeastSquaresFilterImpl::create(InputArray guide, double lambda, double sigma_color, int num_iter)
{
WeightedLeastSquaresFilterImpl *wls = new WeightedLeastSquaresFilterImpl();
wls->init(guide,lambda,sigma_color,num_iter);
return Ptr<WeightedLeastSquaresFilterImpl>(wls);
}
WeightedLeastSquaresFilter::~WeightedLeastSquaresFilter(){}
WeightedLeastSquaresFilterImpl::~WeightedLeastSquaresFilterImpl()
{
delete[] weights_LUT;
delete[] Ahor; delete[] Bhor; delete[] Chor;
delete[] Avert; delete[] Bvert; delete[] Cvert;
delete[] interD;delete[] interE;delete[] cur_res;
}
void WeightedLeastSquaresFilterImpl::buildCoefMatrices(Mat& guide)
{
double hor_weight;
const unsigned char *row,*row_prev,*row_next;
for(int i=0;i<h;i++)
{
//compute horizontal coefs:
row = guide.ptr(i);
Ahor[i*w] = 0;
hor_weight = weights_LUT[ (row[0]-row[3])*(row[0]-row[3])+
(row[1]-row[4])*(row[1]-row[4])+
(row[2]-row[5])*(row[2]-row[5]) ];
Chor[i*w] = -lambda*hor_weight;
Bhor[i*w] = 1 - Ahor[i*w] - Chor[i*w];
row+=3;
for(int j=1;j<w-1;j++)
{
Ahor[i*w+j] = -lambda*hor_weight;
hor_weight = weights_LUT[ (row[0]-row[3])*(row[0]-row[3])+
(row[1]-row[4])*(row[1]-row[4])+
(row[2]-row[5])*(row[2]-row[5]) ];
Chor[i*w+j] = -lambda*hor_weight;
Bhor[i*w+j] = 1 - Ahor[i*w+j] - Chor[i*w+j];
row+=3;
}
Ahor[i*w+w-1] = -lambda*hor_weight;
Chor[i*w+w-1] = 0;
Bhor[i*w+w-1] = 1 - Ahor[i*w+w-1] - Chor[i*w+w-1];
//compute vertical coefs:
row = guide.ptr(i);
if(i==0)
{
row_next = guide.ptr(i+1);
for(int j=0;j<w;j++)
{
Avert[i*w+j] = 0;
Cvert[i*w+j] = -lambda*weights_LUT[ (row[0]-row_next[0])*(row[0]-row_next[0])+
(row[1]-row_next[1])*(row[1]-row_next[1])+
(row[2]-row_next[2])*(row[2]-row_next[2]) ];
Bvert[i*w+j] = 1 - Avert[i*w+j] - Cvert[i*w+j];
row+=3;
row_next+=3;
}
}
else if(i==h-1)
{
row_prev = guide.ptr(i-1);
for(int j=0;j<w;j++)
{
Avert[i*w+j] = -lambda*weights_LUT[ (row[0]-row_prev[0])*(row[0]-row_prev[0])+
(row[1]-row_prev[1])*(row[1]-row_prev[1])+
(row[2]-row_prev[2])*(row[2]-row_prev[2]) ];
Cvert[i*w+j] = 0;
Bvert[i*w+j] = 1 - Avert[i*w+j] - Cvert[i*w+j];
row+=3;
row_prev+=3;
}
}
else
{
row_prev = guide.ptr(i-1);
row_next = guide.ptr(i+1);
for(int j=0;j<w;j++)
{
Avert[i*w+j] = -lambda*weights_LUT[ (row[0]-row_prev[0])*(row[0]-row_prev[0])+
(row[1]-row_prev[1])*(row[1]-row_prev[1])+
(row[2]-row_prev[2])*(row[2]-row_prev[2]) ];
Cvert[i*w+j] = -lambda*weights_LUT[ (row[0]-row_next[0])*(row[0]-row_next[0])+
(row[1]-row_next[1])*(row[1]-row_next[1])+
(row[2]-row_next[2])*(row[2]-row_next[2]) ];
Bvert[i*w+j] = 1 - Avert[i*w+j] - Cvert[i*w+j];
row+=3;
row_prev+=3;
row_next+=3;
}
}
}
}
void WeightedLeastSquaresFilterImpl::filter(InputArray src, OutputArray dst)
{
//temporarily support only one-channel CV_16S src type (for disparity map filtering)
CV_Assert(!src.empty() && (src.depth() == CV_16S) && src.channels()==1);
if (src.rows() != h || src.cols() != w)
{
CV_Error(Error::StsBadSize, "Size of filtering image must be equal to size of guide image");
return;
}
Mat srcMat = src.getMat();
Mat& dstMat = dst.getMatRef();
short* row;
for(int i=0;i<h;i++)
{
row = (short*)srcMat.ptr(i);
for(int j=0;j<w;j++)
{
cur_res[i*w+j] = (double)(*row);
row++;
}
}
for(int n=0;n<num_iter;n++)
{
horizontalPass(cur_res);
verticalPass(cur_res);
}
for(int i=0;i<h;i++)
{
row = (short*)dstMat.ptr(i);
for(int j=0;j<w;j++)
{
*row = saturate_cast<short>(cur_res[i*w+j]);
row++;
}
}
}
void WeightedLeastSquaresFilterImpl::horizontalPass(double* cur)
{
double denom;
for(int i=0;i<h;i++)
{
//forward pass:
interD[i*w] = Chor[i*w]/Bhor[i*w];
interE[i*w] = cur[i*w] /Bhor[i*w];
for(int j=1;j<w;j++)
{
denom = Bhor[i*w+j]-interD[i*w+j-1]*Ahor[i*w+j];
interD[i*w+j] = Chor[i*w+j]/denom;
interE[i*w+j] = (cur[i*w+j]-interE[i*w+j-1]*Ahor[i*w+j])/denom;
}
//backward pass:
cur[i*w+w-1] = interE[i*w+w-1];
for(int j=w-2;j>=0;j--)
cur[i*w+j] = interE[i*w+j]-interD[i*w+j]*cur[i*w+j+1];
}
}
void WeightedLeastSquaresFilterImpl::verticalPass(double* cur)
{
double denom;
//forward pass:
for(int j=0;j<w;j++)
{
interD[j] = Cvert[j]/Bvert[j];
interE[j] = cur[j]/Bvert[j];
}
for(int i=1;i<h;i++)
{
for(int j=0;j<w;j++)
{
denom = Bvert[i*w+j]-interD[(i-1)*w+j]*Avert[i*w+j];
interD[i*w+j] = Cvert[i*w+j]/denom;
interE[i*w+j] = (cur[i*w+j]-interE[(i-1)*w+j]*Avert[i*w+j])/denom;
}
}
//backward pass:
for(int j=0;j<w;j++)
cur[(h-1)*w+j] = interE[(h-1)*w+j];
for(int i=h-2;i>=0;i--)
for(int j=w-1;j>=0;j--)
cur[i*w+j] = interE[i*w+j]-interD[i*w+j]*cur[(i+1)*w+j];
}
////////////////////////////////////////////////////////////////////////////////////////////////
CV_EXPORTS_W
Ptr<WeightedLeastSquaresFilter> createWeightedLeastSquaresFilter(InputArray guide, double lambda, double sigma_color, int num_iter)
{
return Ptr<WeightedLeastSquaresFilter>(WeightedLeastSquaresFilterImpl::create(guide, lambda, sigma_color, num_iter));
}
CV_EXPORTS_W
void weightedLeastSquaresFilter(InputArray guide, InputArray src, OutputArray dst, double lambda, double sigma_color, int num_iter)
{
Ptr<WeightedLeastSquaresFilter> wls = createWeightedLeastSquaresFilter(guide, lambda, sigma_color, num_iter);
wls->filter(src, dst);
}
}
}

@ -0,0 +1,154 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3 - clause BSD License)
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met :
*
* *Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and / or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort(including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "test_precomp.hpp"
#include "opencv2/ximgproc/disparity_filter.hpp"
namespace cvtest
{
using namespace std;
using namespace std::tr1;
using namespace testing;
using namespace perf;
using namespace cv;
using namespace cv::ximgproc;
static string getDataDir()
{
return cvtest::TS::ptr()->get_data_path();
}
CV_ENUM(SrcTypes, CV_16S);
CV_ENUM(GuideTypes, CV_8UC1, CV_8UC3)
typedef tuple<Size, SrcTypes, GuideTypes, bool, bool> DisparityWLSParams;
typedef TestWithParam<DisparityWLSParams> DisparityWLSFilterTest;
TEST(DisparityWLSFilterTest, ReferenceAccuracy)
{
string dir = getDataDir() + "cv/disparityfilter";
Mat left = imread(dir + "/left_view.png",IMREAD_COLOR);
ASSERT_FALSE(left.empty());
Mat left_disp = imread(dir + "/disparity_left_raw.png",IMREAD_GRAYSCALE);
ASSERT_FALSE(left_disp.empty());
left_disp.convertTo(left_disp,CV_16S,16);
Mat right_disp = imread(dir + "/disparity_right_raw.png",IMREAD_GRAYSCALE);
ASSERT_FALSE(right_disp.empty());
right_disp.convertTo(right_disp,CV_16S,-16);
Mat GT;
ASSERT_FALSE(readGT(dir + "/GT.png",GT));
FileStorage ROI_storage( dir + "/ROI.xml", FileStorage::READ );
Rect ROI((int)ROI_storage["x"],(int)ROI_storage["y"],(int)ROI_storage["width"],(int)ROI_storage["height"]);
FileStorage reference_res( dir + "/reference_accuracy.xml", FileStorage::READ );
double ref_MSE = (double)reference_res["MSE_after"];
double ref_BadPercent = (double)reference_res["BadPercent_after"];
cv::setNumThreads(cv::getNumberOfCPUs());
Mat res;
Ptr<DisparityWLSFilter> wls_filter = createDisparityWLSFilter(true);
wls_filter->setLambda(8000.0);
wls_filter->setSigmaColor(0.5);
wls_filter->filter(left_disp,left,res,ROI,right_disp);
double MSE = computeMSE(GT,res,ROI);
double BadPercent = computeBadPixelPercent(GT,res,ROI);
double eps = 0.01;
EXPECT_LE(MSE,ref_MSE+eps*ref_MSE);
EXPECT_LE(BadPercent,ref_BadPercent+eps*ref_BadPercent);
}
TEST_P(DisparityWLSFilterTest, MultiThreadReproducibility)
{
if (cv::getNumberOfCPUs() == 1)
return;
double MAX_DIF = 1.0;
double MAX_MEAN_DIF = 1.0 / 256.0;
int loopsCount = 2;
RNG rng(0);
DisparityWLSParams params = GetParam();
Size size = get<0>(params);
int srcType = get<1>(params);
int guideType = get<2>(params);
bool use_conf = get<3>(params);
bool use_downscale = get<4>(params);
Mat left(size, guideType);
randu(left, 0, 255);
Mat left_disp(size,srcType);
int max_disp = (int)(size.width*0.1);
randu(left_disp, 0, max_disp-1);
Mat right_disp(size,srcType);
randu(right_disp, -max_disp+1, 0); // right-view disparities are negative by convention
Rect ROI(max_disp,0,size.width-max_disp,size.height);
if(use_downscale)
{
resize(left_disp,left_disp,Size(),0.5,0.5);
resize(right_disp,right_disp,Size(),0.5,0.5);
ROI = Rect(ROI.x/2,ROI.y/2,ROI.width/2,ROI.height/2);
}
for (int iter = 0; iter <= loopsCount; iter++)
{
double lambda = rng.uniform(100.0, 10000.0);
double sigma = rng.uniform(1.0, 100.0);
Ptr<DisparityWLSFilter> wls_filter = createDisparityWLSFilter(use_conf);
wls_filter->setLambda(lambda);
wls_filter->setSigmaColor(sigma);
cv::setNumThreads(cv::getNumberOfCPUs());
Mat resMultiThread;
wls_filter->filter(left_disp,left,resMultiThread,ROI,right_disp);
cv::setNumThreads(1);
Mat resSingleThread;
wls_filter->filter(left_disp,left,resSingleThread,ROI,right_disp);
EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_INF), MAX_DIF);
EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_L1), MAX_MEAN_DIF*left.total());
}
}
INSTANTIATE_TEST_CASE_P(FullSet,DisparityWLSFilterTest,Combine(Values(szODD, szQVGA), SrcTypes::all(), GuideTypes::all(),Values(true,false),Values(true,false)));
}
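For context, a minimal usage sketch of the filter these tests exercise, mirroring the calls in ReferenceAccuracy above; the input Mats (left view and raw left/right disparity maps) and the ROI are assumed to be prepared beforehand:

Ptr<DisparityWLSFilter> wls_filter = createDisparityWLSFilter(true /*use confidence*/);
wls_filter->setLambda(8000.0);  // regularization strength
wls_filter->setSigmaColor(0.5); // sensitivity to edges in the guide image
Mat filtered_disp;
wls_filter->filter(left_disp, left_view, filtered_disp, ROI, right_disp);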

@ -0,0 +1,153 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "test_precomp.hpp"
namespace cvtest
{
using namespace std;
using namespace std::tr1;
using namespace testing;
using namespace perf;
using namespace cv;
using namespace cv::ximgproc;
static string getDataDir()
{
return cvtest::TS::ptr()->get_data_path();
}
CV_ENUM(SrcTypes, CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4);
CV_ENUM(GuideTypes, CV_8UC1, CV_8UC3)
typedef tuple<Size, SrcTypes, GuideTypes> FGSParams;
typedef TestWithParam<FGSParams> FastGlobalSmootherTest;
TEST(FastGlobalSmootherTest, SplatSurfaceAccuracy)
{
RNG rnd(0);
for (int i = 0; i < 10; i++)
{
Size sz(rnd.uniform(512, 1024), rnd.uniform(512, 1024));
int guideCn = rnd.uniform(1, 3); // RNG::uniform(a,b) samples the half-open range [a,b), so this gives 1 or 2
if(guideCn==2) guideCn++; //1 or 3 channels
Mat guide(sz, CV_MAKE_TYPE(CV_8U, guideCn));
randu(guide, 0, 255);
Scalar surfaceValue;
int srcCn = rnd.uniform(1, 5); //1 to 4 channels (half-open range)
rnd.fill(surfaceValue, RNG::UNIFORM, 0, 255);
Mat src(sz, CV_MAKE_TYPE(CV_16S, srcCn), surfaceValue);
double lambda = rnd.uniform(100.0, 10000.0);
double sigma = rnd.uniform(1.0, 100.0);
Mat res;
fastGlobalSmootherFilter(guide, src, res, lambda, sigma);
// When filtering a constant image we should get the same image:
double normL1 = cvtest::norm(src, res, NORM_L1)/src.total()/src.channels();
EXPECT_LE(normL1, 1.0/64);
}
}
TEST(FastGlobalSmootherTest, ReferenceAccuracy)
{
string dir = getDataDir() + "cv/edgefilter";
Mat src = imread(dir + "/kodim23.png");
Mat ref = imread(dir + "/fgs/kodim23_lambda=1000_sigma=10.png");
ASSERT_FALSE(src.empty());
ASSERT_FALSE(ref.empty());
cv::setNumThreads(cv::getNumberOfCPUs());
Mat res;
fastGlobalSmootherFilter(src,src,res,1000.0,10.0);
double totalMaxError = 1.0/64.0*src.total()*src.channels();
EXPECT_LE(cvtest::norm(res, ref, NORM_L2), totalMaxError);
EXPECT_LE(cvtest::norm(res, ref, NORM_INF), 1);
}
TEST_P(FastGlobalSmootherTest, MultiThreadReproducibility)
{
if (cv::getNumberOfCPUs() == 1)
return;
double MAX_DIF = 1.0;
double MAX_MEAN_DIF = 1.0 / 64.0;
int loopsCount = 2;
RNG rng(0);
FGSParams params = GetParam();
Size size = get<0>(params);
int srcType = get<1>(params);
int guideType = get<2>(params);
Mat guide(size, guideType);
randu(guide, 0, 255);
Mat src(size,srcType);
if(src.depth()==CV_8U)
randu(src, 0, 255);
else if(src.depth()==CV_16S)
randu(src, -32767, 32767);
else
randu(src, -100000.0f, 100000.0f);
for (int iter = 0; iter <= loopsCount; iter++)
{
double lambda = rng.uniform(100.0, 10000.0);
double sigma = rng.uniform(1.0, 100.0);
cv::setNumThreads(cv::getNumberOfCPUs());
Mat resMultiThread;
fastGlobalSmootherFilter(guide, src, resMultiThread, lambda, sigma);
cv::setNumThreads(1);
Mat resSingleThread;
fastGlobalSmootherFilter(guide, src, resSingleThread, lambda, sigma);
EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_INF), MAX_DIF);
EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_L1), MAX_MEAN_DIF*src.total()*src.channels());
}
}
INSTANTIATE_TEST_CASE_P(FullSet, FastGlobalSmootherTest,Combine(Values(szODD, szQVGA), SrcTypes::all(), GuideTypes::all()));
}

Binary file not shown (new image file, 182 KiB).

@ -0,0 +1,7 @@
<?xml version="1.0"?>
<opencv_storage>
<x>166</x>
<y>7</y>
<width>851</width>
<height>422</height>
</opencv_storage>

Binary file not shown (new image file, 57 KiB).

Binary file not shown (new image file, 47 KiB).

Binary file not shown (new image file, 283 KiB).

@ -0,0 +1,7 @@
<?xml version="1.0"?>
<opencv_storage>
<MSE_before>5043.6495</MSE_before>
<MSE_after>128.3773</MSE_after>
<BadPercent_before>48.5417</BadPercent_before>
<BadPercent_after>45.8749</BadPercent_after>
</opencv_storage>

Binary file not shown (new image file, 287 KiB).

@ -47,6 +47,6 @@
*/
#include "xphoto/inpainting.hpp"
#include "xphoto/simple_color_balance.hpp"
#include "xphoto/white_balance.hpp"
#include "xphoto/dct_image_denoising.hpp"
#endif

@ -85,6 +85,38 @@ namespace xphoto
const float inputMin = 0.0f, const float inputMax = 255.0f,
const float outputMin = 0.0f, const float outputMax = 255.0f);
/** @brief Implements a simple grayworld white balance algorithm.
The function autowbGrayworld scales the values of pixels based on a
gray-world assumption which states that the average of all channels
should result in a gray image.
This function adds a modification which thresholds pixels based on their
saturation value and only uses pixels below the provided threshold in
finding average pixel values.
For a 3-channel RGB image, saturation is computed per pixel I as follows, and lies in the range [0, 1]:
\f[ \texttt{Saturation} [I] = \frac{\textrm{max}(R,G,B) - \textrm{min}(R,G,B)}{\textrm{max}(R,G,B)} \f]
A threshold of 1 means that all pixels are used to white-balance, while a
threshold of 0 means no pixels are used. Lower thresholds are useful in
white-balancing saturated images.
Currently only works on images of type @ref CV_8UC3.
@param src Input array.
@param dst Output array of the same size and type as src.
@param thresh Maximum saturation for a pixel to be included in the
gray-world assumption.
@sa balanceWhite
*/
CV_EXPORTS_W void autowbGrayworld(InputArray src, OutputArray dst,
float thresh = 0.5f);
//! @}
}
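As a worked example of the saturation formula above: a pixel with \f$(R,G,B) = (200, 180, 100)\f$ has saturation \f$(200 - 100)/200 = 0.5\f$, so with the default thresh of 0.5 it is still included in the gray-world average, while any more saturated pixel is excluded.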

@ -0,0 +1,29 @@
#include "perf_precomp.hpp"
using namespace std;
using namespace cv;
using namespace perf;
typedef std::tr1::tuple<Size, float> Size_WBThresh_t;
typedef perf::TestBaseWithParam<Size_WBThresh_t> Size_WBThresh;
PERF_TEST_P( Size_WBThresh, autowbGrayworld,
testing::Combine(
SZ_ALL_HD,
testing::Values( 0.1, 0.5, 1.0 )
)
)
{
Size size = std::tr1::get<0>(GetParam());
float wb_thresh = std::tr1::get<1>(GetParam());
Mat src(size, CV_8UC3);
Mat dst(size, CV_8UC3);
declare.in(src, WARMUP_RNG).out(dst);
TEST_CYCLE() xphoto::autowbGrayworld(src, dst, wb_thresh);
SANITY_CHECK(dst);
}

@ -0,0 +1,3 @@
#include "perf_precomp.hpp"
CV_PERF_TEST_MAIN(xphoto)

@ -0,0 +1,19 @@
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic ignored "-Wmissing-prototypes"
# pragma GCC diagnostic ignored "-Wextra"
# endif
#endif
#ifndef __OPENCV_PERF_PRECOMP_HPP__
#define __OPENCV_PERF_PRECOMP_HPP__
#include "opencv2/ts.hpp"
#include "opencv2/xphoto.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
#endif
#endif

@ -0,0 +1,62 @@
#include "opencv2/xphoto.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/core/utility.hpp"
using namespace cv;
using namespace std;
const char* keys =
{
"{i || input image name}"
"{o || output image name}"
};
int main( int argc, const char** argv )
{
bool printHelp = ( argc == 1 );
printHelp = printHelp || ( argc == 2 && string(argv[1]) == "--help" );
printHelp = printHelp || ( argc == 2 && string(argv[1]) == "-h" );
if ( printHelp )
{
printf("\nThis sample demonstrates the grayworld balance algorithm\n"
"Call:\n"
" simple_color_blance -i=in_image_name [-o=out_image_name]\n\n");
return 0;
}
CommandLineParser parser(argc, argv, keys);
if ( !parser.check() )
{
parser.printErrors();
return -1;
}
string inFilename = parser.get<string>("i");
string outFilename = parser.get<string>("o");
Mat src = imread(inFilename, IMREAD_COLOR);
if ( src.empty() )
{
printf("Cannot read image file: %s\n", inFilename.c_str());
return -1;
}
Mat res(src.size(), src.type());
xphoto::autowbGrayworld(src, res);
if ( outFilename == "" )
{
namedWindow("after white balance", WINDOW_AUTOSIZE);
imshow("after white balance", res);
waitKey(0);
}
else
imwrite(outFilename, res);
return 0;
}

@ -0,0 +1,213 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/xphoto.hpp"
#include "opencv2/core.hpp"
#include "opencv2/hal/intrin.hpp"
namespace cv { namespace xphoto {
void autowbGrayworld(InputArray _src, OutputArray _dst, float thresh)
{
Mat src = _src.getMat();
CV_Assert(!src.empty());
CV_Assert(src.isContinuous());
// TODO: Handle CV_8UC1
// TODO: Handle types other than CV_8U
CV_Assert(src.type() == CV_8UC3);
_dst.create(src.size(), src.type());
Mat dst = _dst.getMat();
CV_Assert(dst.isContinuous());
int width = src.cols,
height = src.rows,
N = width*height,
N3 = N*3;
// Calculate sum of pixel values of each channel
const uchar* src_data = src.ptr<uchar>(0);
unsigned long sum1 = 0, sum2 = 0, sum3 = 0;
unsigned int thresh255 = cvRound(thresh * 255);
int i = 0;
#if CV_SIMD128
v_uint8x16 v_inB, v_inG, v_inR;
v_uint16x8 v_s1, v_s2;
v_uint32x4 v_iB1, v_iB2, v_iB3, v_iB4,
v_iG1, v_iG2, v_iG3, v_iG4,
v_iR1, v_iR2, v_iR3, v_iR4,
v_255 = v_setall_u32(255),
v_thresh = v_setall_u32(thresh255),
v_min1, v_min2, v_min3, v_min4,
v_max1, v_max2, v_max3, v_max4,
v_m1, v_m2, v_m3, v_m4,
v_SB = v_setzero_u32(),
v_SG = v_setzero_u32(),
v_SR = v_setzero_u32();
for ( ; i < N3 - 47; i += 48 )
{
// NOTE: This block assumes BGR channels in naming variables
// Load 3x uint8x16 and deinterleave into vectors of each channel
v_load_deinterleave(&src_data[i], v_inB, v_inG, v_inR);
// Split into four int vectors per channel
v_expand(v_inB, v_s1, v_s2);
v_expand(v_s1, v_iB1, v_iB2);
v_expand(v_s2, v_iB3, v_iB4);
v_expand(v_inG, v_s1, v_s2);
v_expand(v_s1, v_iG1, v_iG2);
v_expand(v_s2, v_iG3, v_iG4);
v_expand(v_inR, v_s1, v_s2);
v_expand(v_s1, v_iR1, v_iR2);
v_expand(v_s2, v_iR3, v_iR4);
// Get mins and maxs
v_min1 = v_min(v_iB1, v_min(v_iG1, v_iR1));
v_min2 = v_min(v_iB2, v_min(v_iG2, v_iR2));
v_min3 = v_min(v_iB3, v_min(v_iG3, v_iR3));
v_min4 = v_min(v_iB4, v_min(v_iG4, v_iR4));
v_max1 = v_max(v_iB1, v_max(v_iG1, v_iR1));
v_max2 = v_max(v_iB2, v_max(v_iG2, v_iR2));
v_max3 = v_max(v_iB3, v_max(v_iG3, v_iR3));
v_max4 = v_max(v_iB4, v_max(v_iG4, v_iR4));
// Calculate masks
v_m1 = ~((v_max1 - v_min1) * v_255 > v_thresh * v_max1);
v_m2 = ~((v_max2 - v_min2) * v_255 > v_thresh * v_max2);
v_m3 = ~((v_max3 - v_min3) * v_255 > v_thresh * v_max3);
v_m4 = ~((v_max4 - v_min4) * v_255 > v_thresh * v_max4);
// Apply mask
v_SB += (v_iB1 & v_m1) + (v_iB2 & v_m2) + (v_iB3 & v_m3) + (v_iB4 & v_m4);
v_SG += (v_iG1 & v_m1) + (v_iG2 & v_m2) + (v_iG3 & v_m3) + (v_iG4 & v_m4);
v_SR += (v_iR1 & v_m1) + (v_iR2 & v_m2) + (v_iR3 & v_m3) + (v_iR4 & v_m4);
}
// Perform final reduction
sum1 = v_reduce_sum(v_SB);
sum2 = v_reduce_sum(v_SG);
sum3 = v_reduce_sum(v_SR);
#endif
unsigned int minRGB, maxRGB;
for ( ; i < N3; i += 3 )
{
minRGB = min(src_data[i], min(src_data[i + 1], src_data[i + 2]));
maxRGB = max(src_data[i], max(src_data[i + 1], src_data[i + 2]));
if ( (maxRGB - minRGB) * 255 > thresh255 * maxRGB ) continue;
sum1 += src_data[i];
sum2 += src_data[i + 1];
sum3 += src_data[i + 2];
}
// Find inverse of averages
double dinv1 = sum1 == 0 ? 0.0 : (double)N / (double)sum1,
dinv2 = sum2 == 0 ? 0.0 : (double)N / (double)sum2,
dinv3 = sum3 == 0 ? 0.0 : (double)N / (double)sum3;
// Find maximum
double inv_max = max(dinv1, max(dinv2, dinv3));
// Convert to floats
float inv1 = (float) dinv1,
inv2 = (float) dinv2,
inv3 = (float) dinv3;
// Scale by maximum
if ( inv_max > 0 )
{
inv1 = (float)((double)inv1 / inv_max);
inv2 = (float)((double)inv2 / inv_max);
inv3 = (float)((double)inv3 / inv_max);
}
// Fixed point arithmetic, mul by 2^8 then shift back 8 bits
int i_inv1 = cvRound(inv1 * (1 << 8)),
i_inv2 = cvRound(inv2 * (1 << 8)),
i_inv3 = cvRound(inv3 * (1 << 8));
// Scale input pixel values
uchar* dst_data = dst.ptr<uchar>(0);
i = 0;
#if CV_SIMD128
v_uint8x16 v_outB, v_outG, v_outR;
v_uint16x8 v_sB1, v_sB2, v_sG1, v_sG2, v_sR1, v_sR2,
v_invB = v_setall_u16((unsigned short) i_inv1),
v_invG = v_setall_u16((unsigned short) i_inv2),
v_invR = v_setall_u16((unsigned short) i_inv3);
for ( ; i < N3 - 47; i += 48 )
{
// Load 48 interleaved bytes and deinterleave into one uint8x16 per channel
v_load_deinterleave(&src_data[i], v_inB, v_inG, v_inR);
// Widen each channel into two uint16x8 vectors
v_expand(v_inB, v_sB1, v_sB2);
v_expand(v_inG, v_sG1, v_sG2);
v_expand(v_inR, v_sR1, v_sR2);
// Multiply by scaling factors
v_sB1 = (v_sB1 * v_invB) >> 8;
v_sB2 = (v_sB2 * v_invB) >> 8;
v_sG1 = (v_sG1 * v_invG) >> 8;
v_sG2 = (v_sG2 * v_invG) >> 8;
v_sR1 = (v_sR1 * v_invR) >> 8;
v_sR2 = (v_sR2 * v_invR) >> 8;
// Pack back to uint8 and store the three channels interleaved
v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2),
v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2));
}
#endif
for ( ; i < N3; i += 3 )
{
dst_data[i] = (uchar)((src_data[i] * i_inv1) >> 8);
dst_data[i + 1] = (uchar)((src_data[i + 1] * i_inv2) >> 8);
dst_data[i + 2] = (uchar)((src_data[i + 2] * i_inv3) >> 8);
}
}
}}
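A quick sanity check of the fixed-point scaling used above, with illustrative numbers: a channel gain of 0.75 becomes i_inv = cvRound(0.75 * 256) = 192, and a pixel value of 200 maps to (200 * 192) >> 8 = 150, which matches 200 * 0.75 exactly; in general the error is bounded by the rounding of the gain to multiples of 1/256.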

@ -0,0 +1,90 @@
#include "test_precomp.hpp"
namespace cvtest {
using namespace cv;
void ref_autowbGrayworld(InputArray _src, OutputArray _dst, float thresh)
{
Mat src = _src.getMat();
_dst.create(src.size(), src.type());
Mat dst = _dst.getMat();
int width = src.cols,
height = src.rows,
N = width*height,
N3 = N*3;
// Calculate sum of pixel values of each channel
const uchar* src_data = src.ptr<uchar>(0);
unsigned long sum1 = 0, sum2 = 0, sum3 = 0;
int i = 0;
unsigned int minRGB, maxRGB, thresh255 = cvRound(thresh * 255);
for ( ; i < N3; i += 3 )
{
minRGB = std::min(src_data[i], std::min(src_data[i + 1], src_data[i + 2]));
maxRGB = std::max(src_data[i], std::max(src_data[i + 1], src_data[i + 2]));
if ( (maxRGB - minRGB) * 255 > thresh255 * maxRGB ) continue;
sum1 += src_data[i];
sum2 += src_data[i + 1];
sum3 += src_data[i + 2];
}
// Find inverse of averages
double inv1 = sum1 == 0 ? 0.0 : (double)N / (double)sum1,
inv2 = sum2 == 0 ? 0.0 : (double)N / (double)sum2,
inv3 = sum3 == 0 ? 0.0 : (double)N / (double)sum3;
// Find maximum
double inv_max = std::max(std::max(inv1, inv2), inv3);
// Scale by maximum
if ( inv_max > 0 )
{
inv1 = (double) inv1 / inv_max;
inv2 = (double) inv2 / inv_max;
inv3 = (double) inv3 / inv_max;
}
// Fixed point arithmetic, mul by 2^8 then shift back 8 bits
int i_inv1 = cvRound(inv1 * (1 << 8)),
i_inv2 = cvRound(inv2 * (1 << 8)),
i_inv3 = cvRound(inv3 * (1 << 8));
// Scale input pixel values
uchar* dst_data = dst.ptr<uchar>(0);
i = 0;
for ( ; i < N3; i += 3 )
{
dst_data[i] = (uchar)((src_data[i] * i_inv1) >> 8);
dst_data[i + 1] = (uchar)((src_data[i + 1] * i_inv2) >> 8);
dst_data[i + 2] = (uchar)((src_data[i + 2] * i_inv3) >> 8);
}
}
TEST(xphoto_grayworld_white_balance, regression)
{
String subfolder = "/xphoto/";
String dir = cvtest::TS::ptr()->get_data_path() + subfolder + "simple_white_balance/";
const int nTests = 14;
const float wb_thresh = 0.5f;
const float acc_thresh = 2.f;
for ( int i = 0; i < nTests; ++i )
{
String srcName = dir + format("sources/%02d.png", i + 1);
Mat src = imread(srcName, IMREAD_COLOR);
ASSERT_TRUE(!src.empty());
Mat referenceResult;
ref_autowbGrayworld(src, referenceResult, wb_thresh);
Mat currentResult;
xphoto::autowbGrayworld(src, currentResult, wb_thresh);
ASSERT_LE(cv::norm(currentResult, referenceResult, NORM_INF), acc_thresh);
}
}
}