diff --git a/3rdparty/ffmpeg/opencv_ffmpeg.dll b/3rdparty/ffmpeg/opencv_ffmpeg.dll
index 1641c8a516..a3fede52c7 100644
Binary files a/3rdparty/ffmpeg/opencv_ffmpeg.dll and b/3rdparty/ffmpeg/opencv_ffmpeg.dll differ
diff --git a/3rdparty/ffmpeg/opencv_ffmpeg_64.dll b/3rdparty/ffmpeg/opencv_ffmpeg_64.dll
index ad02f4e684..b8a5cf5312 100644
Binary files a/3rdparty/ffmpeg/opencv_ffmpeg_64.dll and b/3rdparty/ffmpeg/opencv_ffmpeg_64.dll differ
diff --git a/android/service/doc/BaseLoaderCallback.rst b/android/service/doc/BaseLoaderCallback.rst
index 3258004c59..71915c449d 100644
--- a/android/service/doc/BaseLoaderCallback.rst
+++ b/android/service/doc/BaseLoaderCallback.rst
@@ -48,7 +48,7 @@ See the "15-puzzle" OpenCV sample for details.
         super.onResume();
 
         Log.i(TAG, "Trying to load OpenCV library");
-        if (!OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_2_4_3, this, mOpenCVCallBack))
+        if (!OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_2_4_4, this, mOpenCVCallBack))
         {
             Log.e(TAG, "Cannot connect to OpenCV Manager");
         }
diff --git a/android/service/doc/JavaHelper.rst b/android/service/doc/JavaHelper.rst
index 9a128db0ab..34798c267e 100644
--- a/android/service/doc/JavaHelper.rst
+++ b/android/service/doc/JavaHelper.rst
@@ -47,3 +47,7 @@ OpenCV version constants
 .. data:: OPENCV_VERSION_2_4_3
 
     OpenCV Library version 2.4.3
+
+.. data:: OPENCV_VERSION_2_4_4
+
+    OpenCV Library version 2.4.4
diff --git a/android/service/engine/AndroidManifest.xml b/android/service/engine/AndroidManifest.xml
index 4af9652027..f4f0eb94fa 100644
--- a/android/service/engine/AndroidManifest.xml
+++ b/android/service/engine/AndroidManifest.xml
@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
     package="org.opencv.engine"
-    android:versionCode="25@ANDROID_PLATFORM_VERSION_CODE@"
-    android:versionName="2.5" >
+    android:versionCode="26@ANDROID_PLATFORM_VERSION_CODE@"
+    android:versionName="2.6" >
 
     <uses-sdk android:minSdkVersion="@ANDROID_NATIVE_API_LEVEL@" />
     <uses-feature android:name="android.hardware.touchscreen" android:required="false"/>
diff --git a/android/service/readme.txt b/android/service/readme.txt
index 7da05853ae..df17c18245 100644
--- a/android/service/readme.txt
+++ b/android/service/readme.txt
@@ -14,20 +14,20 @@ manually using adb tool:
 
 .. code-block:: sh
 
-    adb install OpenCV-2.4.3-android-sdk/apk/OpenCV_2.4.3.2_Manager_2.4_<platform>.apk
+    adb install OpenCV-2.4.4-android-sdk/apk/OpenCV_2.4.4_Manager_2.6_<platform>.apk
 
 Use the table below to determine proper OpenCV Manager package for your device:
 
-+------------------------------+--------------+-----------------------------------------------------+
-| Hardware Platform            | Android ver. | Package name                                        |
-+==============================+==============+=====================================================+
-| armeabi-v7a (ARMv7-A + NEON) |    >= 2.3    | OpenCV_2.4.3.2_Manager_2.4_armv7a-neon.apk          |
-+------------------------------+--------------+-----------------------------------------------------+
-| armeabi-v7a (ARMv7-A + NEON) |     = 2.2    | OpenCV_2.4.3.2_Manager_2.4_armv7a-neon-android8.apk |
-+------------------------------+--------------+-----------------------------------------------------+
-| armeabi (ARMv5, ARMv6)       |    >= 2.3    | OpenCV_2.4.3.2_Manager_2.4_armeabi.apk              |
-+------------------------------+--------------+-----------------------------------------------------+
-| Intel x86                    |    >= 2.3    | OpenCV_2.4.3.2_Manager_2.4_x86.apk                  |
-+------------------------------+--------------+-----------------------------------------------------+
-| MIPS                         |    >= 2.3    | OpenCV_2.4.3.2_Manager_2.4_mips.apk                 |
-+------------------------------+--------------+-----------------------------------------------------+
++------------------------------+--------------+---------------------------------------------------+
+| Hardware Platform            | Android ver. | Package name                                      |
++==============================+==============+===================================================+
+| armeabi-v7a (ARMv7-A + NEON) |    >= 2.3    | OpenCV_2.4.4_Manager_2.6_armv7a-neon.apk          |
++------------------------------+--------------+---------------------------------------------------+
+| armeabi-v7a (ARMv7-A + NEON) |     = 2.2    | OpenCV_2.4.4_Manager_2.6_armv7a-neon-android8.apk |
++------------------------------+--------------+---------------------------------------------------+
+| armeabi (ARMv5, ARMv6)       |    >= 2.3    | OpenCV_2.4.4_Manager_2.6_armeabi.apk              |
++------------------------------+--------------+---------------------------------------------------+
+| Intel x86                    |    >= 2.3    | OpenCV_2.4.4_Manager_2.6_x86.apk                  |
++------------------------------+--------------+---------------------------------------------------+
+| MIPS                         |    >= 2.3    | OpenCV_2.4.4_Manager_2.6_mips.apk                 |
++------------------------------+--------------+---------------------------------------------------+
diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake
index cbbc3e8c16..12ab9d3eae 100644
--- a/cmake/OpenCVDetectOpenCL.cmake
+++ b/cmake/OpenCVDetectOpenCL.cmake
@@ -81,6 +81,7 @@ else()
     set(ENV_AMDAPPSDKROOT $ENV{AMDAPPSDKROOT})
     set(ENV_OPENCLROOT $ENV{OPENCLROOT})
     set(ENV_CUDA_PATH $ENV{CUDA_PATH})
+    set(ENV_INTELOCLSDKROOT $ENV{INTELOCLSDKROOT})
     if(ENV_AMDSTREAMSDKROOT)
       set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDAPPSDKROOT}/include)
       if(CMAKE_SIZEOF_VOID_P EQUAL 4)
@@ -109,6 +110,13 @@ else()
       else()
         set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib64)
       endif()
+    elseif(ENV_INTELOCLSDKROOT)
+      set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_INTELOCLSDKROOT}/include)
+      if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+        set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x86)
+      else()
+        set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x64)
+      endif()
     endif()
 
     if(OPENCL_INCLUDE_SEARCH_PATH)
diff --git a/cmake/OpenCVGenAndroidMK.cmake b/cmake/OpenCVGenAndroidMK.cmake
index 5b11afc17b..ba67f41891 100644
--- a/cmake/OpenCVGenAndroidMK.cmake
+++ b/cmake/OpenCVGenAndroidMK.cmake
@@ -20,7 +20,7 @@ if(ANDROID)
   endif()
 
   # setup lists of camera libs
-  foreach(abi ARMEABI ARMEABI_V7A X86)
+  foreach(abi ARMEABI ARMEABI_V7A X86 MIPS)
     ANDROID_GET_ABI_RAWNAME(${abi} ndkabi)
     if(BUILD_ANDROID_CAMERA_WRAPPER)
       if(ndkabi STREQUAL ANDROID_NDK_ABI_NAME)
diff --git a/cmake/templates/OpenCV.mk.in b/cmake/templates/OpenCV.mk.in
index 943d7cb672..8c1b855380 100644
--- a/cmake/templates/OpenCV.mk.in
+++ b/cmake/templates/OpenCV.mk.in
@@ -57,6 +57,9 @@ ifeq (${OPENCV_CAMERA_MODULES},on)
     ifeq ($(TARGET_ARCH_ABI),x86)
         OPENCV_CAMERA_MODULES:=@OPENCV_CAMERA_LIBS_X86_CONFIGCMAKE@
     endif
+    ifeq ($(TARGET_ARCH_ABI),mips)
+        OPENCV_CAMERA_MODULES:=@OPENCV_CAMERA_LIBS_MIPS_CONFIGCMAKE@
+    endif
 else
     OPENCV_CAMERA_MODULES:=
 endif
diff --git a/doc/tutorials/definitions/tocDefinitions.rst b/doc/tutorials/definitions/tocDefinitions.rst
index ffe763fd16..4695623cca 100644
--- a/doc/tutorials/definitions/tocDefinitions.rst
+++ b/doc/tutorials/definitions/tocDefinitions.rst
@@ -10,3 +10,4 @@
 .. |Author_AlexB| unicode:: Alexandre U+0020 Benoit
 .. |Author_EricCh| unicode:: Eric U+0020 Christiansen
 .. |Author_AndreyP| unicode:: Andrey U+0020 Pavlenko
+.. |Author_AlexS| unicode:: Alexander U+0020 Smorkalov
diff --git a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
new file mode 100644
index 0000000000..c40b86c974
--- /dev/null
+++ b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
@@ -0,0 +1,115 @@
+
+.. _ARM-Linux-cross-compile:
+
+Cross compilation for ARM based Linux systems
+*********************************************
+
+These steps are tested on Ubuntu Linux 12.04, but should work for other Linux distributions.
+In case of other distributions package names and names of cross compilation tools may differ.
+There are several popular EABI versions that are used on ARM platform. This tutorial is
+written for *gnueabi* and *gnueabihf*, but other variants should work with minimal changes.
+
+
+Prerequisites
+=============
+
+  * Host computer with Linux;
+  * Git;
+  * CMake 2.6 or higher;
+  * Cross compilation tools for ARM: gcc, libstdc++, etc. Depending on target platform you need
+    to choose *gnueabi* or *gnueabihf* tools.
+    Install command for *gnueabi*:
+
+    .. code-block:: bash
+
+       sudo apt-get install gcc-arm-linux-gnueabi
+
+    Install command for *gnueabihf*:
+
+    .. code-block:: bash
+
+       sudo apt-get install gcc-arm-linux-gnueabihf
+
+  * pkgconfig;
+  * Python 2.6 for host system;
+  * [optional] ffmpeg or libav development packages for armeabi(hf): libavcodec-dev, libavformat-dev, libswscale-dev;
+  * [optional] GTK+2.x or higher, including headers (libgtk2.0-dev) for armeabi(hf);
+  * [optional] libdc1394 2.x;
+  * [optional] libjpeg-dev, libpng-dev, libtiff-dev, libjasper-dev for armeabi(hf).
+
+
+Getting OpenCV Source Code
+==========================
+
+You can use the latest stable OpenCV version available in *sourceforge* or you can grab the latest
+snapshot from our `Git repository <https://github.com/Itseez/opencv.git>`_.
+
+
+Getting the Latest Stable OpenCV Version
+----------------------------------------
+
+* Go to our `page on Sourceforge <http://sourceforge.net/projects/opencvlibrary>`_;
+
+* Download the source tarball and unpack it.
+
+
+Getting the Cutting-edge OpenCV from the Git Repository
+-------------------------------------------------------
+
+Launch Git client and clone `OpenCV repository <http://github.com/itseez/opencv>`_
+
+In Linux it can be achieved with the following command in Terminal:
+
+.. code-block:: bash
+
+   cd ~/<my_working_directory>
+   git clone https://github.com/Itseez/opencv.git
+
+
+Building OpenCV
+===============
+
+#. Create a build directory, make it current and run the following command:
+
+   .. code-block:: bash
+
+      cmake [<some optional parameters>] -DCMAKE_TOOLCHAIN_FILE=<path to the OpenCV source directory>/platforms/linux/arm-gnueabi.toolchain.cmake <path to the OpenCV source directory>
+
+   Toolchain uses *gnueabihf* EABI convention by default. Add ``-DSOFTFP=ON`` cmake argument to switch on softfp compiler.
+
+   .. code-block:: bash
+
+      cmake [<some optional parameters>] -DSOFTFP=ON -DCMAKE_TOOLCHAIN_FILE=<path to the OpenCV source directory>/platforms/linux/arm-gnueabi.toolchain.cmake <path to the OpenCV source directory>
+
+   For example:
+
+   .. code-block:: bash
+
+      cd ~/opencv/platforms/linux
+      mkdir -p build_hardfp
+      cd build_hardfp
+
+      cmake -DCMAKE_TOOLCHAIN_FILE=../arm-gnueabi.toolchain.cmake ../../..
+
+#. Run make in build (<cmake_binary_dir>) directory:
+
+   .. code-block:: bash
+
+      make
+
+.. note::
+
+   Optionally you can strip symbols info from the created library via install/strip make target.
+   This option produces smaller binary (~ twice smaller) but makes further debugging harder.
+
+Enable hardware optimizations
+-----------------------------
+
+Depending on target platform architecture different instruction sets can be used. By default
+compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DUSE_VFPV3=ON``
+to cmake command line to enable code generation for VFPv3 and ``-DUSE_NEON=ON`` for using
+NEON SIMD extensions.
+
+TBB is supported on multi core ARM SoCs also.
+Add ``-DWITH_TBB=ON`` and ``-DBUILD_TBB=ON`` to enable it. Cmake scripts download TBB sources
+from official project site `<http://threadingbuildingblocks.org/>`_ and build it.
diff --git a/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst b/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst
index 504e5e5639..5e91a5392c 100644
--- a/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst
+++ b/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst
@@ -3,7 +3,9 @@
 Introduction to OpenCV
 -----------------------------------------------------------
 
-Here you can read tutorials about how to set up your computer to work with the OpenCV library. Additionaly you can find a few very basic sample source code that will let introduce you to the world of the OpenCV.
+Here you can read tutorials about how to set up your computer to work with the OpenCV library.
+Additionally you can find a few very basic code samples that will introduce you to the
+world of OpenCV.
 
 .. include:: ../../definitions/tocDefinitions.rst
 
@@ -189,6 +191,24 @@ Here you can read tutorials about how to set up your computer to work with the O
      .. |Install_iOS| image:: images/opencv_ios.png
                  :width:  90pt
 
+* **Embedded Linux**
+
+  .. tabularcolumns:: m{100pt} m{300pt}
+  .. cssclass:: toctableopencv
+
+  =========== ======================================================
+  |Usage_1|   **Title:** :ref:`ARM-Linux-cross-compile`
+
+              *Compatibility:* > OpenCV 2.4.4
+
+              *Author:* |Author_AlexS|
+
+              We will learn how to set up an OpenCV cross compilation environment for ARM Linux.
+
+  =========== ======================================================
+
+* **Common**
+
   .. tabularcolumns:: m{100pt} m{300pt}
   .. cssclass:: toctableopencv
 
@@ -249,7 +269,7 @@ Here you can read tutorials about how to set up your computer to work with the O
 
    \pagebreak
 
-.. We use a custom table of content format and as the table of content only imforms Sphinx about the hierarchy of the files, no need to show it.
+.. We use a custom table of content format and as the table of content only informs Sphinx about the hierarchy of the files, no need to show it.
 .. toctree::
    :hidden:
 
@@ -263,6 +283,7 @@ Here you can read tutorials about how to set up your computer to work with the O
    ../android_binary_package/O4A_SDK
    ../android_binary_package/dev_with_OCV_on_Android
    ../ios_install/ios_install
+   ../crosscompilation/arm_crosscompile_with_cmake
    ../display_image/display_image
    ../load_save_image/load_save_image
    ../how_to_write_a_tutorial/how_to_write_a_tutorial
diff --git a/modules/core/doc/basic_structures.rst b/modules/core/doc/basic_structures.rst
index 054aa2ad7c..ca9f5e21a2 100644
--- a/modules/core/doc/basic_structures.rst
+++ b/modules/core/doc/basic_structures.rst
@@ -238,7 +238,7 @@ The constructors.
     :param epsilon: The desired accuracy or change in parameters at which the iterative algorithm stops.
 
     :param criteria: Termination criteria in the deprecated ``CvTermCriteria`` format.
-   
+
 TermCriteria::operator CvTermCriteria
 -------------------------------------
 Converts to the deprecated ``CvTermCriteria`` format.
@@ -418,27 +418,47 @@ Template class for smart reference-counting pointers ::
     };
 
 
-The ``Ptr<_Tp>`` class is a template class that wraps pointers of the corresponding type. It is similar to ``shared_ptr`` that is part of the Boost library (
-http://www.boost.org/doc/libs/1_40_0/libs/smart_ptr/shared_ptr.htm
-) and also part of the `C++0x <http://en.wikipedia.org/wiki/C++0x>`_
-standard.
+The ``Ptr<_Tp>`` class is a template class that wraps pointers of the corresponding type. It is
+similar to ``shared_ptr`` that is part of the Boost library
+(http://www.boost.org/doc/libs/1_40_0/libs/smart_ptr/shared_ptr.htm) and also part of the
+`C++0x <http://en.wikipedia.org/wiki/C++0x>`_ standard.
 
 This class provides the following options:
 
 *
-    Default constructor, copy constructor, and assignment operator for an arbitrary C++ class or a C structure. For some objects, like files, windows, mutexes, sockets, and others, a copy constructor or an assignment operator are difficult to define. For some other objects, like complex classifiers in OpenCV, copy constructors are absent and not easy to implement. Finally, some of complex OpenCV and your own data structures may be written in C. However, copy constructors and default constructors can simplify programming a lot. Besides, they are often required (for example, by STL containers). By wrapping a pointer to such a complex object ``TObj``     to ``Ptr<TObj>`` , you automatically get all of the necessary constructors and the assignment operator.
+    Default constructor, copy constructor, and assignment operator for an arbitrary C++ class
+    or a C structure. For some objects, like files, windows, mutexes, sockets, and others, a copy
+    constructor or an assignment operator are difficult to define. For some other objects, like
+    complex classifiers in OpenCV, copy constructors are absent and not easy to implement. Finally,
+    some of complex OpenCV and your own data structures may be written in C.
+    However, copy constructors and default constructors can simplify programming a lot. Besides,
+    they are often required (for example, by STL containers). By wrapping a pointer to such a
+    complex object ``TObj`` to ``Ptr<TObj>``, you automatically get all of the necessary
+    constructors and the assignment operator.
 
 *
-    *O(1)* complexity of the above-mentioned operations. While some structures, like ``std::vector``,   provide a copy constructor and an assignment operator, the operations may take a considerable amount of time if the data structures are large. But if the structures are put into ``Ptr<>``     , the overhead is small and independent of the data size.
+    *O(1)* complexity of the above-mentioned operations. While some structures, like ``std::vector``,
+    provide a copy constructor and an assignment operator, the operations may take a considerable
+    amount of time if the data structures are large. But if the structures are put into ``Ptr<>``,
+    the overhead is small and independent of the data size.
 
 *
-    Automatic destruction, even for C structures. See the example below with ``FILE*``     .
+    Automatic destruction, even for C structures. See the example below with ``FILE*``.
 
 *
-    Heterogeneous collections of objects. The standard STL and most other C++ and OpenCV containers can store only objects of the same type and the same size. The classical solution to store objects of different types in the same container is to store pointers to the base class ``base_class_t*``     instead but then you loose the automatic memory management. Again, by using ``Ptr<base_class_t>()``     instead of the raw pointers, you can solve the problem.
-
-The ``Ptr`` class treats the wrapped object as a black box. The reference counter is allocated and managed separately. The only thing the pointer class needs to know about the object is how to deallocate it. This knowledge is encapsulated in the ``Ptr::delete_obj()`` method that is called when the reference counter becomes 0. If the object is a C++ class instance, no additional coding is needed, because the default implementation of this method calls ``delete obj;`` .
-However, if the object is deallocated in a different way, the specialized method should be created. For example, if you want to wrap ``FILE`` , the ``delete_obj`` may be implemented as follows: ::
+    Heterogeneous collections of objects. The standard STL and most other C++ and OpenCV containers
+    can store only objects of the same type and the same size. The classical solution to store objects
+    of different types in the same container is to store pointers to the base class ``base_class_t*``
+    instead but then you lose the automatic memory management. Again, by using ``Ptr<base_class_t>()``
+    instead of the raw pointers, you can solve the problem.
+
+The ``Ptr`` class treats the wrapped object as a black box. The reference counter is allocated and
+managed separately. The only thing the pointer class needs to know about the object is how to
+deallocate it. This knowledge is encapsulated in the ``Ptr::delete_obj()`` method that is called when
+the reference counter becomes 0. If the object is a C++ class instance, no additional coding is
+needed, because the default implementation of this method calls ``delete obj;``. However, if the
+object is deallocated in a different way, the specialized method should be created. For example,
+if you want to wrap ``FILE``, the ``delete_obj`` may be implemented as follows: ::
 
     template<> inline void Ptr<FILE>::delete_obj()
     {
@@ -456,7 +476,73 @@ However, if the object is deallocated in a different way, the specialized method
     // the file will be closed automatically by the Ptr<FILE> destructor.
 
 
-.. note:: The reference increment/decrement operations are implemented as atomic operations, and therefore it is normally safe to use the classes in multi-threaded applications. The same is true for :ocv:class:`Mat` and other C++ OpenCV classes that operate on the reference counters.
+.. note:: The reference increment/decrement operations are implemented as atomic operations,
+          and therefore it is normally safe to use the classes in multi-threaded applications.
+          The same is true for :ocv:class:`Mat` and other C++ OpenCV classes that operate on
+          the reference counters.
+
+Ptr::Ptr
+--------
+Various Ptr constructors.
+
+.. ocv:function:: Ptr::Ptr()
+.. ocv:function:: Ptr::Ptr(_Tp* _obj)
+.. ocv:function:: Ptr::Ptr(const Ptr& ptr)
+
+Ptr::~Ptr
+---------
+The Ptr destructor.
+
+.. ocv:function:: Ptr::~Ptr()
+
+Ptr::operator =
+----------------
+Assignment operator.
+
+.. ocv:function:: Ptr& Ptr::operator = (const Ptr& ptr)
+
+Decrements own reference counter (with ``release()``) and increments ptr's reference counter.
+
+Ptr::addref
+-----------
+Increments reference counter.
+
+.. ocv:function:: void Ptr::addref()
+
+Ptr::release
+------------
+Decrements reference counter; when it becomes 0, ``delete_obj()`` is called.
+
+.. ocv:function:: void Ptr::release()
+
+Ptr::delete_obj
+---------------
+User-specified custom object deletion operation. By default, ``delete obj;`` is called.
+
+.. ocv:function:: void Ptr::delete_obj()
+
+Ptr::empty
+----------
+Returns true if ``obj == 0``.
+
+.. ocv:function:: bool Ptr::empty() const
+
+Ptr::operator ->
+----------------
+Provides access to the object fields and methods.
+
+ .. ocv:function:: template<typename _Tp> _Tp* Ptr::operator -> ()
+ .. ocv:function:: template<typename _Tp> const _Tp* Ptr::operator -> () const
+
+
+Ptr::operator _Tp*
+------------------
+Returns the underlying object pointer. Thanks to these methods, ``Ptr<_Tp>`` can be used instead
+of ``_Tp*``.
+
+ .. ocv:function:: template<typename _Tp> Ptr::operator _Tp* ()
+ .. ocv:function:: template<typename _Tp> Ptr::operator const _Tp*() const
+
 
 Mat
 ---
@@ -494,9 +580,9 @@ OpenCV C++ n-dimensional dense array class ::
 
 
 The class ``Mat`` represents an n-dimensional dense numerical single-channel or multi-channel array. It can be used to store real or complex-valued vectors and matrices, grayscale or color images, voxel volumes, vector fields, point clouds, tensors, histograms (though, very high-dimensional histograms may be better stored in a ``SparseMat`` ). The data layout of the array
-:math:`M` is defined by the array ``M.step[]`` , so that the address of element
-:math:`(i_0,...,i_{M.dims-1})` , where
-:math:`0\leq i_k<M.size[k]` , is computed as:
+:math:`M` is defined by the array ``M.step[]``, so that the address of element
+:math:`(i_0,...,i_{M.dims-1})`, where
+:math:`0\leq i_k<M.size[k]`, is computed as:
 
 .. math::
 
@@ -529,7 +615,7 @@ There are many different ways to create a ``Mat`` object. The most popular optio
 
     ..
 
-    As noted in the introduction to this chapter, ``create()``      allocates only  a new array when the shape or type of the current array are different from the specified ones.
+    As noted in the introduction to this chapter, ``create()`` allocates a new array only when the shape or type of the current array are different from the specified ones.
 
 *
 
@@ -543,7 +629,7 @@ There are many different ways to create a ``Mat`` object. The most popular optio
 
     ..
 
-    It passes the number of dimensions =1 to the ``Mat``     constructor but the created array will be 2-dimensional with the number of columns set to 1. So, ``Mat::dims``     is always >= 2 (can also be 0 when the array is empty).
+    It passes the number of dimensions =1 to the ``Mat`` constructor but the created array will be 2-dimensional with the number of columns set to 1. So, ``Mat::dims`` is always >= 2 (can also be 0 when the array is empty).
 
 *
 
@@ -573,7 +659,7 @@ There are many different ways to create a ``Mat`` object. The most popular optio
 
     ..
 
-    Due to the additional ``datastart``     and ``dataend``     members, it is possible to compute a relative sub-array position in the main *container* array using ``locateROI()``:
+    Due to the additional ``datastart`` and ``dataend`` members, it is possible to compute a relative sub-array position in the main *container* array using ``locateROI()``:
 
     ::
 
@@ -589,7 +675,7 @@ There are many different ways to create a ``Mat`` object. The most popular optio
 
     ..
 
-    As in case of whole matrices, if you need a deep copy, use the ``clone()``     method of the extracted sub-matrices.
+    As in case of whole matrices, if you need a deep copy, use the ``clone()`` method of the extracted sub-matrices.
 
 *
 
@@ -619,7 +705,7 @@ There are many different ways to create a ``Mat`` object. The most popular optio
 
         ..
 
-    Partial yet very common cases of this *user-allocated data* case are conversions from ``CvMat``     and ``IplImage`` to ``Mat``. For this purpose, there are special constructors taking pointers to ``CvMat``     or ``IplImage`` and the optional flag indicating whether to copy the data or not.
+    Partial yet very common cases of this *user-allocated data* case are conversions from ``CvMat`` and ``IplImage`` to ``Mat``. For this purpose, there are special constructors taking pointers to ``CvMat`` or ``IplImage`` and the optional flag indicating whether to copy the data or not.
 
         Backward conversion from ``Mat`` to ``CvMat`` or ``IplImage`` is provided via cast operators ``Mat::operator CvMat() const`` and ``Mat::operator IplImage()``. The operators do NOT copy the data.
 
@@ -905,7 +991,7 @@ Provides matrix assignment operators.
 
     :param m: Assigned, right-hand-side matrix. Matrix assignment is an O(1) operation. This means that no data is copied but the data is shared and the reference counter, if any, is incremented. Before assigning new data, the old data is de-referenced via  :ocv:func:`Mat::release` .
 
-    :param expr: Assigned matrix expression object. As opposite to the first form of the assignment operation, the second form can reuse already allocated matrix if it has the right size and type to fit the matrix expression result. It is automatically handled by the real function that the matrix expressions is expanded to. For example,  ``C=A+B``  is expanded to  ``add(A, B, C)`` , and  :func:`add`  takes care of automatic  ``C``  reallocation.
+    :param expr: Assigned matrix expression object. As opposed to the first form of the assignment operation, the second form can reuse already allocated matrix if it has the right size and type to fit the matrix expression result. It is automatically handled by the real function that the matrix expression is expanded to. For example,  ``C=A+B``  is expanded to  ``add(A, B, C)``, and  :func:`add`  takes care of automatic  ``C``  reallocation.
 
     :param s: Scalar assigned to each matrix element. The matrix size or type is not changed.
 
@@ -970,7 +1056,7 @@ Creates a matrix header for the specified row span.
 
     :param endrow: An exclusive 0-based ending index of the row span.
 
-    :param r: :ocv:class:`Range`  structure containing both the start and the end indices.
+    :param r: :ocv:class:`Range` structure containing both the start and the end indices.
 
 The method makes a new header for the specified row span of the matrix. Similarly to
 :ocv:func:`Mat::row` and
@@ -1983,7 +2069,7 @@ The class ``SparseMat`` represents multi-dimensional sparse numerical arrays. Su
 :ocv:class:`Mat` can store. *Sparse* means that only non-zero elements are stored (though, as a result of operations on a sparse matrix, some of its stored elements can actually become 0. It is up to you to detect such elements and delete them using ``SparseMat::erase`` ). The non-zero elements are stored in a hash table that grows when it is filled so that the search time is O(1) in average (regardless of whether element is there or not). Elements can be accessed using the following methods:
 
 *
-    Query operations ( ``SparseMat::ptr``     and the higher-level ``SparseMat::ref``,    ``SparseMat::value``     and ``SparseMat::find``     ), for example:
+    Query operations (``SparseMat::ptr`` and the higher-level ``SparseMat::ref``, ``SparseMat::value`` and ``SparseMat::find``), for example:
 
     ::
 
@@ -2001,7 +2087,7 @@ The class ``SparseMat`` represents multi-dimensional sparse numerical arrays. Su
     ..
 
 *
-    Sparse matrix iterators. They are similar to ``MatIterator`` but different from :ocv:class:`NAryMatIterator`.     That is, the iteration loop is familiar to STL users:
+    Sparse matrix iterators. They are similar to ``MatIterator`` but different from :ocv:class:`NAryMatIterator`. That is, the iteration loop is familiar to STL users:
 
     ::
 
diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp
index 6bf4e5d21b..d39f0a82d7 100644
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -73,12 +73,16 @@ namespace cv { namespace gpu
         FEATURE_SET_COMPUTE_20 = 20,
         FEATURE_SET_COMPUTE_21 = 21,
         FEATURE_SET_COMPUTE_30 = 30,
+        FEATURE_SET_COMPUTE_35 = 35,
+
         GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
         SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
         NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
-        WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
+        WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30,
+        DYNAMIC_PARALLELISM = FEATURE_SET_COMPUTE_35
     };
 
+    // Checks whether current device supports the given feature
     CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
 
     // Gives information about what GPU archs this OpenCV GPU module was
@@ -116,8 +120,9 @@ namespace cv { namespace gpu
 
         int multiProcessorCount() const { return multi_processor_count_; }
 
-        size_t sharedMemPerBlock() const { return sharedMemPerBlock_; }
+        size_t sharedMemPerBlock() const;
 
+        void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
         size_t freeMemory() const;
         size_t totalMemory() const;
 
@@ -131,7 +136,6 @@ namespace cv { namespace gpu
 
     private:
         void query();
-        void queryMemory(size_t& free_memory, size_t& total_memory) const;
 
         int device_id_;
 
@@ -139,7 +143,6 @@ namespace cv { namespace gpu
         int multi_processor_count_;
         int majorVersion_;
         int minorVersion_;
-        size_t sharedMemPerBlock_;
     };
 
     CV_EXPORTS void printCudaDeviceInfo(int device);
@@ -546,13 +549,6 @@ namespace cv { namespace gpu
     {
         ensureSizeIsEnough(size.height, size.width, type, m);
     }
-
-    inline GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat)
-    {
-        if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
-            return mat(Rect(0, 0, cols, rows));
-        return mat = GpuMat(rows, cols, type);
-    }
 }}
 
 #endif // __cplusplus
diff --git a/modules/core/perf/perf_compare.cpp b/modules/core/perf/perf_compare.cpp
index 32f8ba7682..5fb755d956 100644
--- a/modules/core/perf/perf_compare.cpp
+++ b/modules/core/perf/perf_compare.cpp
@@ -52,7 +52,8 @@ PERF_TEST_P( Size_MatType_CmpType, compareScalar,
 
     declare.in(src1, src2, WARMUP_RNG).out(dst);
 
-    TEST_CYCLE() cv::compare(src1, src2, dst, cmpType);
+    int runs = (sz.width <= 640) ? 8 : 1;
+    TEST_CYCLE_MULTIRUN(runs) cv::compare(src1, src2, dst, cmpType);
 
     SANITY_CHECK(dst);
 }
diff --git a/modules/core/perf/perf_convertTo.cpp b/modules/core/perf/perf_convertTo.cpp
index c4e832f1f7..8007361228 100644
--- a/modules/core/perf/perf_convertTo.cpp
+++ b/modules/core/perf/perf_convertTo.cpp
@@ -29,9 +29,9 @@ PERF_TEST_P( Size_DepthSrc_DepthDst_Channels_alpha, convertTo,
     Mat src(sz, CV_MAKETYPE(depthSrc, channels));
     randu(src, 0, 255);
     Mat dst(sz, CV_MAKETYPE(depthDst, channels));
-    declare.iterations(500);
 
-    TEST_CYCLE() src.convertTo(dst, depthDst, alpha);
+    int runs = (sz.width <= 640) ? 8 : 1;
+    TEST_CYCLE_MULTIRUN(runs) src.convertTo(dst, depthDst, alpha);
 
     SANITY_CHECK(dst, alpha == 1.0 ? 1e-12 : 1e-7);
 }
diff --git a/modules/core/perf/perf_mat.cpp b/modules/core/perf/perf_mat.cpp
index 3749feb384..79a3ecd1ff 100644
--- a/modules/core/perf/perf_mat.cpp
+++ b/modules/core/perf/perf_mat.cpp
@@ -18,7 +18,8 @@ PERF_TEST_P(Size_MatType, Mat_Eye,
 
     declare.out(diagonalMatrix);
 
-    TEST_CYCLE()
+    int runs = (size.width <= 640) ? 15 : 5;
+    TEST_CYCLE_MULTIRUN(runs)
     {
         diagonalMatrix = Mat::eye(size, type);
     }
@@ -38,7 +39,8 @@ PERF_TEST_P(Size_MatType, Mat_Zeros,
 
     declare.out(zeroMatrix);
 
-    TEST_CYCLE()
+    int runs = (size.width <= 640) ? 15 : 5;
+    TEST_CYCLE_MULTIRUN(runs)
     {
         zeroMatrix = Mat::zeros(size, type);
     }
diff --git a/modules/core/perf/perf_merge.cpp b/modules/core/perf/perf_merge.cpp
index d10cc81542..d82941a92b 100644
--- a/modules/core/perf/perf_merge.cpp
+++ b/modules/core/perf/perf_merge.cpp
@@ -30,7 +30,8 @@ PERF_TEST_P( Size_SrcDepth_DstChannels, merge,
     }
 
     Mat dst;
-    TEST_CYCLE() merge( (vector<Mat> &)mv, dst );
+    int runs = (sz.width <= 640) ? 8 : 1;
+    TEST_CYCLE_MULTIRUN(runs) merge( (vector<Mat> &)mv, dst );
 
     SANITY_CHECK(dst, 1e-12);
 }
\ No newline at end of file
diff --git a/modules/core/perf/perf_split.cpp b/modules/core/perf/perf_split.cpp
index f5de9b6707..df9095fc61 100644
--- a/modules/core/perf/perf_split.cpp
+++ b/modules/core/perf/perf_split.cpp
@@ -26,8 +26,8 @@ PERF_TEST_P( Size_Depth_Channels, split,
     randu(m, 0, 255);
 
     vector<Mat> mv;
-
-    TEST_CYCLE() split(m, (vector<Mat>&)mv);
+    int runs = (sz.width <= 640) ? 8 : 1;
+    TEST_CYCLE_MULTIRUN(runs) split(m, (vector<Mat>&)mv);
 
     SANITY_CHECK(mv, 1e-12);
 }
diff --git a/modules/core/perf/perf_stat.cpp b/modules/core/perf/perf_stat.cpp
index 79e849e081..b7fc43d120 100644
--- a/modules/core/perf/perf_stat.cpp
+++ b/modules/core/perf/perf_stat.cpp
@@ -97,7 +97,8 @@ PERF_TEST_P(Size_MatType, countNonZero, testing::Combine( testing::Values( TYPIC
 
     declare.in(src, WARMUP_RNG);
 
-    TEST_CYCLE() cnt = countNonZero(src);
+    int runs = (sz.width <= 640) ? 8 : 1;
+    TEST_CYCLE_MULTIRUN(runs) cnt = countNonZero(src);
 
     SANITY_CHECK(cnt);
 }
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 43f4d613bc..942fb450d8 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -48,8 +48,8 @@
     #include <cuda_runtime.h>
     #include <npp.h>
 
-    #define CUDART_MINIMUM_REQUIRED_VERSION 4010
-    #define NPP_MINIMUM_REQUIRED_VERSION 4100
+    #define CUDART_MINIMUM_REQUIRED_VERSION 4020
+    #define NPP_MINIMUM_REQUIRED_VERSION 4200
 
     #if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
         #error "Insufficient Cuda Runtime library version, please update it."
diff --git a/modules/gpu/doc/feature_detection_and_description.rst b/modules/gpu/doc/feature_detection_and_description.rst
index 5a6f85c5f7..0c4caf77c4 100644
--- a/modules/gpu/doc/feature_detection_and_description.rst
+++ b/modules/gpu/doc/feature_detection_and_description.rst
@@ -640,4 +640,3 @@ Converts matrices obtained via :ocv:func:`gpu::BFMatcher_GPU::radiusMatchSingle`
 .. ocv:function:: void gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches, std::vector< std::vector<DMatch> >& matches, bool compactResult = false)
 
 If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
-
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index abdbb7f15c..57f8c11579 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1507,6 +1507,7 @@ public:
 
     /* returns number of detected objects */
     int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
+    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
 
     bool findLargestObject;
     bool visualizeInPlace;
@@ -1519,9 +1520,6 @@ private:
     struct HaarCascade;
     struct LbpCascade;
     friend class CascadeClassifier_GPU_LBP;
-
-public:
-    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
 };
 
 // ======================== GPU version for soft cascade ===================== //
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index ac4fb7c78e..3603933979 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -778,6 +778,8 @@ NCVStatus loadFromXML(const std::string &filename,
     haar.bNeedsTiltedII = false;
     Ncv32u curMaxTreeDepth;
 
+    std::vector<char> xmlFileCont;
+
     std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
     haarStages.resize(0);
     haarClassifierNodes.resize(0);
diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp
index fecb717cd9..09cf01850e 100644
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@@ -121,9 +121,7 @@ void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, f
     buf.accum.setTo(Scalar::all(0));
 
     DeviceInfo devInfo;
-    cudaDeviceProp prop;
-    cudaSafeCall(cudaGetDeviceProperties(&prop, devInfo.deviceID()));
-    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, prop.sharedMemPerBlock, devInfo.supports(FEATURE_SET_COMPUTE_20));
+    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
 
     ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);
 
@@ -196,9 +194,7 @@ void cv::gpu::HoughLinesP(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf,
     buf.accum.setTo(Scalar::all(0));
 
     DeviceInfo devInfo;
-    cudaDeviceProp prop;
-    cudaSafeCall(cudaGetDeviceProperties(&prop, devInfo.deviceID()));
-    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, prop.sharedMemPerBlock, devInfo.supports(FEATURE_SET_COMPUTE_20));
+    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
 
     ensureSizeIsEnough(1, maxLines, CV_32SC4, lines);
 
diff --git a/modules/highgui/include/opencv2/highgui/highgui.hpp b/modules/highgui/include/opencv2/highgui/highgui.hpp
index 43cf13dc13..a58dd999c4 100644
--- a/modules/highgui/include/opencv2/highgui/highgui.hpp
+++ b/modules/highgui/include/opencv2/highgui/highgui.hpp
@@ -125,7 +125,7 @@ CV_EXPORTS_W void setTrackbarPos(const string& trackbarname, const string& winna
 
 // OpenGL support
 
-typedef void (CV_CDECL *OpenGlDrawCallback)(void* userdata);
+typedef void (*OpenGlDrawCallback)(void* userdata);
 CV_EXPORTS void setOpenGlDrawCallback(const string& winname, OpenGlDrawCallback onOpenGlDraw, void* userdata = 0);
 
 CV_EXPORTS void setOpenGlContext(const string& winname);
diff --git a/modules/highgui/src/cap_ffmpeg_impl.hpp b/modules/highgui/src/cap_ffmpeg_impl.hpp
index 8b571dda1d..642ece177d 100644
--- a/modules/highgui/src/cap_ffmpeg_impl.hpp
+++ b/modules/highgui/src/cap_ffmpeg_impl.hpp
@@ -560,6 +560,10 @@ bool CvCapture_FFMPEG::open( const char* _filename )
 
         if( AVMEDIA_TYPE_VIDEO == enc->codec_type && video_stream < 0)
         {
+            // back up the encoder's width/height
+            int enc_width = enc->width;
+            int enc_height = enc->height;
+
             AVCodec *codec = avcodec_find_decoder(enc->codec_id);
             if (!codec ||
 #if LIBAVCODEC_VERSION_INT >= ((53<<16)+(8<<8)+0)
@@ -570,6 +574,10 @@ bool CvCapture_FFMPEG::open( const char* _filename )
                 < 0)
                 goto exit_func;
 
+            // restore width/height (the decoder can sometimes alter them, e.g. vp6f)
+            if (enc_width && (enc->width != enc_width)) { enc->width = enc_width; }
+            if (enc_height && (enc->height != enc_height)) { enc->height = enc_height; }
+
             video_stream = i;
             video_st = ic->streams[i];
             picture = avcodec_alloc_frame();
diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp
index 65f9d5cf82..7922383891 100644
--- a/modules/imgproc/perf/perf_cvt_color.cpp
+++ b/modules/imgproc/perf/perf_cvt_color.cpp
@@ -299,10 +299,10 @@ PERF_TEST_P(Size_CvtMode2, cvtColorYUV420,
     Mat src(sz.height + sz.height / 2, sz.width, CV_8UC(ch.scn));
     Mat dst(sz, CV_8UC(ch.dcn));
 
-    declare.time(100);
     declare.in(src, WARMUP_RNG).out(dst);
 
-    TEST_CYCLE() cvtColor(src, dst, mode, ch.dcn);
+    int runs = (sz.width <= 640) ? 8 : 1;
+    TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn);
 
     SANITY_CHECK(dst, 1);
 }
diff --git a/modules/imgproc/perf/perf_matchTemplate.cpp b/modules/imgproc/perf/perf_matchTemplate.cpp
index 0c727d5a48..a89435b535 100644
--- a/modules/imgproc/perf/perf_matchTemplate.cpp
+++ b/modules/imgproc/perf/perf_matchTemplate.cpp
@@ -33,7 +33,8 @@ PERF_TEST_P(ImgSize_TmplSize_Method, matchTemplateSmall,
     declare
         .in(img, WARMUP_RNG)
         .in(tmpl, WARMUP_RNG)
-        .out(result);
+        .out(result)
+        .time(30);
 
     TEST_CYCLE() matchTemplate(img, tmpl, result, method);
 
@@ -66,7 +67,8 @@ PERF_TEST_P(ImgSize_TmplSize_Method, matchTemplateBig,
     declare
         .in(img, WARMUP_RNG)
         .in(tmpl, WARMUP_RNG)
-        .out(result);
+        .out(result)
+        .time(30);
 
     TEST_CYCLE() matchTemplate(img, tmpl, result, method);
 
diff --git a/modules/imgproc/perf/perf_threshold.cpp b/modules/imgproc/perf/perf_threshold.cpp
index dd1602cd20..8727b6339e 100644
--- a/modules/imgproc/perf/perf_threshold.cpp
+++ b/modules/imgproc/perf/perf_threshold.cpp
@@ -31,9 +31,9 @@ PERF_TEST_P(Size_MatType_ThreshType, threshold,
     double maxval = theRNG().uniform(1, 254);
 
     declare.in(src, WARMUP_RNG).out(dst);
-    declare.iterations(500);
 
-    TEST_CYCLE() threshold(src, dst, thresh, maxval, threshType);
+    int runs = (sz.width <= 640) ? 8 : 1;
+    TEST_CYCLE_MULTIRUN(runs) threshold(src, dst, thresh, maxval, threshType);
 
     SANITY_CHECK(dst);
 }
diff --git a/modules/java/generator/src/java/android+OpenCVLoader.java b/modules/java/generator/src/java/android+OpenCVLoader.java
index 4c3655c12c..70e94944dd 100644
--- a/modules/java/generator/src/java/android+OpenCVLoader.java
+++ b/modules/java/generator/src/java/android+OpenCVLoader.java
@@ -17,6 +17,11 @@ public class OpenCVLoader
      */
     public static final String OPENCV_VERSION_2_4_3 = "2.4.3";
 
+    /**
+     * OpenCV Library version 2.4.4.
+     */
+    public static final String OPENCV_VERSION_2_4_4 = "2.4.4";
+
     /**
      * Loads and initializes OpenCV library from current application package. Roughly, it's an analog of system.loadLibrary("opencv_java").
      * @return Returns true is initialization of OpenCV was successful.
diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp
index 84729deb99..6953ef5128 100644
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -102,7 +102,7 @@ namespace cv
 
         //this function enable ocl module to use customized cl_context and cl_command_queue
         //getDevice also need to be called before this function
-        CV_EXPORTS void setDeviceEx(Info &oclinfo, void *ctx, void *qu, int devnum = 0); 
+        CV_EXPORTS void setDeviceEx(Info &oclinfo, void *ctx, void *qu, int devnum = 0);
 
         //////////////////////////////// Error handling ////////////////////////
         CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);
@@ -125,6 +125,24 @@ namespace cv
             Impl *impl;
         };
 
+        //! Calls a kernel given its program source string(s). Pass globalThreads = NULL and cleanUp = true to clean up without executing.
+        CV_EXPORTS double openCLExecuteKernelInterop(Context *clCxt ,
+                                                        const char **source, string kernelName,
+                                                        size_t globalThreads[3], size_t localThreads[3],
+                                                        std::vector< std::pair<size_t, const void *> > &args,
+                                                        int channels, int depth, const char *build_options,
+                                                        bool finish = true, bool measureKernelTime = false,
+                                                        bool cleanUp = true);
+
+        //! Calls a kernel given its program source file name(s). Pass globalThreads = NULL and cleanUp = true to clean up without executing.
+        CV_EXPORTS double openCLExecuteKernelInterop(Context *clCxt ,
+                                                        const char **fileName, const int numFiles, string kernelName,
+                                                        size_t globalThreads[3], size_t localThreads[3],
+                                                        std::vector< std::pair<size_t, const void *> > &args,
+                                                        int channels, int depth, const char *build_options,
+                                                        bool finish = true, bool measureKernelTime = false,
+                                                        bool cleanUp = true);
+
         class CV_EXPORTS oclMatExpr;
         //////////////////////////////// oclMat ////////////////////////////////
         class CV_EXPORTS oclMat
@@ -469,21 +487,22 @@ namespace cv
         CV_EXPORTS void bitwise_xor(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
 
         //! Logical operators
-        CV_EXPORTS oclMatExpr operator ~ (const oclMat &src);
-        CV_EXPORTS oclMatExpr operator | (const oclMat &src1, const oclMat &src2);
-        CV_EXPORTS oclMatExpr operator & (const oclMat &src1, const oclMat &src2);
-        CV_EXPORTS oclMatExpr operator ^ (const oclMat &src1, const oclMat &src2);
+        CV_EXPORTS oclMat operator ~ (const oclMat &);
+        CV_EXPORTS oclMat operator | (const oclMat &, const oclMat &);
+        CV_EXPORTS oclMat operator & (const oclMat &, const oclMat &);
+        CV_EXPORTS oclMat operator ^ (const oclMat &, const oclMat &);
+
 
         //! Mathematics operators
         CV_EXPORTS oclMatExpr operator + (const oclMat &src1, const oclMat &src2);
         CV_EXPORTS oclMatExpr operator - (const oclMat &src1, const oclMat &src2);
         CV_EXPORTS oclMatExpr operator * (const oclMat &src1, const oclMat &src2);
         CV_EXPORTS oclMatExpr operator / (const oclMat &src1, const oclMat &src2);
- 
+
         //! computes convolution of two images
         //! support only CV_32FC1 type
         CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result);
- 
+
         CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code , int dcn = 0);
 
         //////////////////////////////// Filter Engine ////////////////////////////////
diff --git a/modules/ocl/perf/perf_gemm.cpp b/modules/ocl/perf/perf_gemm.cpp
index 7801c140be..c3dcab34fe 100644
--- a/modules/ocl/perf/perf_gemm.cpp
+++ b/modules/ocl/perf/perf_gemm.cpp
@@ -109,5 +109,5 @@ TEST_P(Gemm, Performance)
 INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine(
                             testing::Values(CV_32FC1, CV_32FC2/* , CV_64FC1, CV_64FC2*/),
                             testing::Values(cv::Size(512, 512), cv::Size(1024, 1024)),
-                            testing::Values(0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_1_T + cv::GEMM_2_T)));
+                            testing::Values(0, (int)cv::GEMM_1_T, (int)cv::GEMM_2_T, (int)(cv::GEMM_1_T + cv::GEMM_2_T))));
 #endif
\ No newline at end of file
diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index 099d07980d..c4eb0041d9 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -2125,22 +2125,22 @@ void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, c
         bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_xor_scalar);
 }
 
-oclMatExpr cv::ocl::operator ~ (const oclMat &src)
+oclMat cv::ocl::operator ~ (const oclMat &src)
 {
     return oclMatExpr(src, oclMat(), MAT_NOT);
 }
 
-oclMatExpr cv::ocl::operator | (const oclMat &src1, const oclMat &src2)
+oclMat cv::ocl::operator | (const oclMat &src1, const oclMat &src2)
 {
     return oclMatExpr(src1, src2, MAT_OR);
 }
 
-oclMatExpr cv::ocl::operator & (const oclMat &src1, const oclMat &src2)
+oclMat cv::ocl::operator & (const oclMat &src1, const oclMat &src2)
 {
     return oclMatExpr(src1, src2, MAT_AND);
 }
 
-oclMatExpr cv::ocl::operator ^ (const oclMat &src1, const oclMat &src2)
+oclMat cv::ocl::operator ^ (const oclMat &src1, const oclMat &src2)
 {
     return oclMatExpr(src1, src2, MAT_XOR);
 }
diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp
index a93f86ecb9..f7d0c43948 100644
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
@@ -23,6 +24,7 @@
 //    Zhang Ying, zhangying913@gmail.com
 //    Xu Pang, pangxu010@163.com
 //    Wu Zailong, bullet@yeah.net
+//    Wenju He, wenju@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -1524,7 +1526,7 @@ namespace cv
             mat_dst.create(mat_src.rows, mat_src.cols, CV_8UC1);
 
             oclMat mat_hist(1, 256, CV_32SC1);
-            //mat_hist.setTo(0);
+
             calcHist(mat_src, mat_hist);
 
             Context *clCxt = mat_src.clCxt;
@@ -1533,10 +1535,10 @@ namespace cv
             size_t globalThreads[3] = { 256, 1, 1};
             oclMat lut(1, 256, CV_8UC1);
             vector<pair<size_t , const void *> > args;
-            float scale = 255.f / (mat_src.rows * mat_src.cols);
+            int total = mat_src.rows * mat_src.cols;
             args.push_back( make_pair( sizeof(cl_mem), (void *)&lut.data));
             args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_hist.data));
-            args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
+            args.push_back( make_pair( sizeof(int), (void *)&total));
             openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1);
             LUT(mat_src, lut, mat_dst);
         }
diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp
index 643626bce1..eba92a7dec 100644
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -47,6 +47,7 @@
 
 #include "precomp.hpp"
 #include <iomanip>
+#include <fstream>
 #include "binarycaching.hpp"
 
 using namespace cv;
@@ -730,7 +731,138 @@ namespace cv
 #endif
         }
 
-        cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
+       double openCLExecuteKernelInterop(Context *clCxt , const char **source, string kernelName,
+                                 size_t globalThreads[3], size_t localThreads[3],
+                                 vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
+                                 bool finish, bool measureKernelTime, bool cleanUp)
+        // Fetches the kernel via openCLGetKernelFromSource, optionally launches it, and returns kernelTime (0.0 unless measured).
+        {
+            // Construct the kernel name.
+            // The rule is functionName_Cn_Dn: C is the channel count, D is the data type depth, n is an integer.
+            // For example, split_C2_D2 is the split kernel with channels = 2 and depth = 2; a value of -1 omits that suffix.
+            stringstream idxStr;
+            if(channels != -1)
+                idxStr << "_C" << channels;
+            if(depth != -1)
+                idxStr << "_D" << depth;
+            kernelName += idxStr.str();
+
+            cl_kernel kernel;
+            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
+
+            double kernelTime = 0.0;
+            // A NULL globalThreads skips the launch; the kernel is still fetched above and released below when cleanUp is set.
+            if( globalThreads != NULL)
+            {
+                if ( localThreads != NULL)
+                {
+                    globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
+                    globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
+                    globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
+                    // NOTE: the caller's globalThreads array was just rounded up in place to a multiple of localThreads.
+                    //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
+                    cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
+                }
+                for(size_t i = 0; i < args.size(); i ++)
+                    openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
+
+                if(measureKernelTime == false)
+                {
+                    openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
+                                    localThreads, 0, NULL, NULL));
+                }
+                else
+                {
+                    cl_event event = NULL;
+                    openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
+                                    localThreads, 0, NULL, &event));
+
+                    cl_ulong end_time, queue_time;
+
+                    openCLSafeCall(clWaitForEvents(1, &event));
+
+                    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
+                                    sizeof(cl_ulong), &end_time, 0));
+
+                    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
+                                    sizeof(cl_ulong), &queue_time, 0));
+                    // Queued-to-end interval; OpenCL profiling counters are in nanoseconds, so this is milliseconds.
+                    kernelTime = (double)(end_time - queue_time) / (1000 * 1000);
+
+                    clReleaseEvent(event);
+                }
+            }
+
+            if(finish)
+            {
+                clFinish(clCxt->impl->clCmdQueue);
+            }
+
+            if(cleanUp)
+            {
+                openCLSafeCall(clReleaseKernel(kernel));
+            }
+
+            return kernelTime;
+        }
+
+        // Reads the whole file 'filename' (opened in binary mode) into 's'.
+        // Returns 0 on success and -1 if the file cannot be opened, sized or read;
+        // 's' is left untouched on failure.
+        static int convertToString(const char *filename, std::string& s)
+        {
+            std::ifstream f(filename, std::ios::in | std::ios::binary);
+            if(!f.is_open())
+            {
+                printf("Error: Failed to open file %s\n", filename);
+                return -1;
+            }
+
+            // Determine the file size; tellg() reports -1 on failure, which must not
+            // be cast to size_t and used as an allocation size.
+            f.seekg(0, std::ios::end);
+            const std::streamoff fileSize = f.tellg();
+            f.seekg(0, std::ios::beg);
+            if(!f || fileSize < 0)
+                return -1;
+
+            // Read straight into a std::string so the buffer is released even if an
+            // exception is thrown (the old new[]/delete[] pair was not exception safe,
+            // and its null check was dead code because operator new throws).
+            std::string contents((size_t)fileSize, '\0');
+            if(fileSize > 0 && !f.read(&contents[0], (std::streamsize)fileSize))
+                return -1;
+
+            // Keep the historical behaviour of truncating at the first NUL byte
+            // (the original assigned from a C string).
+            s = contents.c_str();
+            return 0;
+        }
+
+        double openCLExecuteKernelInterop(Context *clCxt , const char **fileName, const int numFiles, string kernelName,
+                                 size_t globalThreads[3], size_t localThreads[3],
+                                 vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
+                                 bool finish, bool measureKernelTime, bool cleanUp)
+        {
+            // File-based front end: reads the numFiles program files named in fileName and forwards
+            // their contents to the source-based overload; returns that overload's result.
+            if (fileName == NULL || numFiles <= 0)
+                CV_Error(CV_StsBadArg, "openCLExecuteKernelInterop: no OpenCL program files were given");
+            // Every file must load: the original skipped unreadable files and then indexed
+            // fsource[0..numFiles), reading past the end of the shorter vector.
+            std::vector<std::string> fsource(numFiles);
+            for (int i = 0 ; i < numFiles ; i++)
+                if (convertToString(fileName[i], fsource[i]) < 0)
+                    CV_Error(CV_StsError, "openCLExecuteKernelInterop: failed to read an OpenCL program file");
+            // A vector owns the pointer table, so it is released even if the launch throws.
+            std::vector<const char *> source(fsource.size());
+            for (size_t i = 0 ; i < source.size() ; i++)
+                source[i] = fsource[i].c_str();
+            return openCLExecuteKernelInterop(clCxt, &source[0], kernelName, globalThreads, localThreads,
+                                 args, channels, depth, build_options, finish, measureKernelTime, cleanUp);
+        }
+
+       cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
                              const size_t size)
         {
             int status;
diff --git a/modules/ocl/src/kernels/imgproc_histogram.cl b/modules/ocl/src/kernels/imgproc_histogram.cl
index 11db9b5138..01e333fbc1 100644
--- a/modules/ocl/src/kernels/imgproc_histogram.cl
+++ b/modules/ocl/src/kernels/imgproc_histogram.cl
@@ -3,12 +3,14 @@
 //
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Niko Li, newlife20080214@gmail.com
 //    Jia Haipeng, jiahaipeng95@gmail.com
 //    Xu Pang, pangxu010@163.com
+//    Wenju He, wenju@multicorewareinc.com
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -189,24 +191,27 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global
 __kernel __attribute__((reqd_work_group_size(256,1,1)))void calLUT(
                             __global uchar * dst,
                             __constant int * hist,
-                            float scale)
+                            int total)
 {
     int lid = get_local_id(0);
-    __local int sumhist[HISTOGRAM256_BIN_COUNT];
-    //__local uchar lut[HISTOGRAM256_BIN_COUNT+1];
+    __local int sumhist[HISTOGRAM256_BIN_COUNT+1];
 
     sumhist[lid]=hist[lid];
     barrier(CLK_LOCAL_MEM_FENCE);
     if(lid==0)
     {
         int sum = 0;
-        for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
+        int i = 0;
+        while (!sumhist[i]) ++i;
+        sumhist[HISTOGRAM256_BIN_COUNT] = sumhist[i];
+        for(sumhist[i++] = 0; i<HISTOGRAM256_BIN_COUNT; i++)
         {
             sum+=sumhist[i];
             sumhist[i]=sum;
         }
     }
     barrier(CLK_LOCAL_MEM_FENCE);
+    float scale = 255.f/(total - sumhist[HISTOGRAM256_BIN_COUNT]);
     dst[lid]= lid == 0 ? 0 : convert_uchar_sat(convert_float(sumhist[lid])*scale);
 }
 /*
diff --git a/modules/ocl/src/kernels/meanShift.cl b/modules/ocl/src/kernels/meanShift.cl
index bb68331c7b..4b5a08b352 100644
--- a/modules/ocl/src/kernels/meanShift.cl
+++ b/modules/ocl/src/kernels/meanShift.cl
@@ -12,11 +12,13 @@
 //
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Shengen Yan,yanshengen@gmail.com
 //    Xu Pang, pangxu010@163.com
+//    Wenju He, wenju@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -43,12 +45,6 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-typedef double F;
-#else
-typedef float F;
-#endif
 
 short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
                __global uchar4* in, int in_step, int dst_off, int src_off,
@@ -184,12 +180,11 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
         if( count == 0 )
             break;
 
-        F  icount = 1.0/count;
-        int x1 = convert_int_rtz(sx*icount);
-        int y1 = convert_int_rtz(sy*icount);
-        s.x = convert_int_rtz(s.x*icount);
-        s.y = convert_int_rtz(s.y*icount);
-        s.z = convert_int_rtz(s.z*icount);
+        int x1 = sx/count;
+        int y1 = sy/count;
+        s.x = s.x/count;
+        s.y = s.y/count;
+        s.z = s.z/count;
 
         int4 tmp = s - convert_int4(c);
         int norm2 = tmp.x * tmp.x + tmp.y *  tmp.y +
diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp
index b6c99e1502..32bb8f678c 100644
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -46,6 +46,9 @@
 #include "mcwutil.hpp"
 
 #if defined (HAVE_OPENCL)
+#ifndef CL_VERSION_1_2
+#define CL_VERSION_1_2 0
+#endif
 
 using namespace std;
 
@@ -123,6 +126,101 @@ namespace cv
             openCLExecuteKernel_2(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
                                   build_options, finish_mode);
         }
+
+        // Builds a mat.cols x mat.rows OpenCL 2D image and copies the contents of `mat` into it.
+        // The caller owns the returned image and must hand it to releaseTexture(). Unsupported
+        // depths or channel counts throw std::exception; OpenCL failures go to openCLSafeCall.
+        cl_mem bindTexture(const oclMat &mat)
+        {
+            cl_context       context = mat.clCxt->impl->clContext;
+            cl_command_queue queue   = mat.clCxt->impl->clCmdQueue;
+            cl_mem texture;
+            cl_image_format format;
+            int err;
+            // Use oclchannels(), the channel count the pyrlk call sites previously supplied.
+            const int channels = mat.oclchannels();
+            switch(mat.depth())
+            {
+            case CV_8U:
+                format.image_channel_data_type = CL_UNSIGNED_INT8;
+                break;
+            case CV_32S:
+                format.image_channel_data_type = CL_UNSIGNED_INT32;
+                break;
+            case CV_32F:
+                format.image_channel_data_type = CL_FLOAT;
+                break;
+            default:
+                throw std::exception();
+            }
+            switch(channels)
+            {
+            case 1:
+                format.image_channel_order     = CL_R;
+                break;
+            case 3:
+                format.image_channel_order     = CL_RGB;
+                break;
+            case 4:
+                format.image_channel_order     = CL_RGBA;
+                break;
+            default:
+                throw std::exception();
+            }
+#if CL_VERSION_1_2
+            cl_image_desc desc;
+            desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
+            desc.image_width      = mat.cols;
+            desc.image_height     = mat.rows;
+            desc.image_depth      = 0;
+            desc.image_array_size = 1;
+            desc.image_row_pitch  = 0;
+            desc.image_slice_pitch = 0;
+            desc.buffer           = NULL;
+            desc.num_mip_levels   = 0;
+            desc.num_samples      = 0;
+            texture = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
+#else
+            texture = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, mat.cols, mat.rows, 0, NULL, &err);
+#endif
+            // Stop here if the image was not created instead of enqueueing copies into it.
+            openCLSafeCall(err);
+
+            const size_t rowBytes = mat.cols * mat.elemSize();
+            const bool   padded   = rowBytes != mat.step;
+            size_t origin[] = { 0, 0, 0 };
+            size_t region[] = { (size_t)mat.cols, (size_t)mat.rows, 1 };
+            cl_mem devData = (cl_mem)mat.data;
+            if (padded)
+            {
+                // Rows are padded: repack them without gaps into a scratch buffer first, since
+                // clEnqueueCopyBufferToImage takes no source row pitch.
+                devData = clCreateBuffer(context, CL_MEM_READ_ONLY, rowBytes * mat.rows, NULL, &err);
+                if (err == CL_SUCCESS)
+                {
+                    const size_t packed[3] = { rowBytes, (size_t)mat.rows, 1 };
+                    err = clEnqueueCopyBufferRect(queue, (cl_mem)mat.data, devData, origin, origin,
+                                                  packed, mat.step, 0, rowBytes, 0, 0, NULL, NULL);
+                }
+            }
+            if (err == CL_SUCCESS)
+                err = clEnqueueCopyBufferToImage(queue, devData, texture, 0, origin, region, 0, NULL, NULL);
+            if (padded && devData)
+            {
+                // Let the queued copies finish before the scratch buffer is released.
+                clFinish(queue);
+                clReleaseMemObject(devData);
+            }
+            if (err != CL_SUCCESS)
+                clReleaseMemObject(texture); // do not leak the image when the upload failed
+            openCLSafeCall(err);
+            return texture;
+        }
+
+        void releaseTexture(cl_mem& texture) // counterpart of bindTexture(): hands the image to openCLFree
+        {
+            openCLFree(texture);
+        }
     }//namespace ocl
 
 }//namespace cv
diff --git a/modules/ocl/src/mcwutil.hpp b/modules/ocl/src/mcwutil.hpp
index fe2b49ab51..8db61f1633 100644
--- a/modules/ocl/src/mcwutil.hpp
+++ b/modules/ocl/src/mcwutil.hpp
@@ -67,6 +67,12 @@ namespace cv
         void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
                                   size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
                                   int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
+        // Bind an oclMat to an OpenCL image (texture).
+        // note:
+        //   1. there is no memory management: the caller must release the image explicitly with releaseTexture()
+        //   2. for faster clamping, the constructed texture has no buffer padding
+        cl_mem bindTexture(const oclMat &mat);
+        void releaseTexture(cl_mem& texture);
     }//namespace ocl
 
 }//namespace cv
diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp
index d14201de1c..87c536ec94 100644
--- a/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
@@ -10,10 +10,15 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
+// @Authors
+//		Dachuan Zhao, dachuan@multicorewareinc.com
+//		Yao Wang, yao@multicorewareinc.com
+//      Nathan, liujun@multicorewareinc.com
+//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -22,13 +27,13 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other GpuMaterials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
-// any express or bpied warranties, including, but not limited to, the bpied
+// any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
@@ -40,6 +45,7 @@
 //
 //M*/
 
+
 #include "precomp.hpp"
 #include "mcwutil.hpp"
 using namespace std;
@@ -568,197 +574,16 @@ static void pyrDown_cus(const oclMat &src, oclMat &dst)
     pyrdown_run_cus(src, dst);
 }
 
-
-//struct MultiplyScalar
-//{
-//    MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {}
-//    double operator ()(double a) const
-//    {
-//        return (scale * a * val);
-//    }
-//    const double val;
-//    const double scale;
-//};
-//
-//void callF(const oclMat& src, oclMat& dst, MultiplyScalar op, int mask)
-//{
-//	Mat srcTemp;
-//	Mat dstTemp;
-//	src.download(srcTemp);
-//	dst.download(dstTemp);
-//
-//	int i;
-//	int j;
-//	int k;
-//	for(i = 0; i < srcTemp.rows; i++)
-//	{
-//		for(j = 0; j < srcTemp.cols; j++)
-//		{
-//			for(k = 0; k < srcTemp.channels(); k++)
-//			{
-//				((float*)dstTemp.data)[srcTemp.channels() * (i * srcTemp.rows + j) + k] = (float)op(((float*)srcTemp.data)[srcTemp.channels() * (i * srcTemp.rows + j) + k]);
-//			}
-//		}
-//	}
-//
-//	dst = dstTemp;
-//}
-//
-//static inline bool isAligned(const unsigned char* ptr, size_t size)
-//{
-//    return reinterpret_cast<size_t>(ptr) % size == 0;
-//}
-//
-//static inline bool isAligned(size_t step, size_t size)
-//{
-//    return step % size == 0;
-//}
-//
-//void callT(const oclMat& src, oclMat& dst, MultiplyScalar op, int mask)
-//{
-//    if (!isAligned(src.data, 4 * sizeof(double)) || !isAligned(src.step, 4 * sizeof(double)) ||
-//        !isAligned(dst.data, 4 * sizeof(double)) || !isAligned(dst.step, 4 * sizeof(double)))
-//    {
-//        callF(src, dst, op, mask);
-//        return;
-//    }
-//
-//	Mat srcTemp;
-//	Mat dstTemp;
-//	src.download(srcTemp);
-//	dst.download(dstTemp);
-//
-//	int x_shifted;
-//
-//	int i;
-//	int j;
-//	for(i = 0; i < srcTemp.rows; i++)
-//	{
-//		const double* srcRow = (const double*)srcTemp.data + i * srcTemp.rows;
-//        double* dstRow = (double*)dstTemp.data + i * dstTemp.rows;;
-//
-//		for(j = 0; j < srcTemp.cols; j++)
-//		{
-//			x_shifted = j * 4;
-//
-//			if(x_shifted + 4 - 1 < srcTemp.cols)
-//			{
-//				dstRow[x_shifted    ] = op(srcRow[x_shifted    ]);
-//				dstRow[x_shifted + 1] = op(srcRow[x_shifted + 1]);
-//				dstRow[x_shifted + 2] = op(srcRow[x_shifted + 2]);
-//				dstRow[x_shifted + 3] = op(srcRow[x_shifted + 3]);
-//			}
-//			else
-//			{
-//				for (int real_x = x_shifted; real_x < srcTemp.cols; ++real_x)
-//				{
-//					((float*)dstTemp.data)[i * srcTemp.rows + real_x] = op(((float*)srcTemp.data)[i * srcTemp.rows + real_x]);
-//				}
-//			}
-//		}
-//	}
-//}
-//
-//void multiply(const oclMat& src1, double val, oclMat& dst, double scale = 1.0f);
-//void multiply(const oclMat& src1, double val, oclMat& dst, double scale)
-//{
-//    MultiplyScalar op(val, scale);
-//	//if(src1.channels() == 1 && dst.channels() == 1)
-//	//{
-//	//    callT(src1, dst, op, 0);
-//	//}
-//	//else
-//	//{
-//	    callF(src1, dst, op, 0);
-//	//}
-//}
-
-static cl_mem bindTexture(const oclMat &mat, int depth, int channels)
-{
-    cl_mem texture;
-    cl_image_format format;
-    int err;
-    if(depth == 0)
-    {
-        format.image_channel_data_type = CL_UNSIGNED_INT8;
-    }
-    else if(depth == 5)
-    {
-        format.image_channel_data_type = CL_FLOAT;
-    }
-    if(channels == 1)
-    {
-        format.image_channel_order     = CL_R;
-    }
-    else if(channels == 3)
-    {
-        format.image_channel_order     = CL_RGB;
-    }
-    else if(channels == 4)
-    {
-        format.image_channel_order     = CL_RGBA;
-    }
-#ifdef CL_VERSION_1_2
-    cl_image_desc desc;
-    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
-    desc.image_width      = mat.step / mat.elemSize();
-    desc.image_height     = mat.rows;
-    desc.image_depth      = 0;
-    desc.image_array_size = 1;
-    desc.image_row_pitch  = 0;
-    desc.image_slice_pitch = 0;
-    desc.buffer           = NULL;
-    desc.num_mip_levels   = 0;
-    desc.num_samples      = 0;
-    texture = clCreateImage(mat.clCxt->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
-#else
-    texture = clCreateImage2D(
-                  mat.clCxt->impl->clContext,
-                  CL_MEM_READ_WRITE,
-                  &format,
-                  mat.step / mat.elemSize(),
-                  mat.rows,
-                  0,
-                  NULL,
-                  &err);
-#endif
-    size_t origin[] = { 0, 0, 0 };
-    size_t region[] = { mat.step / mat.elemSize(), mat.rows, 1 };
-    clEnqueueCopyBufferToImage(mat.clCxt->impl->clCmdQueue, (cl_mem)mat.data, texture, 0, origin, region, 0, NULL, 0);
-    openCLSafeCall(err);
-
-    return texture;
-}
-
-static void releaseTexture(cl_mem texture)
-{
-    openCLFree(texture);
-}
-
 static void lkSparse_run(oclMat &I, oclMat &J,
                   const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount,
                   int level, /*dim3 block, */dim3 patch, Size winSize, int iters)
 {
     Context  *clCxt = I.clCxt;
-    char platform[256] = {0};
-    cl_platform_id pid;
-    clGetDeviceInfo(clCxt->impl->devices, CL_DEVICE_PLATFORM, sizeof(pid), &pid, NULL);
-    clGetPlatformInfo(pid, CL_PLATFORM_NAME, 256, platform, NULL);
-    std::string namestr = platform;
-    bool isImageSupported = true;
-    if(namestr.find("NVIDIA")!=string::npos || namestr.find("Intel")!=string::npos)
-        isImageSupported = false;
-
     int elemCntPerRow = I.step / I.elemSize();
-
     string kernelName = "lkSparse";
-
-
-    size_t localThreads[3]  = { 8, isImageSupported?8:32, 1 };
-    size_t globalThreads[3] = { 8 * ptcount, isImageSupported?8:32, 1};
-
+    size_t localThreads[3]  = { 8, 8, 1 };
+    size_t globalThreads[3] = { 8 * ptcount, 8, 1};
     int cn = I.oclchannels();
-
     char calcErr;
     if (level == 0)
     {
@@ -770,22 +595,11 @@ static void lkSparse_run(oclMat &I, oclMat &J,
     }
 
     vector<pair<size_t , const void *> > args;
-    cl_mem ITex;
-    cl_mem JTex;
-    if (isImageSupported)
-    {
-        ITex = bindTexture(I, I.depth(), cn);
-        JTex = bindTexture(J, J.depth(), cn);
-    }
-    else
-    {
-        ITex = (cl_mem)I.data;
-        JTex = (cl_mem)J.data;
-    }
+    cl_mem ITex = bindTexture(I);
+    cl_mem JTex = bindTexture(J);
 
     args.push_back( make_pair( sizeof(cl_mem), (void *)&ITex ));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&JTex ));
-    //cl_mem clmD = clCreateBuffer(clCxt, CL_MEM_READ_WRITE, ptcount * sizeof(float), NULL, NULL);
     args.push_back( make_pair( sizeof(cl_mem), (void *)&prevPts.data ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&prevPts.step ));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&nextPts.data ));
@@ -795,10 +609,6 @@ static void lkSparse_run(oclMat &I, oclMat &J,
     args.push_back( make_pair( sizeof(cl_int), (void *)&level ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&I.rows ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols ));
-    if (!isImageSupported)
-    {
-        args.push_back( make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
-    }
     args.push_back( make_pair( sizeof(cl_int), (void *)&patch.x ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&patch.y ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&cn ));
@@ -806,18 +616,20 @@ static void lkSparse_run(oclMat &I, oclMat &J,
     args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.height ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&iters ));
     args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr ));
-    //args.push_back( make_pair( sizeof(cl_char), (void *)&GET_MIN_EIGENVALS ));
 
-    if (isImageSupported)
+    try
     {
         openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
-
-        releaseTexture(ITex);
-        releaseTexture(JTex);
     }
-    else
+    catch(Exception&)
     {
-        //printf("Warning: The image2d_t is not supported by the device. Using alternative method!\n");
+        printf("Warning: The image2d_t is not supported by the device. Using alternative method!\n");
+        releaseTexture(ITex);
+        releaseTexture(JTex);
+        ITex = (cl_mem)I.data;
+        JTex = (cl_mem)J.data;
+        localThreads[1] = globalThreads[1] = 32;
+        args.insert( args.begin()+11, make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
         openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
     }
 }
@@ -927,8 +739,6 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v,
     size_t localThreads[3]  = { 16, 16, 1 };
     size_t globalThreads[3] = { I.cols, I.rows, 1};
 
-    int cn = I.oclchannels();
-
     bool calcErr;
     if (err)
     {
@@ -944,8 +754,8 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v,
 
     if (isImageSupported)
     {
-        ITex = bindTexture(I, I.depth(), cn);
-        JTex = bindTexture(J, J.depth(), cn);
+        ITex = bindTexture(I);
+        JTex = bindTexture(J);
     }
     else
     {
diff --git a/modules/ocl/test/test_gemm.cpp b/modules/ocl/test/test_gemm.cpp
index 4ec3337b2c..c26a8481f2 100644
--- a/modules/ocl/test/test_gemm.cpp
+++ b/modules/ocl/test/test_gemm.cpp
@@ -81,5 +81,5 @@ TEST_P(Gemm, Accuracy)
 INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine(
                             testing::Values(CV_32FC1, CV_32FC2/*, CV_64FC1, CV_64FC2*/),
                             testing::Values(cv::Size(20, 20), cv::Size(300, 300)),
-                            testing::Values(0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_1_T + cv::GEMM_2_T)));
+                            testing::Values(0, (int)cv::GEMM_1_T, (int)cv::GEMM_2_T, (int)(cv::GEMM_1_T + cv::GEMM_2_T))));
 #endif
diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp
index 97174ecbd3..5bf08c80e1 100644
--- a/modules/ocl/test/test_imgproc.cpp
+++ b/modules/ocl/test/test_imgproc.cpp
@@ -183,12 +183,11 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
         if( count == 0 )
             break;
 
-        double icount = 1.0 / count;
-        int x1 = cvFloor(sx * icount);
-        int y1 = cvFloor(sy * icount);
-        s0 = cvFloor(s0 * icount);
-        s1 = cvFloor(s1 * icount);
-        s2 = cvFloor(s2 * icount);
+        int x1 = sx / count;
+        int y1 = sy / count;
+        s0 = s0 / count;
+        s1 = s1 / count;
+        s2 = s2 / count;
 
         bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1 - x0) + abs(y1 - y0) +
                         tab[s0 - c0 + 255] + tab[s1 - c1 + 255] + tab[s2 - c2 + 255] <= eps);
@@ -1370,9 +1369,7 @@ TEST_P(meanShiftFiltering, Mat)
         gdst.download(cpu_gdst);
 
         char sss[1024];
-        char warning[300] = "Warning: If the selected device doesn't support double, a deviation will exist.\nIf the accuracy is acceptable, please ignore it.\n";
         sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty);
-        strcat(sss, warning);
         EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss);
 
     }
@@ -1398,9 +1395,7 @@ TEST_P(meanShiftProc, Mat)
         gdstCoor.download(cpu_gdstCoor);
 
         char sss[1024];
-        char warning[300] = "Warning: If the selected device doesn't support double, a deviation will exist.\nIf the accuracy is acceptable, please ignore it.\n";
         sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty);
-        strcat(sss, warning);
         EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss);
         EXPECT_MAT_NEAR(dstCoor, cpu_gdstCoor, 0.0, sss);
     }
diff --git a/modules/ts/src/ts_arrtest.cpp b/modules/ts/src/ts_arrtest.cpp
index 47dcbaf53a..ec3f18330d 100644
--- a/modules/ts/src/ts_arrtest.cpp
+++ b/modules/ts/src/ts_arrtest.cpp
@@ -296,37 +296,15 @@ int ArrayTest::validate_test_results( int test_case_idx )
         for( j = 0; j < sizei; j++ )
         {
             double err_level;
-            vector<int> idx;
-            double max_diff = 0;
             int code;
-            char msg[100];
 
             if( !test_array[i1][j] )
                 continue;
 
             err_level = get_success_error_level( test_case_idx, i0, (int)j );
-            code = cmpEps( test_mat[i0][j], test_mat[i1][j], &max_diff, err_level, &idx, element_wise_relative_error );
+            code = cmpEps2(ts, test_mat[i0][j], test_mat[i1][j], err_level, element_wise_relative_error, arr_names[i0]);
 
-            switch( code )
-            {
-            case -1:
-                sprintf( msg, "Too big difference (=%g)", max_diff );
-                code = TS::FAIL_BAD_ACCURACY;
-                break;
-            case -2:
-                strcpy( msg, "Invalid output" );
-                code = TS::FAIL_INVALID_OUTPUT;
-                break;
-            case -3:
-                strcpy( msg, "Invalid output in the reference array" );
-                code = TS::FAIL_INVALID_OUTPUT;
-                break;
-            default:
-                continue;
-            }
-            string idxstr = vec2str(", ", &idx[0], idx.size());
-
-            ts->printf( TS::LOG, "%s in %s array %d at (%s)", msg, arr_names[i0], j, idxstr.c_str() );
+            if (code == 0) continue;
 
             for( i0 = 0; i0 < (int)test_array.size(); i0++ )
             {
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index fbbe8ecc6e..62e16fee4d 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -1934,6 +1934,10 @@ int check( const Mat& a, double fmin, double fmax, vector<int>* _idx )
     return idx == 0 ? 0 : -1;
 }
 
+#define CMP_EPS_OK 0 // cmpEps result: no difference above the allowed error level was found
+#define CMP_EPS_BIG_DIFF -1 // cmpEps result: an element differs by more than the allowed error level
+#define CMP_EPS_INVALID_TEST_DATA -2 // there is NaN or Inf value in test data
+#define CMP_EPS_INVALID_REF_DATA -3 // there is NaN or Inf value in reference data
 
 // compares two arrays. max_diff is the maximum actual difference,
 // success_err_level is maximum allowed difference, idx is the index of the first
@@ -1946,7 +1950,7 @@ int cmpEps( const Mat& arr, const Mat& refarr, double* _realmaxdiff,
     CV_Assert( arr.type() == refarr.type() && arr.size == refarr.size );
 
     int ilevel = refarr.depth() <= CV_32S ? cvFloor(success_err_level) : 0;
-    int result = 0;
+    int result = CMP_EPS_OK;
 
     const Mat *arrays[]={&arr, &refarr, 0};
     Mat planes[2];
@@ -1998,13 +2002,13 @@ int cmpEps( const Mat& arr, const Mat& refarr, double* _realmaxdiff,
                     continue;
                 if( cvIsNaN(a_val) || cvIsInf(a_val) )
                 {
-                    result = -2;
+                    result = CMP_EPS_INVALID_TEST_DATA;
                     idx = startidx + j;
                     break;
                 }
                 if( cvIsNaN(b_val) || cvIsInf(b_val) )
                 {
-                    result = -3;
+                    result = CMP_EPS_INVALID_REF_DATA;
                     idx = startidx + j;
                     break;
                 }
@@ -2029,13 +2033,13 @@ int cmpEps( const Mat& arr, const Mat& refarr, double* _realmaxdiff,
                     continue;
                 if( cvIsNaN(a_val) || cvIsInf(a_val) )
                 {
-                    result = -2;
+                    result = CMP_EPS_INVALID_TEST_DATA;
                     idx = startidx + j;
                     break;
                 }
                 if( cvIsNaN(b_val) || cvIsInf(b_val) )
                 {
-                    result = -3;
+                    result = CMP_EPS_INVALID_REF_DATA;
                     idx = startidx + j;
                     break;
                 }
@@ -2051,7 +2055,7 @@ int cmpEps( const Mat& arr, const Mat& refarr, double* _realmaxdiff,
             break;
         default:
             assert(0);
-            return -1;
+            return CMP_EPS_BIG_DIFF;
         }
         if(_realmaxdiff)
             *_realmaxdiff = MAX(*_realmaxdiff, realmaxdiff);
@@ -2060,7 +2064,7 @@ int cmpEps( const Mat& arr, const Mat& refarr, double* _realmaxdiff,
     }
 
     if( result == 0 && idx != 0 )
-        result = -1;
+        result = CMP_EPS_BIG_DIFF;
 
     if( result < -1 && _realmaxdiff )
         *_realmaxdiff = exp(1000.);
@@ -2081,15 +2085,15 @@ int cmpEps2( TS* ts, const Mat& a, const Mat& b, double success_err_level,
 
     switch( code )
     {
-    case -1:
+    case CMP_EPS_BIG_DIFF:
         sprintf( msg, "%s: Too big difference (=%g)", desc, diff );
         code = TS::FAIL_BAD_ACCURACY;
         break;
-    case -2:
+    case CMP_EPS_INVALID_TEST_DATA:
         sprintf( msg, "%s: Invalid output", desc );
         code = TS::FAIL_INVALID_OUTPUT;
         break;
-    case -3:
+    case CMP_EPS_INVALID_REF_DATA:
         sprintf( msg, "%s: Invalid reference output", desc );
         code = TS::FAIL_INVALID_OUTPUT;
         break;
diff --git a/modules/video/perf/perf_tvl1optflow.cpp b/modules/video/perf/perf_tvl1optflow.cpp
index ad90915128..36f16d9943 100644
--- a/modules/video/perf/perf_tvl1optflow.cpp
+++ b/modules/video/perf/perf_tvl1optflow.cpp
@@ -13,7 +13,7 @@ pair<string, string> impair(const char* im1, const char* im2)
 
 PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1, testing::Values(impair("cv/optflow/RubberWhale1.png", "cv/optflow/RubberWhale2.png")))
 {
-    declare.time(40);
+    declare.time(260);
 
     Mat frame1 = imread(getDataPath(GetParam().first), IMREAD_GRAYSCALE);
     Mat frame2 = imread(getDataPath(GetParam().second), IMREAD_GRAYSCALE);
diff --git a/samples/gpu/morfology.cpp b/samples/gpu/morphology.cpp
similarity index 100%
rename from samples/gpu/morfology.cpp
rename to samples/gpu/morphology.cpp