Merge remote-tracking branch 'origin/2.4'

Conflicts: doc/tutorials/features2d/feature_detection/feature_detection.rst modules/bioinspired/doc/retina/index.rst modules/core/include/opencv2/core/core.hpp modules/core/include/opencv2/core/mat.hpp modules/core/include/opencv2/core/operations.hpp modules/core/src/stat.cpp modules/features2d/include/opencv2/features2d/features2d.hpp modules/imgproc/src/filter.cpp modules/legacy/src/dpstereo.cpp modules/nonfree/src/surf.ocl.cpp modules/ocl/doc/image_processing.rst modules/ocl/doc/object_detection.rst modules/ocl/include/opencv2/ocl/ocl.hpp modules/ocl/include/opencv2/ocl/private/util.hpp modules/ocl/src/arithm.cpp modules/ocl/src/canny.cpp modules/ocl/src/filtering.cpp modules/ocl/src/imgproc.cpp modules/ocl/src/initialization.cpp modules/ocl/src/matrix_operations.cpp modules/ocl/src/pyrdown.cpp modules/ocl/src/pyrup.cpp modules/ocl/src/split_merge.cpp modules/ocl/test/test_objdetect.cpp modules/ocl/test/utility.hpp
12 years ago · 81eb4bd8be
parent ada858e1c0 e35bc11504
commit 81eb4bd8be
142 changed files with 7748 additions and 13894 deletions
--- a/3rdparty/tbb/CMakeLists.txt
+++ b/3rdparty/tbb/CMakeLists.txt
@ -231,9 +231,11 @@ if(ENABLE_SOLUTION_FOLDERS)
  set_target_properties(tbb PROPERTIES FOLDER "3rdparty")
 endif()

-if(NOT BUILD_SHARED_LIBS)
-  install(TARGETS tbb ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
-endif()
+install(TARGETS tbb
+    RUNTIME DESTINATION bin COMPONENT main
+    LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main
+    ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main
+    )

 # get TBB version
 ocv_parse_header("${tbb_src_dir}/include/tbb/tbb_stddef.h" TBB_VERSION_LINES TBB_VERSION_MAJOR TBB_VERSION_MINOR TBB_INTERFACE_VERSION CACHE)
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@ -522,11 +522,7 @@ macro(ocv_create_module)

  if((NOT DEFINED OPENCV_MODULE_TYPE AND BUILD_SHARED_LIBS)
      OR (DEFINED OPENCV_MODULE_TYPE AND OPENCV_MODULE_TYPE STREQUAL SHARED))
-    if(MSVC)
-      set_target_properties(${the_module} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
-    else()
-      add_definitions(-DCVAPI_EXPORTS)
-    endif()
+    set_target_properties(${the_module} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
  endif()

  if(MSVC)
--- a/cmake/OpenCVPCHSupport.cmake
+++ b/cmake/OpenCVPCHSupport.cmake
@ -25,11 +25,13 @@ IF(CMAKE_COMPILER_IS_GNUCXX)

    SET(_PCH_include_prefix "-I")
    SET(_PCH_isystem_prefix "-isystem")
+    SET(_PCH_define_prefix "-D")

 ELSEIF(CMAKE_GENERATOR MATCHES "^Visual.*$")
    SET(PCHSupport_FOUND TRUE)
    SET(_PCH_include_prefix "/I")
    SET(_PCH_isystem_prefix "/I")
+    SET(_PCH_define_prefix "/D")
 ELSE()
    SET(PCHSupport_FOUND FALSE)
 ENDIF()
@ -244,6 +246,14 @@ MACRO(ADD_PRECOMPILED_HEADER _targetName _input)

    _PCH_GET_COMPILE_FLAGS(_compile_FLAGS)

+    get_target_property(type ${_targetName} TYPE)
+    if(type STREQUAL "SHARED_LIBRARY")
+        get_target_property(__DEFINES ${_targetName} DEFINE_SYMBOL)
+        if(NOT __DEFINES MATCHES __DEFINES-NOTFOUND)
+            list(APPEND _compile_FLAGS "${_PCH_define_prefix}${__DEFINES}")
+        endif()
+    endif()
+
    #MESSAGE("_compile_FLAGS: ${_compile_FLAGS}")
    #message("COMMAND ${CMAKE_CXX_COMPILER}	${_compile_FLAGS} -x c++-header -o ${_output} ${_input}")

--- a/cmake/templates/cvconfig.h.cmake
+++ b/cmake/templates/cvconfig.h.cmake
@ -52,9 +52,6 @@
 /* IEEE1394 capturing support */
 #cmakedefine HAVE_DC1394

-/* libdc1394 0.9.4 or 0.9.5 */
-#cmakedefine HAVE_DC1394_095
-
 /* IEEE1394 capturing support - libdc1394 v2.x */
 #cmakedefine HAVE_DC1394_2

--- a/doc/tutorials/bioinspired/retina_model/retina_model.rst~
+++ b/doc/tutorials/bioinspired/retina_model/retina_model.rst~
@ -44,7 +44,7 @@ In the following image, applying the ideas proposed in [benoit2010]_, as your re
   :align: center


-*Note :* image sample can be downloaded from the `OpenEXR website <http://www.openexr.com>`_. Regarding this demonstration, before retina processing, input image has been linearly rescaled within 0-255 keeping its channels float format. 5% of its histogram ends has been cut (mostly removes wrong HDR pixels). Check out the sample *opencv/samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping.cpp* for similar processing. The following demonstration will only consider classical 8bit/channel images.
+*Note :* image sample can be downloaded from the `OpenEXR website <http://www.openexr.com>`_. Regarding this demonstration, before retina processing, input image has been linearly rescaled within 0-255 keeping its channels float format. 5% of its histogram ends has been cut (mostly removes wrong HDR pixels). Check out the sample *opencv/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp* for similar processing. The following demonstration will only consider classical 8bit/channel images.

 The retina model output channels
 ================================
--- a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst
+++ b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst
@ -21,7 +21,7 @@ OpenCV has been around since 2001. In those days the library was built around a

 Luckily C++ came around and introduced the concept of classes making easier for the user through automatic memory management (more or less). The good news is that C++ is fully compatible with C so no compatibility issues can arise from making the change. Therefore, OpenCV 2.0 introduced a new C++ interface which offered a new way of doing things which means you do not need to fiddle with memory management, making your code concise (less to write, to achieve more). The main downside of the C++ interface is that many embedded development systems at the moment support only C. Therefore, unless you are targeting embedded platforms, there's no point to using the *old* methods (unless you're a masochist programmer and you're asking for trouble).

-The first thing you need to know about *Mat* is that you no longer need to manually allocate its memory and release it as soon as you do not need it. While doing this is still a possibility, most of the OpenCV functions will allocate its output data manually. As a nice bonus if you pass on an already existing *Mat* object, which has already  allocated the required space for the matrix, this will be reused. In other words we use at all times only as much memory as we need to perform the task.
+The first thing you need to know about *Mat* is that you no longer need to manually allocate its memory and release it as soon as you do not need it. While doing this is still a possibility, most of the OpenCV functions will allocate its output data automatically. As a nice bonus if you pass on an already existing *Mat* object, which has already  allocated the required space for the matrix, this will be reused. In other words we use at all times only as much memory as we need to perform the task.

 *Mat* is basically a class with two data parts: the matrix header (containing information such as the size of the matrix, the method used for storing, at which address is the matrix stored, and so on) and a pointer to the matrix containing the pixel values (taking any dimensionality depending on the method chosen for storing) . The matrix header size is constant, however the size of the matrix itself may vary from image to image and usually is larger by orders of magnitude.

--- a/doc/tutorials/definitions/tocDefinitions.rst
+++ b/doc/tutorials/definitions/tocDefinitions.rst
@ -11,3 +11,4 @@
 .. |Author_EricCh| unicode:: Eric U+0020 Christiansen
 .. |Author_AndreyP| unicode:: Andrey U+0020 Pavlenko
 .. |Author_AlexS| unicode:: Alexander U+0020 Smorkalov
+.. |Author_BarisD| unicode:: Bar U+0131 U+015F U+0020 Evrim U+0020 Demir U+00F6 z
--- a/doc/tutorials/features2d/feature_detection/feature_detection.rst
+++ b/doc/tutorials/features2d/feature_detection/feature_detection.rst
@ -30,6 +30,7 @@ This tutorial code's is shown lines below. You can also download it from `here <
   #include <iostream>
   #include "opencv2/core.hpp"
   #include "opencv2/features2d.hpp"
+   #include "opencv2/nonfree/features2d.hpp"
   #include "opencv2/highgui.hpp"
   #include "opencv2/nonfree.hpp"

--- a/doc/tutorials/introduction/desktop_java/images/eclipse_main_class.png
+++ b/doc/tutorials/introduction/desktop_java/images/eclipse_main_class.png
--- a/doc/tutorials/introduction/desktop_java/images/eclipse_new_java_prj.png
+++ b/doc/tutorials/introduction/desktop_java/images/eclipse_new_java_prj.png
--- a/doc/tutorials/introduction/desktop_java/images/eclipse_run.png
+++ b/doc/tutorials/introduction/desktop_java/images/eclipse_run.png
--- a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib.png
+++ b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib.png
--- a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib2.png
+++ b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib2.png
--- a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib3.png
+++ b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib3.png
--- a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib4.png
+++ b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib4.png
--- a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib5.png
+++ b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib5.png
--- a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib6.png
+++ b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib6.png
--- a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib7.png
+++ b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib7.png
--- a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib8.png
+++ b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib8.png
--- a/doc/tutorials/introduction/desktop_java/java_dev_intro.rst
+++ b/doc/tutorials/introduction/desktop_java/java_dev_intro.rst
@ -7,10 +7,9 @@ Introduction to Java Development

 As of OpenCV 2.4.4, OpenCV supports desktop Java development using nearly the same interface as for
 Android development. This guide will help you to create your first Java (or Scala) application using OpenCV.
-We will use either `Eclipse <http://eclipse.org/>`_, `Apache Ant <http://ant.apache.org/>`_ or the
-`Simple Build Tool (SBT) <http://www.scala-sbt.org/>`_ to build the application.
+We will use either `Apache Ant <http://ant.apache.org/>`_ or `Simple Build Tool (SBT) <http://www.scala-sbt.org/>`_ to build the application.

-For further reading after this guide, look at the :ref:`Android_Dev_Intro` tutorials.
+If you want to use Eclipse, head to :ref:`Java_Eclipse`. For further reading after this guide, look at the :ref:`Android_Dev_Intro` tutorials.

 What we'll do in this guide
 ===========================
@ -19,7 +18,7 @@ In this guide, we will:

 * Get OpenCV with desktop Java support

-* Create an ``Ant``, ``Eclipse`` or ``SBT`` project
+* Create an ``Ant`` or ``SBT`` project

 * Write a simple OpenCV application in Java or Scala

@ -233,97 +232,6 @@ Java sample with Ant
        :alt: run app with Ant
        :align: center

-Java project in Eclipse
-=======================
-
-Now let's look at the possiblity of using OpenCV in Java when developing in Eclipse IDE.
-
-* Create a new Eclipse workspace
-* Create a new Java project via :guilabel:`File --> New --> Java Project`
-
-  .. image:: images/eclipse_new_java_prj.png
-     :alt: Eclipse: new Java project
-     :align: center
-
-  Call it say "HelloCV".
-
-* Open :guilabel:`Java Build Path` tab on :guilabel:`Project Properties` dialog
-  and configure additional library (OpenCV) reference (jar and native library location):
-
-  .. image:: images/eclipse_user_lib.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib2.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib3.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib4.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib5.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib6.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib7.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib8.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-
-* Add a new Java class (say ``Main``) containing the application entry:
-
-  .. image:: images/eclipse_main_class.png
-     :alt: Eclipse: Main class
-     :align: center
-
-* Put some simple OpenCV calls there, e.g.:
-
-  .. code-block:: java
-
-    import org.opencv.core.Core;
-    import org.opencv.core.CvType;
-    import org.opencv.core.Mat;
-
-    public class Main {
-        public static void main(String[] args) {
-            System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
-            Mat m  = Mat.eye(3, 3, CvType.CV_8UC1);
-            System.out.println("m = " + m.dump());
-        }
-    }
-
-* Press :guilabel:`Run` button and find the identity matrix content in the Eclipse ``Console`` window.
-
-  .. image:: images/eclipse_run.png
-     :alt: Eclipse: run
-     :align: center

 SBT project for Java and Scala
 ==============================
--- a/doc/tutorials/introduction/java_eclipse/images/1-window-preferences.png
+++ b/doc/tutorials/introduction/java_eclipse/images/1-window-preferences.png
--- a/doc/tutorials/introduction/java_eclipse/images/10-new-project-created.png
+++ b/doc/tutorials/introduction/java_eclipse/images/10-new-project-created.png
--- a/doc/tutorials/introduction/java_eclipse/images/11-the-code.png
+++ b/doc/tutorials/introduction/java_eclipse/images/11-the-code.png
--- a/doc/tutorials/introduction/java_eclipse/images/2-user-library-new.png
+++ b/doc/tutorials/introduction/java_eclipse/images/2-user-library-new.png
--- a/doc/tutorials/introduction/java_eclipse/images/3-library-name.png
+++ b/doc/tutorials/introduction/java_eclipse/images/3-library-name.png
--- a/doc/tutorials/introduction/java_eclipse/images/4-add-external-jars.png
+++ b/doc/tutorials/introduction/java_eclipse/images/4-add-external-jars.png
--- a/doc/tutorials/introduction/java_eclipse/images/5-native-library.png
+++ b/doc/tutorials/introduction/java_eclipse/images/5-native-library.png
--- a/doc/tutorials/introduction/java_eclipse/images/6-external-folder.png
+++ b/doc/tutorials/introduction/java_eclipse/images/6-external-folder.png
--- a/doc/tutorials/introduction/java_eclipse/images/7-user-library-final.png
+++ b/doc/tutorials/introduction/java_eclipse/images/7-user-library-final.png
--- a/doc/tutorials/introduction/java_eclipse/images/7_5-new-java-project.png
+++ b/doc/tutorials/introduction/java_eclipse/images/7_5-new-java-project.png
--- a/doc/tutorials/introduction/java_eclipse/images/8-add-library.png
+++ b/doc/tutorials/introduction/java_eclipse/images/8-add-library.png
--- a/doc/tutorials/introduction/java_eclipse/images/9-select-user-lib.png
+++ b/doc/tutorials/introduction/java_eclipse/images/9-select-user-lib.png
--- a/doc/tutorials/introduction/java_eclipse/java_eclipse.rst
+++ b/doc/tutorials/introduction/java_eclipse/java_eclipse.rst
@ -0,0 +1,110 @@
+
+.. _Java_Eclipse:
+
+
+Using OpenCV Java with Eclipse
+*********************************************
+
+Since version 2.4.4 `OpenCV supports Java <http://opencv.org/opencv-java-api.html>`_. In this tutorial I will explain how to set up a development environment for using OpenCV Java with Eclipse in **Windows**, so you can enjoy the benefits of a garbage-collected, very refactorable (rename variable, extract method and whatnot) modern language that enables you to write code with less effort and make fewer mistakes. Here we go.
+
+
+Configuring Eclipse
+===================
+
+First, obtain a fresh release of OpenCV `from the download page <http://opencv.org/downloads.html>`_ and extract it under a simple location like ``C:\OpenCV-2.4.6\``. I am using version 2.4.6, but the steps are more or less the same for other versions.
+
+Now, we will define OpenCV as a user library in Eclipse, so we can reuse the configuration for any project. Launch Eclipse and select :guilabel:`Window --> Preferences` from the menu.
+
+.. image:: images/1-window-preferences.png
+     :alt: Eclipse preferences
+     :align: center
+
+Navigate under :guilabel:`Java --> Build Path --> User Libraries` and click :guilabel:`New...`.
+
+.. image:: images/2-user-library-new.png
+     :alt: Creating a new library
+     :align: center
+
+Enter a name, e.g. ``OpenCV-2.4.6``, for your new library.
+
+.. image:: images/3-library-name.png
+     :alt: Naming the new library
+     :align: center
+
+Now select your new user library and click :guilabel:`Add External JARs...`.
+
+.. image:: images/4-add-external-jars.png
+     :alt: Adding external jar
+     :align: center
+
+Browse to ``C:\OpenCV-2.4.6\build\java\`` and select ``opencv-246.jar``. After adding the jar, expand the :guilabel:`opencv-246.jar` entry, select :guilabel:`Native library location` and press :guilabel:`Edit...`.
+
+.. image:: images/5-native-library.png
+     :alt: Selecting native library location 1
+     :align: center
+
+Select :guilabel:`External Folder...` and browse to select the folder ``C:\OpenCV-2.4.6\build\java\x64``. If you have a 32-bit system you need to select the ``x86`` folder instead of ``x64``.
+
+.. image:: images/6-external-folder.png
+     :alt: Selecting native library location 2
+     :align: center
+
+Your user library configuration should look like this:
+
+.. image:: images/7-user-library-final.png
+     :alt: Final user library configuration
+     :align: center
+
+
+Testing the configuration on a new Java project
+=====================================================
+
+Now start creating a new Java project.
+
+.. image:: images/7_5-new-java-project.png
+     :alt: Creating new Java project
+     :align: center
+
+On the :guilabel:`Java Settings` step, under :guilabel:`Libraries` tab, select :guilabel:`Add Library...` and select :guilabel:`OpenCV-2.4.6`, then click :guilabel:`Finish`.
+
+.. image:: images/8-add-library.png
+     :alt: Adding user defined library 1
+     :align: center
+
+.. image:: images/9-select-user-lib.png
+     :alt: Adding user defined library 2
+     :align: center
+
+
+Libraries should look like this:
+
+.. image:: images/10-new-project-created.png
+     :alt: Project libraries after adding the OpenCV user library
+     :align: center
+
+
+Now that you have created and configured a new Java project, it is time to test it. Create a new Java file. Here is some starter code for your convenience:
+
+.. code-block:: java
+
+   import org.opencv.core.Core;
+   import org.opencv.core.CvType;
+   import org.opencv.core.Mat;
+
+   public class Hello
+   {
+      public static void main( String[] args )
+      {
+         System.loadLibrary( Core.NATIVE_LIBRARY_NAME );
+         Mat mat = Mat.eye( 3, 3, CvType.CV_8UC1 );
+         System.out.println( "mat = " + mat.dump() );
+      }
+   }
+
+When you run the code you should see a 3x3 identity matrix as output.
+
+.. image:: images/11-the-code.png
+     :alt: The sample code
+     :align: center
+
+That is it, whenever you start a new project just add the OpenCV user library that you have defined to your project and you are good to go. Enjoy your powerful, less painful development environment :)
--- a/doc/tutorials/introduction/table_of_content_introduction/images/eclipse-logo.png
+++ b/doc/tutorials/introduction/table_of_content_introduction/images/eclipse-logo.png
--- a/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst
+++ b/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst
@ -138,6 +138,24 @@ world of the OpenCV.
                        :height: 90pt
                        :width:  90pt

+  .. tabularcolumns:: m{100pt} m{300pt}
+  .. cssclass:: toctableopencv
+
+  ================ =================================================
+  |EclipseLogo|    **Title:** :ref:`Java_Eclipse`
+
+                   *Compatibility:* > OpenCV 2.4.4
+
+                   *Author:* |Author_BarisD|
+
+                   A tutorial on how to use OpenCV Java with Eclipse.
+
+  ================ =================================================
+
+     .. |EclipseLogo| image:: images/eclipse-logo.png
+                        :height: 90pt
+                        :width:  90pt
+
 * **Android**

  .. tabularcolumns:: m{100pt} m{300pt}
@ -295,6 +313,7 @@ world of the OpenCV.
   ../windows_visual_studio_Opencv/windows_visual_studio_Opencv
   ../windows_visual_studio_image_watch/windows_visual_studio_image_watch
   ../desktop_java/java_dev_intro
+   ../java_eclipse/java_eclipse
   ../android_binary_package/android_dev_intro
   ../android_binary_package/O4A_SDK
   ../android_binary_package/dev_with_OCV_on_Android
--- a/modules/bioinspired/doc/retina/index.rst
+++ b/modules/bioinspired/doc/retina/index.rst
@ -110,8 +110,8 @@ Here is an overview of the abstract Retina interface, allocate one instance with

 .. Sample code::

-   * An example on retina tone mapping can be found at opencv_source_code/samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping.cpp
-   * An example on retina tone mapping on video input can be found at opencv_source_code/samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping.cpp
+   * An example on retina tone mapping can be found at opencv_source_code/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
+   * An example on retina tone mapping on video input can be found at opencv_source_code/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
   * A complete example illustrating the retina interface can be found at opencv_source_code/samples/cpp/retinaDemo.cpp

 Description
@ -182,13 +182,13 @@ Take a look at the provided C++ examples provided with OpenCV :
   **Note :** This demo generates the file *RetinaDefaultParameters.xml* which contains the default parameters of the retina. Then, rename this as *RetinaSpecificParameters.xml*, adjust the parameters the way you want and reload the program to check the effect.


-* **samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping.cpp** shows how to use the retina to perform High Dynamic Range (HDR) luminance compression
+* **samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp** shows how to use the retina to perform High Dynamic Range (HDR) luminance compression

   Then, take a HDR image using bracketing with your camera and generate an OpenEXR image and then process it using the demo.

   Typical use, supposing that you have the OpenEXR image such as *memorial.exr* (present in the samples/cpp/ folder)

-   **OpenCVReleaseFolder/bin/OpenEXRimages_HighDynamicRange_Retina_toneMapping memorial.exr [optionnal: 'fast']**
+   **OpenCVReleaseFolder/bin/OpenEXRimages_HDR_Retina_toneMapping memorial.exr [optional: 'fast']**

      Note that some sliders are made available to allow you to play with luminance compression.

--- a/modules/core/include/opencv2/core/core_c.h
+++ b/modules/core/include/opencv2/core/core_c.h
@ -1144,7 +1144,7 @@ CVAPI(void)   cvSetRemove( CvSet* set_header, int index );
   NULL is returned */
 CV_INLINE CvSetElem* cvGetSetElem( const CvSet* set_header, int idx )
 {
-    CvSetElem* elem = (CvSetElem*)cvGetSeqElem( (CvSeq*)set_header, idx );
+    CvSetElem* elem = (CvSetElem*)(void *)cvGetSeqElem( (CvSeq*)set_header, idx );
    return elem && CV_IS_SET_ELEM( elem ) ? elem : 0;
 }

--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@ -1734,13 +1734,13 @@ const _Tp& SparseMat::value(const Node* n) const
 inline
 SparseMat::Node* SparseMat::node(size_t nidx)
 {
-    return (Node*)&hdr->pool[nidx];
+    return (Node*)(void*)&hdr->pool[nidx];
 }

 inline
 const SparseMat::Node* SparseMat::node(size_t nidx) const
 {
-    return (const Node*)&hdr->pool[nidx];
+    return (const Node*)(const void*)&hdr->pool[nidx];
 }

 inline
@ -2488,7 +2488,7 @@ const _Tp& SparseMatConstIterator::value() const
 inline
 const SparseMat::Node* SparseMatConstIterator::node() const
 {
-    return (ptr && m && m->hdr) ? (const SparseMat::Node*)(ptr - m->hdr->valueOffset) : 0;
+    return (ptr && m && m->hdr) ? (const SparseMat::Node*)(const void*)(ptr - m->hdr->valueOffset) : 0;
 }

 inline
--- a/modules/core/include/opencv2/core/persistence.hpp
+++ b/modules/core/include/opencv2/core/persistence.hpp
@ -861,8 +861,8 @@ inline FileNode::operator String() const { String value; read(*this, value, valu
 inline FileNodeIterator FileNode::begin() const { return FileNodeIterator(fs, node); }
 inline FileNodeIterator FileNode::end() const   { return FileNodeIterator(fs, node, size()); }
 inline void FileNode::readRaw( const String& fmt, uchar* vec, size_t len ) const { begin().readRaw( fmt, vec, len ); }
-inline FileNode FileNodeIterator::operator *() const  { return FileNode(fs, (const CvFileNode*)reader.ptr); }
-inline FileNode FileNodeIterator::operator ->() const { return FileNode(fs, (const CvFileNode*)reader.ptr); }
+inline FileNode FileNodeIterator::operator *() const  { return FileNode(fs, (const CvFileNode*)(const void*)reader.ptr); }
+inline FileNode FileNodeIterator::operator ->() const { return FileNode(fs, (const CvFileNode*)(const void*)reader.ptr); }
 inline String::String(const FileNode& fn): cstr_(0), len_(0) { read(fn, *this, *this); }

 } // cv
--- a/modules/core/include/opencv2/core/types_c.h
+++ b/modules/core/include/opencv2/core/types_c.h
@ -523,11 +523,11 @@ CV_INLINE  double  cvmGet( const CvMat* mat, int row, int col )
            (unsigned)col < (unsigned)mat->cols );

    if( type == CV_32FC1 )
-        return ((float*)(mat->data.ptr + (size_t)mat->step*row))[col];
+        return ((float*)(void*)(mat->data.ptr + (size_t)mat->step*row))[col];
    else
    {
        assert( type == CV_64FC1 );
-        return ((double*)(mat->data.ptr + (size_t)mat->step*row))[col];
+        return ((double*)(void*)(mat->data.ptr + (size_t)mat->step*row))[col];
    }
 }

@ -540,11 +540,11 @@ CV_INLINE  void  cvmSet( CvMat* mat, int row, int col, double value )
            (unsigned)col < (unsigned)mat->cols );

    if( type == CV_32FC1 )
-        ((float*)(mat->data.ptr + (size_t)mat->step*row))[col] = (float)value;
+        ((float*)(void*)(mat->data.ptr + (size_t)mat->step*row))[col] = (float)value;
    else
    {
        assert( type == CV_64FC1 );
-        ((double*)(mat->data.ptr + (size_t)mat->step*row))[col] = (double)value;
+        ((double*)(void*)(mat->data.ptr + (size_t)mat->step*row))[col] = (double)value;
    }
 }

--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@ -238,6 +238,7 @@ template<typename _Tp> static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_
 */
 static inline size_t alignSize(size_t sz, int n)
 {
+    CV_DbgAssert((n & (n - 1)) == 0); // n is a power of 2
    return (sz + n-1) & -n;
 }

--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@ -50,7 +50,7 @@ namespace cv
 # pragma warning(disable: 4748)
 #endif

-#if defined HAVE_IPP && IPP_VERSION_MAJOR >= 7
+#if defined HAVE_IPP && IPP_VERSION_MAJOR*100 + IPP_VERSION_MINOR >= 701
 #define USE_IPP_DFT 1
 #else
 #undef USE_IPP_DFT
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@ -1610,7 +1610,8 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
    int depth = src.depth(), cn = src.channels();

    normType &= 7;
-    CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR ||
+    CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
+               normType == NORM_L2 || normType == NORM_L2SQR ||
               ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src.type() == CV_8U) );

 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
@ -1981,7 +1982,8 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
    CV_Assert( src1.size == src2.size && src1.type() == src2.type() );

    normType &= 7;
-    CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR ||
+    CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
+               normType == NORM_L2 || normType == NORM_L2SQR ||
              ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );

 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
--- a/modules/flann/include/opencv2/flann/lsh_table.h
+++ b/modules/flann/include/opencv2/flann/lsh_table.h
@ -384,7 +384,7 @@ inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) cons
 {
    // no need to check if T is dividable by sizeof(size_t) like in the Hamming
    // distance computation as we have a mask
-    const size_t* feature_block_ptr = reinterpret_cast<const size_t*> (feature);
+    const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature);

    // Figure out the subsignature of the feature
    // Given the feature ABCDEF, and the mask 001011, the output will be
--- a/modules/highgui/src/cap_dc1394.cpp
+++ b/modules/highgui/src/cap_dc1394.cpp
@ -296,11 +296,7 @@ static CvCaptureCAM_DC1394 * icvCaptureFromCAM_DC1394 (int index)
    if (pcap->format!=FORMAT_SCALABLE_IMAGE_SIZE) { // everything except Format 7
        if (dc1394_dma_setup_capture(pcap->handle, pcap->camera->node, index+1 /*channel*/,
                    pcap->format, pcap->mode, SPEED_400,
-                    pcap->frame_rate, NUM_BUFFERS,
-#ifdef HAVE_DC1394_095
-                    0 /*do_extra_buffering*/,
-#endif
-                    1 /*DROP_FRAMES*/,
+                    pcap->frame_rate, NUM_BUFFERS, 1 /*drop_frames*/,
                    pcap->device_name, pcap->camera) != DC1394_SUCCESS) {
            fprintf(stderr,"%s:%d: Failed to setup DMA capture with VIDEO1394\n",__FILE__,__LINE__);
            goto ERROR;
@ -311,11 +307,7 @@ static CvCaptureCAM_DC1394 * icvCaptureFromCAM_DC1394 (int index)
                    pcap->mode, SPEED_400, QUERY_FROM_CAMERA,
                    (unsigned int)QUERY_FROM_CAMERA, (unsigned int)QUERY_FROM_CAMERA,
                    (unsigned int)QUERY_FROM_CAMERA, (unsigned int)QUERY_FROM_CAMERA,
-                    NUM_BUFFERS,
-#ifdef HAVE_DC1394_095
-                    0 /*do_extra_buffering*/,
-#endif
-                    1 /*DROP_FRAMES*/,
+                    NUM_BUFFERS, 1 /*drop_frames*/,
                    pcap->device_name, pcap->camera) != DC1394_SUCCESS) {
            fprintf(stderr,"%s:%d: Failed to setup DMA capture with VIDEO1394\n",__FILE__,__LINE__);
            goto ERROR;
@ -661,11 +653,7 @@ icvSetModeCAM_DC1394( CvCaptureCAM_DC1394 * capture, int mode ){
    dc1394_dma_unlisten(capture->handle, capture->camera);
    if (dc1394_dma_setup_capture(capture->handle, capture->camera->node, capture->camera->channel /*channel*/,
                format, mode, SPEED_400,
-                frame_rate, NUM_BUFFERS,
-#ifdef HAVE_DC1394_095
-                0 /*do_extra_buffering*/,
-#endif
-                1 /*DROP_FRAMES*/,
+                frame_rate, NUM_BUFFERS, 1 /*drop_frames*/,
                capture->device_name, capture->camera) != DC1394_SUCCESS) {
        fprintf(stderr,"%s:%d: Failed to setup DMA capture with VIDEO1394\n",__FILE__,__LINE__);
        return 0;
--- a/modules/highgui/src/cap_qtkit.mm
+++ b/modules/highgui/src/cap_qtkit.mm
@ -287,11 +287,17 @@ bool CvCaptureCAM::grabFrame(double timeOut) {
    double sleepTime = 0.005;
    double total = 0;

-    NSDate *loopUntil = [NSDate dateWithTimeIntervalSinceNow:sleepTime];
-    while (![capture updateImage] && (total += sleepTime)<=timeOut &&
-           [[NSRunLoop currentRunLoop] runMode: NSDefaultRunLoopMode
-                                    beforeDate:loopUntil])
-        loopUntil = [NSDate dateWithTimeIntervalSinceNow:sleepTime];
+    // If the capture is launched in a separate thread, then
+    // [NSRunLoop currentRunLoop] is not the same as in the main thread, and has no timer.
+    //see https://developer.apple.com/library/mac/#documentation/Cocoa/Reference/Foundation/Classes/nsrunloop_Class/Reference/Reference.html
+    // "If no input sources or timers are attached to the run loop, this
+    // method exits immediately"
+    // using usleep() is not a good alternative, because it may block the GUI.
+    // Create a dummy timer so that runUntilDate does not exit immediately:
+    [NSTimer scheduledTimerWithTimeInterval:100 target:nil selector:@selector(doFireTimer:) userInfo:nil repeats:YES];
+    while (![capture updateImage] && (total += sleepTime)<=timeOut) {
+        [[NSRunLoop currentRunLoop] runUntilDate:[NSDate dateWithTimeIntervalSinceNow:sleepTime]];
+    }

    [localpool drain];

@ -336,9 +342,11 @@ int CvCaptureCAM::startCaptureDevice(int cameraNum) {
    }

    if (cameraNum >= 0) {
-        int nCameras = [devices count];
-        if( cameraNum < 0 || cameraNum >= nCameras )
+        NSUInteger nCameras = [devices count];
+        if( (NSUInteger)cameraNum >= nCameras ) {
+            [localpool drain];
            return 0;
+        }
        device = [devices objectAtIndex:cameraNum] ;
    } else {
        device = [QTCaptureDevice defaultInputDeviceWithMediaType:QTMediaTypeVideo]  ;
@ -402,6 +410,7 @@ int CvCaptureCAM::startCaptureDevice(int cameraNum) {

        grabFrame(60);

+        [localpool drain];
        return 1;
    }

@ -431,6 +440,7 @@ void CvCaptureCAM::setWidthHeight() {


 double CvCaptureCAM::getProperty(int property_id){
+    int retval;
    NSAutoreleasePool* localpool = [[NSAutoreleasePool alloc] init];

    NSArray* connections = [mCaptureDeviceInput	connections];
@ -440,15 +450,18 @@ double CvCaptureCAM::getProperty(int property_id){
    int width=s1.width, height=s1.height;
    switch (property_id) {
        case CV_CAP_PROP_FRAME_WIDTH:
-            return width;
+            retval = width;
+            break;
        case CV_CAP_PROP_FRAME_HEIGHT:
-            return height;
+            retval = height;
+            break;
        default:
-            return 0;
+            retval = 0;
+            break;
    }

    [localpool drain];
-
+    return retval;
 }

 bool CvCaptureCAM::setProperty(int property_id, double value) {
@ -496,13 +509,15 @@ bool CvCaptureCAM::setProperty(int property_id, double value) {
@implementation CaptureDelegate

 - (id)init {
-    [super init];
-    newFrame = 0;
-    imagedata = NULL;
-    bgr_imagedata = NULL;
-    currSize = 0;
-    image = NULL;
-    bgr_image = NULL;
+    self = [super init];
+    if (self) {
+        newFrame = 0;
+        imagedata = NULL;
+        bgr_imagedata = NULL;
+        currSize = 0;
+        image = NULL;
+        bgr_image = NULL;
+    }
    return self;
 }

@ -577,26 +592,26 @@ didDropVideoFrameWithSampleBuffer:(QTSampleBuffer *)sampleBuffer
        memcpy(imagedata, baseaddress, currSize);

        if (image == NULL) {
-            image = cvCreateImageHeader(cvSize(width,height), IPL_DEPTH_8U, 4);
+            image = cvCreateImageHeader(cvSize((int)width,(int)height), IPL_DEPTH_8U, 4);
        }
-        image->width =width;
-        image->height = height;
+        image->width = (int)width;
+        image->height = (int)height;
        image->nChannels = 4;
        image->depth = IPL_DEPTH_8U;
-        image->widthStep = rowBytes;
+        image->widthStep = (int)rowBytes;
        image->imageData = imagedata;
-        image->imageSize = currSize;
+        image->imageSize = (int)currSize;

        if (bgr_image == NULL) {
-            bgr_image = cvCreateImageHeader(cvSize(width,height), IPL_DEPTH_8U, 3);
+            bgr_image = cvCreateImageHeader(cvSize((int)width,(int)height), IPL_DEPTH_8U, 3);
        }
-        bgr_image->width =width;
-        bgr_image->height = height;
+        bgr_image->width = (int)width;
+        bgr_image->height = (int)height;
        bgr_image->nChannels = 3;
        bgr_image->depth = IPL_DEPTH_8U;
-        bgr_image->widthStep = rowBytes;
+        bgr_image->widthStep = (int)rowBytes;
        bgr_image->imageData = bgr_imagedata;
-        bgr_image->imageSize = currSize;
+        bgr_image->imageSize = (int)currSize;

        cvCvtColor(image, bgr_image, CV_BGRA2BGR);

@ -750,29 +765,29 @@ IplImage* CvCaptureFile::retrieveFramePixelBuffer() {
        }

        if (image == NULL) {
-            image = cvCreateImageHeader(cvSize(width,height), IPL_DEPTH_8U, 4);
+            image = cvCreateImageHeader(cvSize((int)width,(int)height), IPL_DEPTH_8U, 4);
        }

-        image->width =width;
-        image->height = height;
+        image->width = (int)width;
+        image->height = (int)height;
        image->nChannels = 4;
        image->depth = IPL_DEPTH_8U;
-        image->widthStep = rowBytes;
+        image->widthStep = (int)rowBytes;
        image->imageData = imagedata;
-        image->imageSize = currSize;
+        image->imageSize = (int)currSize;


        if (bgr_image == NULL) {
-            bgr_image = cvCreateImageHeader(cvSize(width,height), IPL_DEPTH_8U, 3);
+            bgr_image = cvCreateImageHeader(cvSize((int)width,(int)height), IPL_DEPTH_8U, 3);
        }

-        bgr_image->width =width;
-        bgr_image->height = height;
+        bgr_image->width = (int)width;
+        bgr_image->height = (int)height;
        bgr_image->nChannels = 3;
        bgr_image->depth = IPL_DEPTH_8U;
-        bgr_image->widthStep = rowBytes;
+        bgr_image->widthStep = (int)rowBytes;
        bgr_image->imageData = bgr_imagedata;
-        bgr_image->imageSize = currSize;
+        bgr_image->imageSize = (int)currSize;

        cvCvtColor(image, bgr_image,CV_BGRA2BGR);

--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@ -46,6 +46,12 @@
                                    Base Image Filter
 \****************************************************************************************/

+#if defined HAVE_IPP && IPP_VERSION_MAJOR*100 + IPP_VERSION_MINOR >= 701
+#define USE_IPP_SEP_FILTERS 1
+#else
+#undef USE_IPP_SEP_FILTERS
+#endif
+
 namespace cv
 {

@ -1401,21 +1407,53 @@ struct RowVec_32f
    RowVec_32f( const Mat& _kernel )
    {
        kernel = _kernel;
+        haveSSE = checkHardwareSupport(CV_CPU_SSE);
+#ifdef USE_IPP_SEP_FILTERS
+        bufsz = -1;
+#endif
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
-        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
+        int _ksize = kernel.rows + kernel.cols - 1;
+        const float* src0 = (const float*)_src;
        float* dst = (float*)_dst;
        const float* _kx = (const float*)kernel.data;
+
+#ifdef USE_IPP_SEP_FILTERS
+        IppiSize roisz = { width, 1 };
+        if( (cn == 1 || cn == 3) && width >= _ksize*8 )
+        {
+            if( bufsz < 0 )
+            {
+                if( (cn == 1 && ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(roisz, _ksize, &bufsz) < 0) ||
+                    (cn == 3 && ippiFilterRowBorderPipelineGetBufferSize_32f_C3R(roisz, _ksize, &bufsz) < 0))
+                    return 0;
+            }
+            AutoBuffer<uchar> buf(bufsz + 64);
+            uchar* bufptr = alignPtr((uchar*)buf, 32);
+            int step = (int)(width*sizeof(dst[0])*cn);
+            float borderValue[] = {0.f, 0.f, 0.f};
+            // here is the trick. IPP needs border type and extrapolates the row. We did it already.
+            // So we pass anchor=0 and ignore the right tail of results since they are incorrect there.
+            if( (cn == 1 && ippiFilterRowBorderPipeline_32f_C1R(src0, step, &dst, roisz, _kx, _ksize, 0,
+                                                                ippBorderRepl, borderValue[0], bufptr) < 0) ||
+                (cn == 3 && ippiFilterRowBorderPipeline_32f_C3R(src0, step, &dst, roisz, _kx, _ksize, 0,
+                                                                ippBorderRepl, borderValue, bufptr) < 0))
+                return 0;
+            return width - _ksize + 1;
+        }
+#endif
+
+        if( !haveSSE )
+            return 0;
+
+        int i = 0, k;
        width *= cn;

        for( ; i <= width - 8; i += 8 )
        {
-            const float* src = (const float*)_src + i;
+            const float* src = src0 + i;
            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
            for( k = 0; k < _ksize; k++, src += cn )
            {
@ -1434,6 +1472,10 @@ struct RowVec_32f
    }

    Mat kernel;
+    bool haveSSE;
+#ifdef USE_IPP_SEP_FILTERS
+    mutable int bufsz;
+#endif
 };


--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -1880,12 +1880,10 @@ public:
          IppiRect dstroi = { 0, dsty, dstwidth, dstheight - dsty };
          int bufsize;
          ippiResizeGetBufSize( srcroi, dstroi, cn, mode, &bufsize );
-          Ipp8u *buf;
-          buf = ippsMalloc_8u( bufsize );
-          IppStatus sts;
-          if( func( src.data, ippiSize(src.cols, src.rows), (int)src.step[0], srcroi, dst.data, (int)dst.step[0], dstroi, inv_scale_x, inv_scale_y, 0, 0, mode, buf ) < 0 )
+          AutoBuffer<uchar> buf(bufsize + 64);
+          uchar* bufptr = alignPtr((uchar*)buf, 32);
+          if( func( src.data, ippiSize(src.cols, src.rows), (int)src.step[0], srcroi, dst.data, (int)dst.step[0], dstroi, inv_scale_x, inv_scale_y, 0, 0, mode, bufptr ) < 0 )
              *ok = false;
-          ippsFree(buf);
      }
 private:
    Mat &src;
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@ -193,7 +193,7 @@ if(ANDROID AND ANDROID_EXECUTABLE)
  set(lib_target_files ${ANDROID_LIB_PROJECT_FILES})
  ocv_list_add_prefix(lib_target_files "${OpenCV_BINARY_DIR}/")

-  android_get_compatible_target(lib_target_sdk_target ${ANDROID_NATIVE_API_LEVEL} ${ANDROID_SDK_TARGET} 11)
+  android_get_compatible_target(lib_target_sdk_target ${ANDROID_NATIVE_API_LEVEL} ${ANDROID_SDK_TARGET} 14)
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/android_lib/${ANDROID_MANIFEST_FILE}" "${CMAKE_CURRENT_BINARY_DIR}/${ANDROID_MANIFEST_FILE}" @ONLY)

  add_custom_command(OUTPUT ${lib_target_files} "${OpenCV_BINARY_DIR}/${ANDROID_MANIFEST_FILE}"
--- a/modules/java/generator/src/java/android+JavaCameraView.java
+++ b/modules/java/generator/src/java/android+JavaCameraView.java
@ -146,6 +146,9 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
                    Log.d(TAG, "Set preview size to " + Integer.valueOf((int)frameSize.width) + "x" + Integer.valueOf((int)frameSize.height));
                    params.setPreviewSize((int)frameSize.width, (int)frameSize.height);

+                    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.ICE_CREAM_SANDWICH)
+                        params.setRecordingHint(true);
+
                    List<String> FocusModes = params.getSupportedFocusModes();
                    if (FocusModes != null && FocusModes.contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_VIDEO))
                    {
--- a/modules/legacy/src/dpstereo.cpp
+++ b/modules/legacy/src/dpstereo.cpp
@ -76,8 +76,8 @@ typedef struct _CvRightImData
    uchar min_val, max_val;
 } _CvRightImData;

-#define CV_IMAX3(a,b,c) ((temp2 = (a) >= (b) ? (a) : (b)),(temp2 >= (c) ? temp2 : (c)))
-#define CV_IMIN3(a,b,c) ((temp3 = (a) <= (b) ? (a) : (b)),(temp3 <= (c) ? temp3 : (c)))
+#define CV_IMAX3(a,b,c) (std::max(std::max((a), (b)), (c)))
+#define CV_IMIN3(a,b,c) (std::min(std::min((a), (b)), (c)))

 static void icvFindStereoCorrespondenceByBirchfieldDP( uchar* src1, uchar* src2,
                                                uchar* disparities,
@ -87,7 +87,7 @@ static void icvFindStereoCorrespondenceByBirchfieldDP( uchar* src1, uchar* src2,
                                                float  _param3, float _param4,
                                                float  _param5 )
 {
-    int     x, y, i, j, temp2, temp3;
+    int     x, y, i, j;
    int     d, s;
    int     dispH =  maxDisparity + 3;
    uchar  *dispdata;
--- a/modules/ml/src/nbayes.cpp
+++ b/modules/ml/src/nbayes.cpp
@ -210,6 +210,8 @@ bool CvNormalBayesClassifier::train( const CvMat* _train_data, const CvMat* _res
                prod_data[c2] += train_vec[c2]*val1;
        }
    }
+    cvReleaseMat( &responses );
+    responses = 0;

    /* calculate avg, covariance matrix, c */
    for( cls = 0; cls < nclasses; cls++ )
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@ -81,11 +81,6 @@ namespace cv
    }
 }

-static inline size_t divUp(size_t total, size_t grain)
-{
-    return (total + grain - 1) / grain;
-}
-
 static inline int calcSize(int octave, int layer)
 {
    /* Wavelet size at first layer of first octave. */
--- a/modules/ocl/CMakeLists.txt
+++ b/modules/ocl/CMakeLists.txt
@ -3,5 +3,5 @@ if(NOT HAVE_OPENCL)
 endif()

 set(the_description "OpenCL-accelerated Computer Vision")
-ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_calib3d)
+ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_calib3d opencv_ml)
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
--- a/modules/ocl/doc/camera_calibration_and_3D_reconstruction.rst
+++ b/modules/ocl/doc/camera_calibration_and_3D_reconstruction.rst
@ -0,0 +1,334 @@
+Camera Calibration and 3D Reconstruction
+========================================
+
+.. highlight:: cpp
+
+
+
+ocl::StereoBM_OCL
+---------------------
+.. ocv:class:: ocl::StereoBM_OCL
+
+Class computing stereo correspondence (disparity map) using the block matching algorithm. ::
+
+    class CV_EXPORTS StereoBM_OCL
+    {
+    public:
+        enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
+
+        enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
+
+        //! the default constructor
+        StereoBM_OCL();
+        //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8.
+        StereoBM_OCL(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
+
+        //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair
+        //! Output disparity has CV_8U type.
+        void operator() ( const oclMat &left, const oclMat &right, oclMat &disparity);
+
+        //! Some heuristics that tries to estimate
+        // if current GPU will be faster than CPU in this algorithm.
+        // It queries current active device.
+        static bool checkIfGpuCallReasonable();
+
+        int preset;
+        int ndisp;
+        int winSize;
+
+        // If avergeTexThreshold  == 0 => post-processing is disabled
+        // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image
+        // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
+        // i.e. input left image is low textured.
+        float avergeTexThreshold;
+    private:
+        /* hidden */
+    };
+
+
+The class also performs pre- and post-filtering steps: Sobel pre-filtering (if the ``PREFILTER_XSOBEL`` flag is set) and low textureness filtering (if ``avergeTexThreshold > 0`` ). If ``avergeTexThreshold = 0`` , low textureness filtering is disabled. Otherwise, the disparity is set to 0 in each point ``(x, y)`` , where for the left image
+
+.. math::
+    \sum HorizontalGradiensInWindow(x, y, winSize) < (winSize \cdot winSize) \cdot avergeTexThreshold
+
+This means that the input left image is low textured.
+
+
+ocl::StereoBM_OCL::StereoBM_OCL
+-----------------------------------
+Enables :ocv:class:`ocl::StereoBM_OCL` constructors.
+
+.. ocv:function:: ocl::StereoBM_OCL::StereoBM_OCL()
+
+.. ocv:function:: ocl::StereoBM_OCL::StereoBM_OCL(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ)
+
+    :param preset: Parameter presetting:
+
+        * **BASIC_PRESET** Basic mode without pre-processing.
+
+        * **PREFILTER_XSOBEL** Sobel pre-filtering mode.
+
+    :param ndisparities: Number of disparities. It must be a multiple of 8 and less than or equal to 256.
+
+    :param winSize: Block size.
+
+
+
+ocl::StereoBM_OCL::operator ()
+----------------------------------
+Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
+
+.. ocv:function:: void ocl::StereoBM_OCL::operator ()(const oclMat& left, const oclMat& right, oclMat& disparity)
+
+    :param left: Left image. Only  ``CV_8UC1``  type is supported.
+
+    :param right: Right image with the same size and the same type as the left one.
+
+    :param disparity: Output disparity map. It is a  ``CV_8UC1``  image with the same size as the input images.
+
+    :param stream: Stream for the asynchronous version.
+
+
+ocl::StereoBM_OCL::checkIfGpuCallReasonable
+-----------------------------------------------
+Uses a heuristic method to estimate whether the current GPU is faster than the CPU in this algorithm. It queries the currently active device.
+
+.. ocv:function:: bool ocl::StereoBM_OCL::checkIfGpuCallReasonable()
+
+ocl::StereoBeliefPropagation
+--------------------------------
+.. ocv:class:: ocl::StereoBeliefPropagation
+
+Class computing stereo correspondence using the belief propagation algorithm. ::
+
+    class CV_EXPORTS StereoBeliefPropagation
+    {
+    public:
+        enum { DEFAULT_NDISP  = 64 };
+        enum { DEFAULT_ITERS  = 5  };
+        enum { DEFAULT_LEVELS = 5  };
+        static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels);
+        explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
+                                         int iters  = DEFAULT_ITERS,
+                                         int levels = DEFAULT_LEVELS,
+                                         int msg_type = CV_16S);
+        StereoBeliefPropagation(int ndisp, int iters, int levels,
+                                float max_data_term, float data_weight,
+                                float max_disc_term, float disc_single_jump,
+                                int msg_type = CV_32F);
+        void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
+        void operator()(const oclMat &data, oclMat &disparity);
+        int ndisp;
+        int iters;
+        int levels;
+        float max_data_term;
+        float data_weight;
+        float max_disc_term;
+        float disc_single_jump;
+        int msg_type;
+    private:
+        /* hidden */
+    };
+
+The class implements the algorithm described in [Felzenszwalb2006]_ . It can compute its own data cost (using a truncated linear model) or use a user-provided data cost.
+
+.. note::
+
+    ``StereoBeliefPropagation`` requires a lot of memory for message storage:
+
+    .. math::
+
+        width \_ step  \cdot height  \cdot ndisp  \cdot 4  \cdot (1 + 0.25)
+
+    and for data cost storage:
+
+    .. math::
+
+        width\_step \cdot height \cdot ndisp \cdot (1 + 0.25 + 0.0625 +  \dotsm + \frac{1}{4^{levels}})
+
+    ``width_step`` is the number of bytes in a line including padding.
+
+
+
+ocl::StereoBeliefPropagation::StereoBeliefPropagation
+---------------------------------------------------------
+Enables the :ocv:class:`ocl::StereoBeliefPropagation` constructors.
+
+.. ocv:function:: ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int msg_type = CV_16S)
+
+.. ocv:function:: ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp, int iters, int levels, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int msg_type = CV_32F)
+
+    :param ndisp: Number of disparities.
+
+    :param iters: Number of BP iterations on each level.
+
+    :param levels: Number of levels.
+
+    :param max_data_term: Threshold for data cost truncation.
+
+    :param data_weight: Data weight.
+
+    :param max_disc_term: Threshold for discontinuity truncation.
+
+    :param disc_single_jump: Discontinuity single jump.
+
+    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
+
+``StereoBeliefPropagation`` uses a truncated linear model for the data cost and discontinuity terms:
+
+.. math::
+
+    DataCost = data \_ weight  \cdot \min ( \lvert Img_{Left}(x,y)-Img_{Right}(x-d,y)  \rvert , max \_ data \_ term)
+
+.. math::
+
+    DiscTerm =  \min (disc \_ single \_ jump  \cdot \lvert f_1-f_2  \rvert , max \_ disc \_ term)
+
+For more details, see [Felzenszwalb2006]_.
+
+By default, :ocv:class:`ocl::StereoBeliefPropagation` uses floating-point arithmetic and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetic and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
+
+.. math::
+
+    10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX
+
+
+
+ocl::StereoBeliefPropagation::estimateRecommendedParams
+-----------------------------------------------------------
+Uses a heuristic method to compute the recommended parameters ( ``ndisp``, ``iters`` and ``levels`` ) for the specified image size ( ``width`` and ``height`` ).
+
+.. ocv:function:: void ocl::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
+
+
+
+ocl::StereoBeliefPropagation::operator ()
+---------------------------------------------
+Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair or data cost.
+
+.. ocv:function:: void ocl::StereoBeliefPropagation::operator ()(const oclMat& left, const oclMat& right, oclMat& disparity)
+
+.. ocv:function:: void ocl::StereoBeliefPropagation::operator ()(const oclMat& data, oclMat& disparity)
+
+    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
+
+    :param right: Right image with the same size and the same type as the left one.
+
+    :param data: User-specified data cost, a matrix of ``msg_type`` type and ``Size(<image columns>*ndisp, <image rows>)`` size.
+
+    :param disparity: Output disparity map. If  ``disparity``  is empty, the output type is  ``CV_16SC1`` . Otherwise, the type is retained.
+
+    :param stream: Stream for the asynchronous version.
+
+ocl::StereoConstantSpaceBP
+------------------------------
+.. ocv:class:: ocl::StereoConstantSpaceBP
+
+Class computing stereo correspondence using the constant space belief propagation algorithm. ::
+
+    class CV_EXPORTS StereoConstantSpaceBP
+    {
+    public:
+        enum { DEFAULT_NDISP    = 128 };
+        enum { DEFAULT_ITERS    = 8   };
+        enum { DEFAULT_LEVELS   = 4   };
+        enum { DEFAULT_NR_PLANE = 4   };
+        static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane);
+        explicit StereoConstantSpaceBP(
+            int ndisp    = DEFAULT_NDISP,
+            int iters    = DEFAULT_ITERS,
+            int levels   = DEFAULT_LEVELS,
+            int nr_plane = DEFAULT_NR_PLANE,
+            int msg_type = CV_32F);
+        StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
+            float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
+            int min_disp_th = 0,
+            int msg_type = CV_32F);
+        void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
+        int ndisp;
+        int iters;
+        int levels;
+        int nr_plane;
+        float max_data_term;
+        float data_weight;
+        float max_disc_term;
+        float disc_single_jump;
+        int min_disp_th;
+        int msg_type;
+        bool use_local_init_data_cost;
+    private:
+        /* hidden */
+    };
+
+The class implements the algorithm described in [Yang2010]_. ``StereoConstantSpaceBP`` supports both local minimum and global minimum data cost initialization algorithms. For more details, see the paper mentioned above. By default, a local algorithm is used. To enable a global algorithm, set ``use_local_init_data_cost`` to ``false`` .
+
+
+ocl::StereoConstantSpaceBP::StereoConstantSpaceBP
+-----------------------------------------------------
+Enables the :ocv:class:`ocl::StereoConstantSpaceBP` constructors.
+
+.. ocv:function:: ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int nr_plane = DEFAULT_NR_PLANE, int msg_type = CV_32F)
+
+.. ocv:function:: ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th = 0, int msg_type = CV_32F)
+
+    :param ndisp: Number of disparities.
+
+    :param iters: Number of BP iterations on each level.
+
+    :param levels: Number of levels.
+
+    :param nr_plane: Number of disparity levels on the first level.
+
+    :param max_data_term: Truncation of data cost.
+
+    :param data_weight: Data weight.
+
+    :param max_disc_term: Truncation of discontinuity.
+
+    :param disc_single_jump: Discontinuity single jump.
+
+    :param min_disp_th: Minimal disparity threshold.
+
+    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
+
+``StereoConstantSpaceBP`` uses a truncated linear model for the data cost and discontinuity terms:
+
+.. math::
+
+    DataCost = data \_ weight  \cdot \min ( \lvert I_2-I_1  \rvert , max \_ data \_ term)
+
+.. math::
+
+    DiscTerm =  \min (disc \_ single \_ jump  \cdot \lvert f_1-f_2  \rvert , max \_ disc \_ term)
+
+For more details, see [Yang2010]_.
+
+By default, ``StereoConstantSpaceBP`` uses floating-point arithmetic and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetic and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
+
+.. math::
+
+    10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX
+
+
+
+ocl::StereoConstantSpaceBP::estimateRecommendedParams
+---------------------------------------------------------
+Uses a heuristic method to compute the recommended parameters ( ``ndisp``, ``iters``, ``levels`` and ``nr_plane`` ) for the specified image size ( ``width`` and ``height`` ).
+
+.. ocv:function:: void ocl::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)
+
+
+
+ocl::StereoConstantSpaceBP::operator ()
+-------------------------------------------
+Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
+
+.. ocv:function:: void ocl::StereoConstantSpaceBP::operator ()(const oclMat& left, const oclMat& right, oclMat& disparity)
+
+    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
+
+    :param right: Right image with the same size and the same type as the left one.
+
+    :param disparity: Output disparity map. If  ``disparity``  is empty, the output type is  ``CV_16SC1`` . Otherwise, the output type is  ``disparity.type()`` .
+
+    :param stream: Stream for the asynchronous version.
--- a/modules/ocl/doc/feature_detection_and_description.rst
+++ b/modules/ocl/doc/feature_detection_and_description.rst
@ -37,7 +37,7 @@ Finds edges in an image using the [Canny86]_ algorithm.


 ocl::BruteForceMatcher_OCL_base
-------------------------------
+-----------------------------------
 .. ocv:class:: ocl::BruteForceMatcher_OCL_base

 Brute-force descriptor matcher. For each descriptor in the first set, this matcher finds the closest descriptor in the second set by trying each one. This descriptor matcher supports masking permissible matches between descriptor sets. ::
@ -153,7 +153,7 @@ The class ``BruteForceMatcher_OCL_base`` has an interface similar to the class :


 ocl::BruteForceMatcher_OCL_base::match
--------------------------------------
+------------------------------------------
 Finds the best match for each descriptor from a query set with train descriptors.

 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::match(const oclMat& query, const oclMat& train, std::vector<DMatch>& matches, const oclMat& mask = oclMat())
@ -169,14 +169,14 @@ Finds the best match for each descriptor from a query set with train descriptors


 ocl::BruteForceMatcher_OCL_base::makeGpuCollection
--------------------------------------------------
+------------------------------------------------------
 Performs a GPU collection of train descriptors and masks in a suitable format for the :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchCollection` function.

 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::makeGpuCollection(oclMat& trainCollection, oclMat& maskCollection, const vector<oclMat>& masks = std::vector<oclMat>())


 ocl::BruteForceMatcher_OCL_base::matchDownload
----------------------------------------------
+--------------------------------------------------
 Downloads matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchCollection` to vector with :ocv:class:`DMatch`.

 .. ocv:function:: static void ocl::BruteForceMatcher_OCL_base::matchDownload( const oclMat& trainIdx, const oclMat& distance, std::vector<DMatch>& matches )
@ -185,7 +185,7 @@ Downloads matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::matc


 ocl::BruteForceMatcher_OCL_base::matchConvert
---------------------------------------------
+-------------------------------------------------
 Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchCollection` to vector with :ocv:class:`DMatch`.

 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>&matches)
@ -195,7 +195,7 @@ Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::match


 ocl::BruteForceMatcher_OCL_base::knnMatch
-----------------------------------------
+---------------------------------------------
 Finds the ``k`` best matches for each descriptor from a query set with train descriptors.

 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat& query, const oclMat& train, std::vector< std::vector<DMatch> >&matches, int k, const oclMat& mask = oclMat(), bool compactResult = false)
@ -226,7 +226,7 @@ The third variant of the method stores the results in GPU memory.


 ocl::BruteForceMatcher_OCL_base::knnMatchDownload
-------------------------------------------------
+-----------------------------------------------------
 Downloads matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatch2Collection` to vector with :ocv:class:`DMatch`.

 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatchDownload(const oclMat& trainIdx, const oclMat& distance, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
@ -238,7 +238,7 @@ If ``compactResult`` is ``true`` , the ``matches`` vector does not contain match


 ocl::BruteForceMatcher_OCL_base::knnMatchConvert
------------------------------------------------
+----------------------------------------------------
 Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatch2Collection` to CPU vector with :ocv:class:`DMatch`.

 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatchConvert(const Mat& trainIdx, const Mat& distance, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
@ -250,7 +250,7 @@ If ``compactResult`` is ``true`` , the ``matches`` vector does not contain match


 ocl::BruteForceMatcher_OCL_base::radiusMatch
--------------------------------------------
+------------------------------------------------
 For each query descriptor, finds the best matches with a distance less than a given threshold.

 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat& query, const oclMat& train, std::vector< std::vector<DMatch> >&matches, float maxDistance, const oclMat& mask = oclMat(), bool compactResult = false)
@ -283,7 +283,7 @@ The third variant of the method stores the results in GPU memory and does not st


 ocl::BruteForceMatcher_OCL_base::radiusMatchDownload
----------------------------------------------------
+--------------------------------------------------------
 Downloads matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchCollection` to vector with :ocv:class:`DMatch`.

 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatchDownload(const oclMat& trainIdx, const oclMat& distance, const oclMat& nMatches, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
@ -296,7 +296,7 @@ If ``compactResult`` is ``true`` , the ``matches`` vector does not contain match


 ocl::BruteForceMatcher_OCL_base::radiusMatchConvert
---------------------------------------------------
+-------------------------------------------------------
 Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchCollection` to vector with :ocv:class:`DMatch`.

 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
@ -306,7 +306,7 @@ Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiu
 If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.

 ocl::HOGDescriptor
------------------
+----------------------

 .. ocv:struct:: ocl::HOGDescriptor

--- a/modules/ocl/doc/image_filtering.rst
+++ b/modules/ocl/doc/image_filtering.rst
@ -3,6 +3,360 @@ Image Filtering

 .. highlight:: cpp

+ocl::BaseRowFilter_GPU
+--------------------------
+.. ocv:class:: ocl::BaseRowFilter_GPU
+
+Base class for linear or non-linear filters that process rows of 2D arrays. Such filters are used for the "horizontal" filtering passes in separable filters. ::
+
+    class CV_EXPORTS BaseRowFilter_GPU
+    {
+    public:
+        BaseRowFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}
+        virtual ~BaseRowFilter_GPU() {}
+        virtual void operator()(const oclMat &src, oclMat &dst) = 0;
+        int ksize, anchor, bordertype;
+    };
+
+.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`ocl::FilterEngine_GPU`.
+
+ocl::BaseColumnFilter_GPU
+-----------------------------
+.. ocv:class:: ocl::BaseColumnFilter_GPU
+
+Base class for linear or non-linear filters that process columns of 2D arrays. Such filters are used for the "vertical" filtering passes in separable filters. ::
+
+    class CV_EXPORTS BaseColumnFilter_GPU
+    {
+    public:
+        BaseColumnFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}
+        virtual ~BaseColumnFilter_GPU() {}
+        virtual void operator()(const oclMat &src, oclMat &dst) = 0;
+        int ksize, anchor, bordertype;
+    };
+
+.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`ocl::FilterEngine_GPU`.
+
+ocl::BaseFilter_GPU
+-----------------------
+.. ocv:class:: ocl::BaseFilter_GPU
+
+Base class for non-separable 2D filters. ::
+
+    class CV_EXPORTS BaseFilter_GPU
+    {
+    public:
+        BaseFilter_GPU(const Size &ksize_, const Point &anchor_, const int &borderType_)
+            : ksize(ksize_), anchor(anchor_), borderType(borderType_) {}
+        virtual ~BaseFilter_GPU() {}
+        virtual void operator()(const oclMat &src, oclMat &dst) = 0;
+        Size ksize;
+        Point anchor;
+        int borderType;
+    };
+
+.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`ocl::FilterEngine_GPU`.
+
+ocl::FilterEngine_GPU
+------------------------
+.. ocv:class:: ocl::FilterEngine_GPU
+
+Base class for the Filter Engine. ::
+
+    class CV_EXPORTS FilterEngine_GPU
+    {
+    public:
+        virtual ~FilterEngine_GPU() {}
+
+        virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) = 0;
+    };
+
+The class can be used to apply an arbitrary filtering operation to an image. It contains all the necessary intermediate buffers. Pointers to the initialized ``FilterEngine_GPU`` instances are returned by various ``create*Filter_GPU`` functions (see below), and they are used inside high-level functions such as :ocv:func:`ocl::filter2D`, :ocv:func:`ocl::erode`, :ocv:func:`ocl::Sobel` , and others.
+
+By using ``FilterEngine_GPU`` instead of functions you can avoid unnecessary memory allocation for intermediate buffers and get better performance: ::
+
+    while (...)
+    {
+        ocl::oclMat src = getImg();
+        ocl::oclMat dst;
+        // Allocate and release buffers at each iteration
+        ocl::GaussianBlur(src, dst, ksize, sigma1);
+    }
+
+    // Allocate buffers only once
+    cv::Ptr<ocl::FilterEngine_GPU> filter =
+        ocl::createGaussianFilter_GPU(CV_8UC4, ksize, sigma1);
+    while (...)
+    {
+        ocl::oclMat src = getImg();
+        ocl::oclMat dst;
+        filter->apply(src, dst, cv::Rect(0, 0, src.cols, src.rows));
+    }
+    // Release buffers only once
+    filter.release();
+
+
+``FilterEngine_GPU`` can process a rectangular sub-region of an image. By default, if ``roi == Rect(0,0,-1,-1)`` , ``FilterEngine_GPU`` processes the inner region of an image ( ``Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height)`` ) because some filters do not check whether indices are outside the image for better performance. See below to understand which filters support processing the whole image and which do not and identify image type limitations.
+
+.. note:: The GPU filters do not support the in-place mode.
+
+.. seealso:: :ocv:class:`ocl::BaseRowFilter_GPU`, :ocv:class:`ocl::BaseColumnFilter_GPU`, :ocv:class:`ocl::BaseFilter_GPU`, :ocv:func:`ocl::createFilter2D_GPU`, :ocv:func:`ocl::createSeparableFilter_GPU`, :ocv:func:`ocl::createBoxFilter_GPU`, :ocv:func:`ocl::createMorphologyFilter_GPU`, :ocv:func:`ocl::createLinearFilter_GPU`, :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`ocl::createDerivFilter_GPU`, :ocv:func:`ocl::createGaussianFilter_GPU`
+
+ocl::createFilter2D_GPU
+---------------------------
+Creates a non-separable filter engine with the specified filter.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createFilter2D_GPU( const Ptr<BaseFilter_GPU> filter2D)
+
+    :param filter2D: Non-separable 2D filter.
+
+Usually this function is used inside such high-level functions as :ocv:func:`ocl::createLinearFilter_GPU`, :ocv:func:`ocl::createBoxFilter_GPU`.
+
+
+ocl::createSeparableFilter_GPU
+----------------------------------
+Creates a separable filter engine with the specified filters.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter, const Ptr<BaseColumnFilter_GPU> &columnFilter)
+
+    :param rowFilter: "Horizontal" 1D filter.
+
+    :param columnFilter: "Vertical" 1D filter.
+
+Usually this function is used inside such high-level functions as :ocv:func:`ocl::createSeparableLinearFilter_GPU`.
+
+ocl::createBoxFilter_GPU
+----------------------------
+Creates a normalized 2D box filter.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createBoxFilter_GPU(int srcType, int dstType, const Size &ksize, const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
+
+.. ocv:function:: Ptr<BaseFilter_GPU> ocl::getBoxFilter_GPU(int srcType, int dstType, const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
+
+    :param srcType: Input image type supporting ``CV_8UC1`` and ``CV_8UC4`` .
+
+    :param dstType: Output image type.  It supports only the same values as the source type.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+
+    :param borderType: Supported border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101, BORDER_WRAP.
+
+.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+.. seealso:: :ocv:func:`boxFilter`
+
+ocl::boxFilter
+------------------
+Smooths the image using the normalized box filter.
+
+.. ocv:function:: void ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
+
+    :param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
+
+    :param dst: Output image. Its size and type are the same as ``src`` .
+
+    :param ddepth: Output image depth. If -1, the output image has the same depth as the input one. The only values allowed here are ``CV_8U`` and -1.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+
+    :param borderType: Supported border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101, BORDER_WRAP.
+
+Smooths the image using a box filter. Supported data types: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4.
+
+.. note::    This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+ocl::blur
+-------------
+Acts as a synonym for the normalized box filter.
+
+.. ocv:function:: void ocl::blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_CONSTANT)
+
+    :param src: Input image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
+
+    :param dst: Output image with the same size and type as  ``src`` .
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
+
+    :param borderType: Supported border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101, BORDER_WRAP.
+
+.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+.. seealso:: :ocv:func:`blur`, :ocv:func:`ocl::boxFilter`
+
+ocl::createMorphologyFilter_GPU
+-----------------------------------
+Creates a 2D morphological filter.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Point &anchor = Point(-1, -1), int iterations = 1)
+
+.. ocv:function:: Ptr<BaseFilter_GPU> ocl::getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize, Point anchor = Point(-1, -1))
+
+    :param op: Morphology operation id. Only ``MORPH_ERODE`` and ``MORPH_DILATE`` are supported.
+
+    :param type: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
+
+    :param kernel: 2D 8-bit structuring element for the morphological operation.
+
+    :param ksize: Size of a horizontal or vertical structuring element used for separable morphological operations.
+
+    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
+
+.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+.. seealso:: :ocv:func:`createMorphologyFilter`
+
+ocl::createLinearFilter_GPU
+-------------------------------
+Creates a non-separable linear filter.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
+
+    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one- and four-channel images.
+
+    :param dstType: Output image type. The same type as ``src`` is supported.
+
+    :param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:func:`ocl::convolve`.
+
+    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
+
+    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+.. seealso:: :ocv:func:`createLinearFilter`
+
+
+ocl::filter2D
+-----------------
+Applies the non-separable 2D linear filter to an image.
+
+.. ocv:function:: void ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
+
+    :param src: Source image. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one- and four-channel images.
+
+    :param dst: Destination image. The size and the number of channels is the same as  ``src`` .
+
+    :param ddepth: Desired depth of the destination image. If it is negative, it is the same as  ``src.depth()`` . It supports only the same depth as the source image depth.
+
+    :param kernel: 2D array of filter coefficients.
+
+    :param anchor: Anchor of the kernel that indicates the relative position of a filtered point within the kernel. The anchor resides within the kernel. The special default value (-1,-1) means that the anchor is at the kernel center.
+
+    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param stream: Stream for the asynchronous version.
+
+ocl::getLinearRowFilter_GPU
+-------------------------------
+Creates a primitive row filter with the specified kernel.
+
+.. ocv:function:: Ptr<BaseRowFilter_GPU> ocl::getLinearRowFilter_GPU(int srcType, int bufType, const Mat &rowKernel, int anchor = -1, int bordertype = BORDER_DEFAULT)
+
+    :param srcType: Source array type. Only  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+
+    :param bufType: Intermediate buffer type with as many channels as  ``srcType`` .
+
+    :param rowKernel: Filter coefficients. Support kernels with ``size <= 16`` .
+
+    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
+
+    :param bordertype: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`.
+
+.. seealso:: :ocv:func:`createSeparableLinearFilter` .
+
+
+ocl::getLinearColumnFilter_GPU
+----------------------------------
+Creates a primitive column filter with the specified kernel.
+
+.. ocv:function:: Ptr<BaseColumnFilter_GPU> ocl::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel, int anchor = -1, int bordertype = BORDER_DEFAULT, double delta = 0.0)
+
+    :param bufType: Intermediate buffer type with as many channels as  ``dstType`` .
+
+    :param dstType: Destination array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` destination types are supported.
+
+    :param columnKernel: Filter coefficients. Support kernels with ``size <= 16`` .
+
+    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
+
+    :param bordertype: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate` .
+
+    :param delta: default value is 0.0.
+
+.. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
+
+ocl::createSeparableLinearFilter_GPU
+----------------------------------------
+Creates a separable linear filter engine.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT)
+
+    :param srcType: Source array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+
+    :param dstType: Destination array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  destination types are supported.
+
+    :param rowKernel: Horizontal filter coefficients. Support kernels with ``size <= 16`` .
+
+    :param columnKernel: Vertical filter coefficients. Support kernels with ``size <= 16`` .
+
+    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
+
+    :param delta: default value is 0.0.
+
+    :param bordertype: Pixel extrapolation method.
+
+.. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`ocl::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
+
+
+ocl::sepFilter2D
+--------------------
+Applies a separable 2D linear filter to an image.
+
+.. ocv:function:: void ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT)
+
+    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+
+    :param dst: Destination image with the same size and number of channels as  ``src`` .
+
+    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
+
+    :param kernelX: Horizontal filter coefficients.
+
+    :param kernelY: Vertical filter coefficients.
+
+    :param anchor: Anchor position within the kernel. The default value ``(-1, -1)`` means that the anchor is at the kernel center.
+
+    :param delta: default value is 0.0.
+
+    :param bordertype: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
+
+.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`sepFilter2D`
+
+ocl::createDerivFilter_GPU
+------------------------------
+Creates a filter engine for the generalized Sobel operator.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT )
+
+    :param srcType: Source image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+
+    :param dstType: Destination image type with as many channels as  ``srcType`` ,  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F``  depths are supported.
+
+    :param dx: Derivative order with respect to x.
+
+    :param dy: Derivative order with respect to y.
+
+    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
+
+    :param borderType: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
+
+.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
+
+
 ocl::Sobel
 ------------------
 Returns void
@ -53,43 +407,41 @@ Returns void

 The function computes the first x- or y- spatial image derivative using the Scharr operator. Supports 8UC1, 8UC4, 32SC1, 32SC4, 32FC1 and 32FC4 data types.

-ocl::GaussianBlur
------------------
-Returns void
+ocl::createGaussianFilter_GPU
+---------------------------------
+Creates a Gaussian filter engine.

-.. ocv:function:: void ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT)
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT)

-    :param src: The source image
+    :param type: Source and destination image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.

-    :param dst: The destination image; It will have the same size and the same type as src
+    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.

-    :param ksize: The Gaussian kernel size; ksize.width and ksize.height can differ, but they both must be positive and odd. Or, they can be zero's, then they are computed from sigma
+    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.

-    :param sigma1sigma2: The Gaussian kernel standard deviations in X and Y direction. If sigmaY is zero, it is set to be equal to sigmaX. If they are both zeros, they are computed from ksize.width and ksize.height. To fully control the result regardless of possible future modification of all this semantics, it is recommended to specify all of ksize, sigmaX and sigmaY
+    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .

-    :param bordertype: Pixel extrapolation method.
+    :param bordertype: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.

-The function convolves the source image with the specified Gaussian kernel. In-place filtering is supported.  Surpport 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4 data type.
+.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`

-ocl::boxFilter
------------------
+ocl::GaussianBlur
+---------------------
 Returns void

-.. ocv:function:: void ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
+.. ocv:function:: void ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT)

    :param src: The source image

    :param dst: The destination image; It will have the same size and the same type as src

-    :param ddepth: The desired depth of the destination image
-
-    :param ksize: The smoothing kernel size. It must be positive and odd
+    :param ksize: The Gaussian kernel size; ksize.width and ksize.height can differ, but they both must be positive and odd. Alternatively, they can be zeros, in which case they are computed from sigma

-    :param anchor: The anchor point. The default value Point(-1,-1) means that the anchor is at the kernel center.
+    :param sigma1sigma2: The Gaussian kernel standard deviations in X and Y direction. If sigmaY is zero, it is set to be equal to sigmaX. If they are both zeros, they are computed from ksize.width and ksize.height. To fully control the result regardless of possible future modification of all this semantics, it is recommended to specify all of ksize, sigmaX and sigmaY

    :param bordertype: Pixel extrapolation method.

-Smoothes image using box filter.Supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4.
+The function convolves the source image with the specified Gaussian kernel. In-place filtering is supported. Supports 8UC1, 8UC4, 32SC1, 32SC4, 32FC1 and 32FC4 data types.

 ocl::Laplacian
 ------------------
@ -159,7 +511,7 @@ Returns void
 Convolves an image with the kernel. Supports only CV_32FC1 data types and do not support ROI.

 ocl::bilateralFilter
--------------------
+------------------------
 Returns void

 .. ocv:function:: void ocl::bilateralFilter(const oclMat &src, oclMat &dst, int d, double sigmaColor, double sigmaSpace, int borderType=BORDER_DEFAULT)
@ -178,8 +530,42 @@ Returns void

 Applies bilateral filter to the image. Supports 8UC1 8UC4 data types.

+ocl::adaptiveBilateralFilter
+--------------------------------
+Returns void
+
+.. ocv:function:: void ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, Point anchor = Point(-1, -1), int borderType=BORDER_DEFAULT)
+
+    :param src: The source image
+
+    :param dst: The destination image; will have the same size and the same type as src
+
+    :param ksize: The kernel size
+
+    :param sigmaSpace: Filter sigma in the coordinate space. Larger value of the parameter means that farther pixels will influence each other (as long as their colors are close enough; see sigmaColor). Then d>0, it specifies the neighborhood size regardless of sigmaSpace, otherwise d is proportional to sigmaSpace.
+
+    :param borderType: Pixel extrapolation method.
+
+A main part of our strategy will be to load each raw pixel once, and reuse it to calculate all pixels in the output (filtered) image that need this pixel value.
+
+.. math::
+
+    \emph{O}_i = \frac{1}{W_i}\sum\limits_{j\in{N(i)}}{\frac{1}{1+\frac{(V_i-V_j)^2}{\sigma_{N{'}(i)}^2}}*\frac{1}{1+\frac{d(i,j)^2}{\sum^2}}}V_j
+
+Local memory organization
+
+
+.. image:: images/adaptiveBilateralFilter.jpg
+                 :height: 250pt
+                 :width:  350pt
+                 :alt: Introduction Icon
+
+.. note:: We partition the image into non-overlapping blocks of size (Ux, Uy). Each such block will correspond to the pixel locations where we will calculate the filter result in one workgroup. Considering neighbourhoods of sizes (kx, ky), where kx = 2 dx + 1, and ky = 2 dy + 1 (in image ML, dx = dy = 1, and kx = ky = 3), it is clear that we need to load data of size Wx = Ux + 2 dx, Wy = Uy + 2 dy. Furthermore, if (Sx, Sy) is the top left pixel coordinate for a particular block, and (Sx + Ux - 1, Sy + Uy - 1) is the bottom right coordinate of the block, we need to load data starting at top left coordinate (PSx, PSy) = (Sx - dx, Sy - dy), and ending at bottom right coordinate (Sx + Ux - 1 + dx, Sy + Uy - 1 + dy). The workgroup layout is (Wx,1). However, to take advantage of the natural hardware properties (preferred wavefront sizes), we restrict Wx to be a multiple of that preferred wavefront size (for current AMD hardware this is typically 64). Each thread in the workgroup will load Wy elements (under the constraint that Wx*Wy*pixel width <= max local memory).
+
+Applies bilateral filter to the image. Supports 8UC1 8UC3 data types.
+
 ocl::copyMakeBorder
--------------------
+-----------------------
 Returns void

 .. ocv:function:: void ocl::copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int boardtype, const Scalar &value = Scalar())
@ -241,7 +627,7 @@ Returns void
 The function erodes the source image using the specified structuring element that determines the shape of a pixel neighborhood over which the minimum is taken. Supports 8UC1 8UC4 data types.

 ocl::morphologyEx
------------------
+---------------------
 Returns void

 .. ocv:function:: void ocl::morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1, int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue())
@ -277,7 +663,6 @@ Smoothes an image and downsamples it.
 .. seealso:: :ocv:func:`pyrDown`


-
 ocl::pyrUp
 -------------------
 Upsamples an image and then smoothes it.
@ -302,7 +687,7 @@ Computes a vertical (column) sum.


 ocl::blendLinear
-------------------
+--------------------
 Performs linear blending of two images.

 .. ocv:function:: void ocl::blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2, oclMat& result)
--- a/modules/ocl/doc/image_processing.rst
+++ b/modules/ocl/doc/image_processing.rst
@ -3,8 +3,82 @@ Image Processing

 .. highlight:: cpp

+ocl::meanShiftFiltering
+---------------------------
+Performs mean-shift filtering for each point of the source image.
+
+.. ocv:function:: void ocl::meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr, TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1))
+
+    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
+
+    :param dst: Destination image containing the color of mapped points. It has the same size and type as  ``src`` .
+
+    :param sp: Spatial window radius.
+
+    :param sr: Color window radius.
+
+    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
+
+It maps each point of the source image into another point. As a result, you have a new color and new position of each point.
+
+
+ocl::meanShiftProc
+----------------------
+Performs a mean-shift procedure and stores information about processed points (their colors and positions) in two images.
+
+.. ocv:function:: void ocl::meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr, TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1))
+
+    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
+
+    :param dstr: Destination image containing the color of mapped points. The size and type is the same as  ``src`` .
+
+    :param dstsp: Destination image containing the position of mapped points. The size is the same as  ``src`` size. The type is  ``CV_16SC2`` .
+
+    :param sp: Spatial window radius.
+
+    :param sr: Color window radius.
+
+    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
+
+.. seealso:: :ocv:func:`ocl::meanShiftFiltering`
+
+
+ocl::meanShiftSegmentation
+------------------------------
+Performs a mean-shift segmentation of the source image and eliminates small segments.
+
+.. ocv:function:: void ocl::meanShiftSegmentation(const oclMat &src, Mat &dst, int sp, int sr, int minsize, TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1))
+
+    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
+
+    :param dst: Segmented image with the same size and type as  ``src`` .
+
+    :param sp: Spatial window radius.
+
+    :param sr: Color window radius.
+
+    :param minsize: Minimum segment size. Smaller segments are merged.
+
+    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
+
+ocl::integral
+-----------------
+Computes an integral image.
+
+.. ocv:function:: void ocl::integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
+
+.. ocv:function:: void ocl::integral(const oclMat &src, oclMat &sum)
+
+    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
+
+    :param sum: Integral image containing 32-bit unsigned integer values packed into  ``CV_32SC1`` .
+
+    :param sqsum: Integral image of squared pixel values. Its type is ``CV_32FC1`` .
+
+.. seealso:: :ocv:func:`integral`
+
 ocl::cornerHarris
------------------
+---------------------
 Returns void

 .. ocv:function:: void ocl::cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT)
@ -24,7 +98,7 @@ Returns void
 Calculate Harris corner.

 ocl::cornerMinEigenVal
------------------------
+--------------------------
 Returns void

 .. ocv:function:: void ocl::cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT)
@ -53,6 +127,19 @@ Returns void

 Calculates histogram of one or more arrays. Supports only 8UC1 data type.

+ocl::equalizeHist
+---------------------
+Equalizes the histogram of a grayscale image.
+
+.. ocv:function:: void ocl::equalizeHist(const oclMat &mat_src, oclMat &mat_dst)
+
+    :param mat_src: Source image.
+
+    :param mat_dst: Destination image.
+
+.. seealso:: :ocv:func:`equalizeHist`
+
+
 ocl::remap
 ------------------
 Returns void
@ -96,7 +183,7 @@ Returns void
 Resizes an image. Supports CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 , CV_32FC3 and CV_32FC4 data types.

 ocl::warpAffine
------------------
+-------------------
 Returns void

 .. ocv:function:: void ocl::warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR)
@ -114,7 +201,7 @@ Returns void
 The function warpAffine transforms the source image using the specified matrix. Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC types.

 ocl::warpPerspective
---------------------
+------------------------
 Returns void

 .. ocv:function:: void ocl::warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR)
@ -209,7 +296,7 @@ Builds transformation maps for perspective transformation.


 ocl::buildWarpAffineMaps
------------------------
+----------------------------
 Builds transformation maps for affine transformation.

 .. ocv:function:: void ocl::buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, oclMat& xmap, oclMat& ymap)
@ -226,114 +313,6 @@ Builds transformation maps for affine transformation.

 .. seealso:: :ocv:func:`ocl::warpAffine` , :ocv:func:`ocl::remap`

-ocl::PyrLKOpticalFlow
---------------------
-.. ocv:class:: ocl::PyrLKOpticalFlow
-
-Class used for calculating an optical flow. ::
-
-    class PyrLKOpticalFlow
-    {
-    public:
-        PyrLKOpticalFlow();
-
-        void sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts,
-            oclMat& status, oclMat* err = 0);
-
-        void dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0);
-
-        Size winSize;
-        int maxLevel;
-        int iters;
-        double derivLambda;
-        bool useInitialFlow;
-        float minEigThreshold;
-        bool getMinEigenVals;
-
-        void releaseMemory();
-    };
-
-The class can calculate an optical flow for a sparse feature set or dense optical flow using the iterative Lucas-Kanade method with pyramids.
-
-.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
-
-.. note::
-
-   (Ocl) An example the Lucas Kanade optical flow pyramid method can be found at opencv_source_code/samples/ocl/pyrlk_optical_flow.cpp
-   (Ocl) An example for square detection can be found at opencv_source_code/samples/ocl/squares.cpp
-
-ocl::PyrLKOpticalFlow::sparse
-----------------------------
-Calculate an optical flow for a sparse feature set.
-
-.. ocv:function:: void ocl::PyrLKOpticalFlow::sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts, oclMat& status, oclMat* err = 0)
-
-    :param prevImg: First 8-bit input image (supports both grayscale and color images).
-
-    :param nextImg: Second input image of the same size and the same type as  ``prevImg`` .
-
-    :param prevPts: Vector of 2D points for which the flow needs to be found. It must be one row matrix with CV_32FC2 type.
-
-    :param nextPts: Output vector of 2D points (with single-precision floating-point coordinates) containing the calculated new positions of input features in the second image. When ``useInitialFlow`` is true, the vector must have the same size as in the input.
-
-    :param status: Output status vector (CV_8UC1 type). Each element of the vector is set to 1 if the flow for the corresponding features has been found. Otherwise, it is set to 0.
-
-    :param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
-
-.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
-
-
-
-ocl::PyrLKOpticalFlow::dense
-----------------------------
-Calculate dense optical flow.
-
-.. ocv:function:: void ocl::PyrLKOpticalFlow::dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0)
-
-    :param prevImg: First 8-bit grayscale input image.
-
-    :param nextImg: Second input image of the same size and the same type as  ``prevImg`` .
-
-    :param u: Horizontal component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
-
-    :param v: Vertical component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
-
-    :param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
-
-
-
-ocl::PyrLKOpticalFlow::releaseMemory
------------------------------------
-Releases inner buffers memory.
-
-.. ocv:function:: void ocl::PyrLKOpticalFlow::releaseMemory()
-
-
-ocl::interpolateFrames
----------------------
-Interpolate frames (images) using provided optical flow (displacement field).
-
-.. ocv:function:: void ocl::interpolateFrames(const oclMat& frame0, const oclMat& frame1, const oclMat& fu, const oclMat& fv, const oclMat& bu, const oclMat& bv, float pos, oclMat& newFrame, oclMat& buf)
-
-    :param frame0: First frame (32-bit floating point images, single channel).
-
-    :param frame1: Second frame. Must have the same type and size as ``frame0`` .
-
-    :param fu: Forward horizontal displacement.
-
-    :param fv: Forward vertical displacement.
-
-    :param bu: Backward horizontal displacement.
-
-    :param bv: Backward vertical displacement.
-
-    :param pos: New frame position.
-
-    :param newFrame: Output image.
-
-    :param buf: Temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 oclMat: occlusion masks for first frame, occlusion masks for second, interpolated forward horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow, interpolated backward vertical flow.
-
-
 ocl::HoughCircles
 -----------------
 Finds circles in a grayscale image using the Hough transform.
--- a/modules/ocl/doc/images/adaptiveBilateralFilter.jpg
+++ b/modules/ocl/doc/images/adaptiveBilateralFilter.jpg
--- a/modules/ocl/doc/matrix_reductions.rst
+++ b/modules/ocl/doc/matrix_reductions.rst
@ -4,7 +4,7 @@ Matrix Reductions
 .. highlight:: cpp

 ocl::countNonZero
------------------
+---------------------
 Returns the number of non-zero elements in src

 .. ocv:function:: int ocl::countNonZero(const oclMat &src)
@ -55,16 +55,26 @@ Returns the sum of matrix elements for each channel

 .. ocv:function:: Scalar ocl::sum(const oclMat &m)

-    :param m: The Source image of all depth
+    :param m: The source image of any depth.

 Counts the sum of matrix elements for each channel.

+ocl::absSum
+---------------
+Returns the sum of absolute values for matrix elements.
+
+.. ocv:function:: Scalar ocl::absSum(const oclMat &m)
+
+    :param m: The source image of any depth.
+
+Computes the sum of absolute values of matrix elements for each channel.
+
 ocl::sqrSum
 ------------------
 Returns the squared sum of matrix elements for each channel

 .. ocv:function:: Scalar ocl::sqrSum(const oclMat &m)

-    :param m: The Source image of all depth
+    :param m: The source image of any depth.

 Counts the squared sum of matrix elements for each channel.
--- a/modules/ocl/doc/ml_machine_learning.rst
+++ b/modules/ocl/doc/ml_machine_learning.rst
@ -0,0 +1,88 @@
+ml. Machine Learning
+=============================
+
+.. highlight:: cpp
+
+ocl::KNearestNeighbour
+--------------------------
+.. ocv:class:: ocl::KNearestNeighbour : public CvKNearest
+
+The class implements K-Nearest Neighbors model as described in the beginning of this section.
+
+ocl::KNearestNeighbour
+--------------------------
+Class implementing the K-Nearest Neighbors model. ::
+
+    class CV_EXPORTS KNearestNeighbour: public CvKNearest
+    {
+    public:
+        KNearestNeighbour();
+        ~KNearestNeighbour();
+
+        bool train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)),
+            bool isRegression = false, int max_k = 32, bool updateBase = false);
+
+        void clear();
+
+        void find_nearest(const oclMat& samples, int k, oclMat& lables);
+
+    private:
+        /* hidden */
+    };
+
+ocl::KNearestNeighbour::train
+---------------------------------
+Trains the model.
+
+.. ocv:function:: bool ocl::KNearestNeighbour::train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)), bool isRegression = false, int max_k = 32, bool updateBase = false)
+
+    :param isRegression: Type of the problem: ``true`` for regression and ``false`` for classification.
+
+    :param max_k: Number of maximum neighbors that may be passed to the method :ocv:func:`CvKNearest::find_nearest`.
+
+    :param updateBase: Specifies whether the model is trained from scratch (``updateBase=false``), or it is updated using the new training data (``updateBase=true``). In the latter case, the parameter ``max_k`` must not be larger than the original value.
+
+The method trains the K-Nearest model. It follows the conventions of the generic :ocv:func:`CvStatModel::train` approach with the following limitations:
+
+* Only ``CV_ROW_SAMPLE`` data layout is supported.
+* Input variables are all ordered.
+* Output variables can be either categorical ( ``isRegression=false`` ) or ordered ( ``isRegression=true`` ).
+* Variable subsets (``var_idx``) and missing measurements are not supported.
+
+ocl::KNearestNeighbour::find_nearest
+----------------------------------------
+Finds the neighbors and predicts responses for input vectors.
+
+.. ocv:function:: void ocl::KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& labels)
+
+    :param samples: Input samples stored by rows. It is a single-precision floating-point matrix of :math:`number\_of\_samples \times number\_of\_features` size.
+
+    :param k: Number of used nearest neighbors. It must satisfy constraint: :math:`k \le` :ocv:func:`CvKNearest::get_max_k`.
+
+    :param labels: Vector with results of prediction (regression or classification) for each input sample. It is a single-precision floating-point vector with ``number_of_samples`` elements.
+
+ocl::kmeans
+---------------
+Finds centers of clusters and groups input samples around the clusters.
+
+.. ocv:function:: double ocl::kmeans(const oclMat &src, int K, oclMat &bestLabels, TermCriteria criteria, int attempts, int flags, oclMat &centers)
+
+    :param src: Floating-point matrix of input samples, one row per sample.
+
+    :param K: Number of clusters to split the set by.
+
+    :param bestLabels: Input/output integer array that stores the cluster indices for every sample.
+
+    :param criteria: The algorithm termination criteria, that is, the maximum number of iterations and/or the desired accuracy. The accuracy is specified as ``criteria.epsilon``. As soon as each of the cluster centers moves by less than ``criteria.epsilon`` on some iteration, the algorithm stops.
+
+    :param attempts: Flag to specify the number of times the algorithm is executed using different initial labellings. The algorithm returns the labels that yield the best compactness (see the last function parameter).
+
+    :param flags: Flag that can take the following values:
+
+            * **KMEANS_RANDOM_CENTERS** Select random initial centers in each attempt.
+
+            * **KMEANS_PP_CENTERS** Use ``kmeans++`` center initialization by Arthur and Vassilvitskii [Arthur2007].
+
+            * **KMEANS_USE_INITIAL_LABELS** During the first (and possibly the only) attempt, use the user-supplied labels instead of computing them from the initial centers. For the second and further attempts, use the random or semi-random centers. Use one of  ``KMEANS_*_CENTERS``  flag to specify the exact method.
+
+    :param centers: Output matrix of the cluster centers, one row per each cluster center.
--- a/modules/ocl/doc/object_detection.rst
+++ b/modules/ocl/doc/object_detection.rst
@ -4,7 +4,7 @@ Object Detection
 .. highlight:: cpp

 ocl::OclCascadeClassifier
-------------------------
+-----------------------------
 .. ocv:class:: ocl::OclCascadeClassifier : public CascadeClassifier

 Cascade classifier class used for object detection. Supports HAAR cascade classifier  in the form of cross link ::
@ -21,20 +21,22 @@ Cascade classifier class used for object detection. Supports HAAR cascade classi

   (Ocl) A face detection example using cascade classifiers can be found at opencv_source_code/samples/ocl/facedetect.cpp

-ocl::OclCascadeClassifier::oclHaarDetectObjects
+ocl::OclCascadeClassifier::detectMultiScale
 ------------------------------------------------------
 Detects objects of different sizes in the input image.

 .. ocv:function:: void ocl::OclCascadeClassifier::detectMultiScale(oclMat &image, std::vector<cv::Rect>& faces, double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0, Size minSize = Size(), Size maxSize = Size())

-    :param image:  Matrix of type CV_8U containing an image where objects should be detected.
-
    :param faces: Vector of rectangles where each rectangle contains the detected object.

+    :param image:  Matrix of type CV_8U containing an image where objects should be detected.
+
    :param scaleFactor: Parameter specifying how much the image size is reduced at each image scale.

    :param minNeighbors: Parameter specifying how many neighbors each candidate rectangle should have to retain it.

+    :param flags: Parameter with the same meaning for an old cascade as in the function ``cvHaarDetectObjects``. It is not used for a new cascade.
+
    :param minSize: Minimum possible object size. Objects smaller than that are ignored.

    :param maxSize: Maximum possible object size. Objects larger than that are ignored.
@ -42,7 +44,7 @@ Detects objects of different sizes in the input image.
 The function provides a very similar interface with that in CascadeClassifier class, except using oclMat as input image.

 ocl::MatchTemplateBuf
---------------------
+-------------------------
 .. ocv:struct:: ocl::MatchTemplateBuf

 Class providing memory buffers for :ocv:func:`ocl::matchTemplate` function, plus it allows to adjust some specific parameters. ::
@ -59,7 +61,7 @@ Class providing memory buffers for :ocv:func:`ocl::matchTemplate` function, plus
 You can use field `user_block_size` to set specific block size for :ocv:func:`ocl::matchTemplate` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed.

 ocl::matchTemplate
------------------
+----------------------
 Computes a proximity map for a raster template and an image where the template is searched for.

 .. ocv:function:: void ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method)
--- a/modules/ocl/doc/ocl.rst
+++ b/modules/ocl/doc/ocl.rst
@ -12,7 +12,10 @@ ocl. OpenCL-accelerated Computer Vision
    matrix_reductions
    image_filtering
    image_processing
+    ml_machine_learning
    object_detection
    feature_detection_and_description
+    video_analysis
+    camera_calibration_and_3D_reconstruction
 ..    camera_calibration_and_3d_reconstruction
 ..    video
--- a/modules/ocl/doc/operations_on_matrices.rst
+++ b/modules/ocl/doc/operations_on_matrices.rst
@ -4,204 +4,224 @@ Operations on Matrics
 .. highlight:: cpp

 ocl::oclMat::convertTo
----------------------
+--------------------------
 Returns void

-.. ocv:function:: void ocl::oclMat::convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const
+.. ocv:function:: void ocl::oclMat::convertTo(oclMat &m, int rtype, double alpha = 1, double beta = 0) const

-    :param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated
+    :param m: the destination matrix. If it does not have a proper size or type before the operation, it will be reallocated.

-    :param rtype: The desired destination matrix type, or rather, the depth(since the number of channels will be the same with the source one). If rtype is negative, the destination matrix will have the same type as the source.
+    :param rtype: the desired destination matrix type, or rather, the depth (since the number of channels will be the same with the source one). If rtype is negative, the destination matrix will have the same type as the source.

-    :param alpha: must be default now
+    :param alpha: optional scale factor.

-    :param beta: must be default now
+    :param beta: optional delta added to the scaled values.

-The method converts source pixel values to the target datatype. saturate cast is applied in the end to avoid possible overflows. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4.
+The method converts source pixel values to the target datatype. Saturate cast is applied in the end to avoid possible overflows. Supports all data types.

 ocl::oclMat::copyTo
-------------------
+-----------------------
 Returns void

-.. ocv:function:: void ocl::oclMat::copyTo( oclMat &m, const oclMat &mask ) const
+.. ocv:function:: void ocl::oclMat::copyTo(oclMat &m, const oclMat &mask = oclMat()) const

-    :param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated
+    :param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated.

-    :param mask(optional): The operation mask. Its non-zero elements indicate, which matrix elements need to be copied
+    :param mask: The operation mask. Its non-zero elements indicate which matrix elements need to be copied.

-Copies the matrix to another one. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4
+Copies the matrix to another one. Supports all data types.

 ocl::oclMat::setTo
------------------
+----------------------
 Returns oclMat

 .. ocv:function:: oclMat& ocl::oclMat::setTo(const Scalar &s, const oclMat &mask = oclMat())

-    :param s: Assigned scalar, which is converted to the actual array type
+    :param s: Assigned scalar, which is converted to the actual array type.

-    :param mask: The operation mask of the same size as ``*this``
+    :param mask: The operation mask of the same size as ``*this`` and type ``CV_8UC1``.

-Sets all or some of the array elements to the specified value. This is the advanced variant of Mat::operator=(const Scalar s) operator. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4.
+Sets all or some of the array elements to the specified value. This is the advanced variant of Mat::operator=(const Scalar s) operator. Supports all data types.

 ocl::absdiff
 ------------------
 Returns void

-.. ocv:function:: void ocl::absdiff( const oclMat& a, const oclMat& b, oclMat& c )
+.. ocv:function:: void ocl::absdiff(const oclMat& src1, const oclMat& src2, oclMat& dst)

-.. ocv:function:: void ocl::absdiff( const oclMat& a, const Scalar& s, oclMat& c )
+.. ocv:function:: void ocl::absdiff(const oclMat& src1, const Scalar& s, oclMat& dst)

+    :param src1: the first input array.

-    :param a: The first input array
+    :param src2: the second input array, must be the same size and same type as ``src1``.

-    :param b: The second input array, must be the same size and same type as a
+    :param s: scalar, the second input parameter.

-    :param s: Scalar, the second input parameter
+    :param dst: the destination array, it will have the same size and same type as ``src1``.

-    :param c: The destination array, it will have the same size and same type as a
-
-Computes per-element absolute difference between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element absolute difference between two arrays or between array and a scalar. Supports all data types.

 ocl::add
 ------------------
 Returns void

-.. ocv:function:: void ocl::add( const oclMat & a, const oclMat & b, oclMat & c )
+.. ocv:function:: void ocl::add(const oclMat & src1, const oclMat & src2, oclMat & dst, const oclMat & mask = oclMat())

-.. ocv:function:: void ocl::add( const oclMat & a, const oclMat & b, oclMat & c, const oclMat & mask )
+.. ocv:function:: void ocl::add(const oclMat & src1, const Scalar & s, oclMat & dst, const oclMat & mask = oclMat())

-.. ocv:function:: void ocl::add( const oclMat & a, const Scalar & sc, oclMat & c, const oclMat & mask=oclMat() )
+    :param src1: the first input array.

-    :param a: The first input array
+    :param src2: the second input array, must be the same size and same type as ``src1``.

-    :param b: The second input array, must be the same size and same type as src1
+    :param s: scalar, the second input parameter.

-    :param sc: Scalar, the second input parameter
+    :param dst: the destination array, it will have the same size and same type as ``src1``.

-    :param c: The destination array, it will have the same size and same type as src1
+    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.

-    :param mask: he optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
+Computes per-element addition between two arrays or between array and a scalar. Supports all data types.

-Computes per-element additon between two arrays or between array and a scalar. Supports all data types except CV_8S.
+ocl::addWeighted
+--------------------
+Computes the weighted sum of two arrays.

-ocl::subtract
------------------
-Returns void
+.. ocv:function:: void ocl::addWeighted(const oclMat& src1, double alpha, const oclMat& src2, double beta, double gamma, oclMat& dst)
+
+    :param src1: First source array.

-.. ocv:function:: void ocl::subtract( const oclMat& a, const oclMat& b, oclMat& c )
+    :param alpha: Weight for the first array elements.

-.. ocv:function:: void ocl::subtract( const oclMat& a, const oclMat& b, oclMat& c, const oclMat& mask )
+    :param src2: Second source array of the same size and channel number as  ``src1`` .

-.. ocv:function:: void ocl::subtract( const oclMat& a, const Scalar& sc, oclMat& c, const oclMat& mask=oclMat() )
+    :param beta: Weight for the second array elements.

-.. ocv:function:: void ocl::subtract( const Scalar& sc, const oclMat& a, oclMat& c, const oclMat& mask=oclMat() )
+    :param dst: Destination array that has the same size and number of channels as the input arrays.

+    :param gamma: Scalar added to each sum.

-    :param a: The first input array
+The function ``addWeighted`` calculates the weighted sum of two arrays as follows:

-    :param b: The second input array, must be the same size and same type as src1
+.. math::

-    :param sc: Scalar, the second input parameter
+    \texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} +  \texttt{src2} (I)* \texttt{beta} +  \texttt{gamma} )

-    :param c: The destination array, it will have the same size and same type as src1
+where ``I`` is a multi-dimensional index of array elements. In case of multi-channel arrays, each channel is processed independently.

-    :param mask: he optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
+.. seealso:: :ocv:func:`addWeighted`

-Computes per-element subtract between two arrays or between array and a scalar. Supports all data types except CV_8S.
+ocl::subtract
+------------------
+Returns void
+
+.. ocv:function:: void ocl::subtract(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
+
+.. ocv:function:: void ocl::subtract(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
+
+    :param src1: the first input array.
+
+    :param src2: the second input array, must be the same size and same type as ``src1``.
+
+    :param s: scalar, the second input parameter.
+
+    :param dst: the destination array, it will have the same size and same type as ``src1``.
+
+    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
+
+Computes per-element subtraction between two arrays or between array and a scalar. Supports all data types.

 ocl::multiply
 ------------------
 Returns void

-.. ocv:function:: void ocl::multiply( const oclMat& a, const oclMat& b, oclMat& c, double scale=1 )
+.. ocv:function:: void ocl::multiply(const oclMat& src1, const oclMat& src2, oclMat& dst, double scale = 1)

-    :param a: The first input array
+    :param src1: the first input array.

-    :param b: The second input array, must be the same size and same type as src1
+    :param src2: the second input array, must be the same size and same type as ``src1``.

-    :param c: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src1``.

-    :param scale: must be 1 now
+    :param scale: optional scale factor.

-Computes per-element multiply between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element multiplication between two arrays or between array and a scalar. Supports all data types.

 ocl::divide
 ------------------
 Returns void

-.. ocv:function:: void ocl::divide( const oclMat& a, const oclMat& b, oclMat& c, double scale=1 )
+.. ocv:function:: void ocl::divide(const oclMat& src1, const oclMat& src2, oclMat& dst, double scale = 1)

-.. ocv:function:: void ocl::divide( double scale, const oclMat& b, oclMat& c )
+.. ocv:function:: void ocl::divide(double scale, const oclMat& src1, oclMat& dst)

-    :param a: The first input array
+    :param src1: the first input array.

-    :param b: The second input array, must be the same size and same type as src1
+    :param src2: the second input array, must be the same size and same type as ``src1``.

-    :param c: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src1``.

-    :param scale: must be 1 now
+    :param scale: scalar factor.

-Computes per-element divide between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element division between two arrays or between array and a scalar. Supports all data types.

 ocl::bitwise_and
 ------------------
 Returns void

-.. ocv:function:: void ocl::bitwise_and( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_and(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())

-.. ocv:function:: void ocl::bitwise_and( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_and(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())

-    :param src1: The first input array
+    :param src1: the first input array.

-    :param src2: The second input array, must be the same size and same type as src1
+    :param src2: the second input array, must be the same size and same type as ``src1``.

-    :param s: Scalar, the second input parameter
+    :param s: scalar, the second input parameter.

-    :param dst: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src1``.

-    :param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
+    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.

-Computes per-element bitwise_and between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element bitwise_and between two arrays or between array and a scalar. Supports all data types.

 ocl::bitwise_or
 ------------------
 Returns void

-.. ocv:function:: void ocl::bitwise_or( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_or(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())

-.. ocv:function:: void ocl::bitwise_or( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_or(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())

-    :param src1: The first input array
+    :param src1: the first input array.

-    :param src2: The second input array, must be the same size and same type as src1
+    :param src2: the second input array, must be the same size and same type as ``src1``.

-    :param s: Scalar, the second input parameter
+    :param s: scalar, the second input parameter.

-    :param dst: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src1``.

-    :param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
+    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.

-Computes per-element bitwise_or between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element bitwise_or between two arrays or between array and a scalar. Supports all data types.

 ocl::bitwise_xor
 ------------------
 Returns void

-.. ocv:function:: void ocl::bitwise_xor( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_xor(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())

-.. ocv:function:: void ocl::bitwise_xor( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_xor(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())

-    :param src1: The first input array
+    :param src1: the first input array.

-    :param src2: The second input array, must be the same size and same type as src1
+    :param src2: the second input array, must be the same size and same type as ``src1``.

-    :param sc: Scalar, the second input parameter
+    :param s: scalar, the second input parameter.

-    :param dst: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src1``.

-    :param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
+    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.

-Computes per-element bitwise_xor between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element bitwise_xor between two arrays or between array and a scalar. Supports all data types.

 ocl::bitwise_not
 ------------------
@ -209,11 +229,11 @@ Returns void

 .. ocv:function:: void ocl::bitwise_not(const oclMat &src, oclMat &dst)

-    :param src: The input array
+    :param src: the input array.

-    :param dst: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src``.

-The functions bitwise not compute per-element bit-wise inversion of the source array:. Supports all data types except CV_8S.
+The function ``bitwise_not`` computes per-element bit-wise inversion of the source array. Supports all data types.

 ocl::cartToPolar
 ------------------
@ -221,17 +241,17 @@ Returns void

 .. ocv:function:: void ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &magnitude, oclMat &angle, bool angleInDegrees = false)

-    :param x: The array of x-coordinates; must be single-precision or double-precision floating-point array
+    :param x: the array of x-coordinates; must be single-precision or double-precision floating-point array.

-    :param y: The array of y-coordinates; it must have the same size and same type as x
+    :param y: the array of y-coordinates; it must have the same size and same type as ``x``.

-    :param magnitude: The destination array of magnitudes of the same size and same type as x
+    :param magnitude: the destination array of magnitudes of the same size and same type as ``x``.

-    :param angle: The destination array of angles of the same size and same type as x. The angles are measured in radians (0 to 2pi ) or in degrees (0 to 360 degrees).
+    :param angle: the destination array of angles of the same size and same type as ``x``. The angles are measured in radians (0 to 2pi) or in degrees (0 to 360 degrees).

-    :param angleInDegrees: The flag indicating whether the angles are measured in radians, which is default mode, or in degrees
+    :param angleInDegrees: the flag indicating whether the angles are measured in radians, which is default mode, or in degrees.

-Calculates the magnitude and angle of 2d vectors. Supports only CV_32F and CV_64F data types.
+Calculates the magnitude and angle of 2D vectors. Supports only ``CV_32F`` and ``CV_64F`` data types.

 ocl::polarToCart
 ------------------
@ -239,57 +259,57 @@ Returns void

 .. ocv:function:: void ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees = false)

-    :param magnitude: The source floating-point array of magnitudes of 2D vectors. It can be an empty matrix (=Mat()) - in this case the function assumes that all the magnitudes are =1. If it's not empty, it must have the same size and same type as angle
+    :param magnitude: the source floating-point array of magnitudes of 2D vectors. It can be an empty matrix (=Mat()) - in this case the function assumes that all the magnitudes are equal to 1. If it's not empty, it must have the same size and same type as ``angle``.

-    :param angle: The source floating-point array of angles of the 2D vectors
+    :param angle: the source floating-point array of angles of the 2D vectors.

-    :param x: The destination array of x-coordinates of 2D vectors; will have the same size and the same type as angle
+    :param x: the destination array of x-coordinates of 2D vectors; will have the same size and the same type as ``angle``.

-    :param y: The destination array of y-coordinates of 2D vectors; will have the same size and the same type as angle
+    :param y: the destination array of y-coordinates of 2D vectors; will have the same size and the same type as ``angle``.

-    :param angleInDegrees: The flag indicating whether the angles are measured in radians, which is default mode, or in degrees
+    :param angleInDegrees: the flag indicating whether the angles are measured in radians, which is default mode, or in degrees.

-The function polarToCart computes the cartesian coordinates of each 2D vector represented by the corresponding elements of magnitude and angle. Supports only CV_32F and CV_64F data types.
+The function polarToCart computes the cartesian coordinates of each 2D vector represented by the corresponding elements of magnitude and angle. Supports only ``CV_32F`` and ``CV_64F`` data types.

 ocl::compare
 ------------------
 Returns void

-.. ocv:function:: void ocl::compare(const oclMat &a, const oclMat &b, oclMat &c, int cmpop)
+.. ocv:function:: void ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop)

-    :param a: The first source array
+    :param src1: the first source array.

-    :param b: The second source array; must have the same size and same type as a
+    :param src2: the second source array; must have the same size and same type as ``src1``.

-    :param c: The destination array; will have the same size as a
+    :param dst: the destination array; will have the same size as ``src1`` and type ``CV_8UC1``.

-    :param cmpop: The flag specifying the relation between the elements to be checked
+    :param cmpop: the flag specifying the relation between the elements to be checked.

-Performs per-element comparison of two arrays or an array and scalar value. Supports all the 1 channel data types except CV_8S.
+Performs per-element comparison of two arrays or an array and scalar value. Supports all data types.

 ocl::exp
 ------------------
 Returns void

-.. ocv:function:: void ocl::exp(const oclMat &a, oclMat &b)
+.. ocv:function:: void ocl::exp(const oclMat &src, oclMat &dst)

-    :param a: The first source array
+    :param src: the source array.

-    :param b: The dst array; must have the same size and same type as a
+    :param dst: the dst array; must have the same size and same type as ``src``.

-The function exp calculates the exponent of every element of the input array. Supports only CV_32FC1 data type.
+The function exp calculates the exponent of every element of the input array. Supports only ``CV_32FC1`` and ``CV_64F`` data types.

 ocl::log
 ------------------
 Returns void

-.. ocv:function:: void ocl::log(const oclMat &a, oclMat &b)
+.. ocv:function:: void ocl::log(const oclMat &src, oclMat &dst)

-    :param a: The first source array
+    :param src: the source array.

-    :param b: The dst array; must have the same size and same type as a
+    :param dst: the dst array; must have the same size and same type as ``src``.

-The function log calculates the log of every element of the input array. Supports only CV_32FC1 data type.
+The function log calculates the log of every element of the input array. Supports only ``CV_32FC1`` and ``CV_64F`` data types.

 ocl::LUT
 ------------------
@ -297,13 +317,13 @@ Returns void

 .. ocv:function:: void ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst)

-    :param src: Source array of 8-bit elements
+    :param src: source array of 8-bit elements.

-    :param lut: Look-up table of 256 elements. In the case of multi-channel source array, the table should either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the source array
+    :param lut: look-up table of 256 elements. In the case of multi-channel source array, the table should either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the source array.

-    :param dst: Destination array; will have the same size and the same number of channels as src, and the same depth as lut
+    :param dst: destination array; will have the same size and the same number of channels as ``src``, and the same depth as ``lut``.

-Performs a look-up table transform of an array. Supports only CV_8UC1 and CV_8UC4 data type.
+Performs a look-up table transform of an array.

 ocl::magnitude
 ------------------
@ -311,25 +331,25 @@ Returns void

 .. ocv:function:: void ocl::magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude)

-    :param x: The floating-point array of x-coordinates of the vectors
+    :param x: the floating-point array of x-coordinates of the vectors.

-    :param y: he floating-point array of y-coordinates of the vectors; must have the same size as x
+    :param y: the floating-point array of y-coordinates of the vectors; must have the same size as ``x``.

-    :param magnitude: The destination array; will have the same size and same type as x
+    :param magnitude: the destination array; will have the same size and same type as ``x``.

-The function magnitude calculates magnitude of 2D vectors formed from the corresponding elements of x and y arrays. Supports only CV_32F and CV_64F data type.
+The function magnitude calculates magnitude of 2D vectors formed from the corresponding elements of ``x`` and ``y`` arrays. Supports only ``CV_32F`` and ``CV_64F`` data types.

 ocl::flip
 ------------------
 Returns void

-.. ocv:function:: void ocl::flip( const oclMat& a, oclMat& b, int flipCode )
+.. ocv:function:: void ocl::flip(const oclMat& src, oclMat& dst, int flipCode)

-    :param a: Source image.
+    :param src: source image.

-    :param b: Destination image
+    :param dst: destination image.

-    :param flipCode: Specifies how to flip the array: 0 means flipping around the x-axis, positive (e.g., 1) means flipping around y-axis, and negative (e.g., -1) means flipping around both axes.
+    :param flipCode: specifies how to flip the array: 0 means flipping around the x-axis, positive (e.g., 1) means flipping around y-axis, and negative (e.g., -1) means flipping around both axes.

 The function flip flips the array in one of three different ways (row and column indices are 0-based). Supports all data types.

@ -339,13 +359,13 @@ Returns void

 .. ocv:function:: void ocl::meanStdDev(const oclMat &mtx, Scalar &mean, Scalar &stddev)

-    :param mtx: Source image.
+    :param mtx: source image.

-    :param mean: The output parameter: computed mean value
+    :param mean: the output parameter: computed mean value.

-    :param stddev: The output parameter: computed standard deviation
+    :param stddev: the output parameter: computed standard deviation.

-The functions meanStdDev compute the mean and the standard deviation M of array elements, independently for each channel, and return it via the output parameters. Supports all data types except CV_32F,CV_64F
+The function meanStdDev computes the mean and the standard deviation of array elements, independently for each channel, and returns them via the output parameters. Supports all data types except ``CV_32F``, ``CV_64F``.

 ocl::merge
 ------------------
@ -353,9 +373,9 @@ Returns void

 .. ocv:function:: void ocl::merge(const vector<oclMat> &src, oclMat &dst)

-    :param src: The source array or vector of the single-channel matrices to be merged. All the matrices in src must have the same size and the same type
+    :param src: The source array or vector of the single-channel matrices to be merged. All the matrices in src must have the same size and the same type.

-    :param dst: The destination array; will have the same size and the same depth as src, the number of channels will match the number of source matrices
+    :param dst: The destination array; will have the same size and the same depth as src, the number of channels will match the number of source matrices.

 Composes a multi-channel array from several single-channel arrays. Supports all data types.

@ -379,13 +399,13 @@ Returns the calculated norm

 .. ocv:function:: double ocl::norm(const oclMat &src1, const oclMat &src2, int normType = NORM_L2)

-    :param src1: The first source array
+    :param src1: the first source array.

-    :param src2: The second source array of the same size and the same type as src1
+    :param src2: the second source array of the same size and the same type as ``src1``.

-    :param normType: Type of the norm
+    :param normType: type of the norm.

-Calculates absolute array norm, absolute difference norm, or relative difference norm. Supports only CV_8UC1 data type.
+Calculates absolute array norm, absolute difference norm, or relative difference norm. Supports only ``CV_8UC1`` data type.

 ocl::phase
 ------------------
@ -393,15 +413,15 @@ Returns void

 .. ocv:function:: void ocl::phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false)

-    :param x: The source floating-point array of x-coordinates of 2D vectors
+    :param x: the source floating-point array of x-coordinates of 2D vectors.

-    :param y: The source array of y-coordinates of 2D vectors; must have the same size and the same type as x
+    :param y: the source array of y-coordinates of 2D vectors; must have the same size and the same type as ``x``.

-    :param angle: The destination array of vector angles; it will have the same size and same type as x
+    :param angle: the destination array of vector angles; it will have the same size and same type as ``x``.

-    :param angleInDegrees: When it is true, the function will compute angle in degrees, otherwise they will be measured in radians
+    :param angleInDegrees: when it is true, the function will compute angle in degrees, otherwise they will be measured in radians.

-The function phase computes the rotation angle of each 2D vector that is formed from the corresponding elements of x and y. Supports only CV_32FC1 and CV_64FC1 data type.
+The function phase computes the rotation angle of each 2D vector that is formed from the corresponding elements of ``x`` and ``y``. Supports only ``CV_32FC1`` and ``CV_64FC1`` data types.

 ocl::pow
 ------------------
@ -409,13 +429,13 @@ Returns void

 .. ocv:function:: void ocl::pow(const oclMat &x, double p, oclMat &y)

-    :param x: The source array
+    :param x: the source array.

-    :param power: The exponent of power;The source floating-point array of angles of the 2D vectors
+    :param p: the exponent to which every element of the source array is raised.

-    :param y: The destination array, should be the same type as the source
+    :param y: the destination array, should be the same type as the source.

-The function pow raises every element of the input array to p. Supports only CV_32FC1 and CV_64FC1 data type.
+The function pow raises every element of the input array to ``p``. Supports only ``CV_32FC1`` and ``CV_64FC1`` data types.

 ocl::transpose
 ------------------
@ -423,26 +443,26 @@ Returns void

 .. ocv:function:: void ocl::transpose(const oclMat &src, oclMat &dst)

-    :param src: The source array
+    :param src: the source array.

-    :param dst: The destination array of the same type as src
+    :param dst: the destination array of the same type as ``src``.

-Transposes a matrix. Supports 8UC1, 8UC4, 8SC4, 16UC2, 16SC2, 32SC1 and 32FC1 data types.
+Transposes a matrix (when ``src`` == ``dst`` and the matrix is square, the operation is performed in place).


 ocl::dft
 ------------
 Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.

-.. ocv:function:: void ocl::dft( const oclMat& src, oclMat& dst, Size dft_size=Size(0, 0), int flags=0 )
+.. ocv:function:: void ocl::dft(const oclMat& src, oclMat& dst, Size dft_size = Size(), int flags = 0)

-    :param src: Source matrix (real or complex).
+    :param src: source matrix (real or complex).

-    :param dst: Destination matrix (real or complex).
+    :param dst: destination matrix (real or complex).

-    :param dft_size: Size of original input, which is used for transformation from complex to real.
+    :param dft_size: size of original input, which is used for transformation from complex to real.

-    :param flags: Optional flags:
+    :param flags: optional flags:

        * **DFT_ROWS** transforms each individual row of the source matrix.

@ -452,9 +472,9 @@ Performs a forward or inverse discrete Fourier transform (1D or 2D) of the float

        * **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of real-complex transform, so the destination matrix must be real.

-Use to handle real matrices ( ``CV32FC1`` ) and complex matrices in the interleaved format ( ``CV32FC2`` ).
+Use to handle real matrices (``CV_32FC1``) and complex matrices in the interleaved format (``CV_32FC2``).

-The dft_size must be powers of 2, 3 and 5. Real to complex dft output is not the same with cpu version. real to complex and complex to real does not support DFT_ROWS
+The ``dft_size`` must be powers of ``2``, ``3`` and ``5``. Real-to-complex DFT output is not the same as in the CPU version. Real-to-complex and complex-to-real transforms do not support ``DFT_ROWS``.

 .. seealso:: :ocv:func:`dft`

@ -464,22 +484,22 @@ Performs generalized matrix multiplication.

 .. ocv:function:: void ocl::gemm(const oclMat& src1, const oclMat& src2, double alpha, const oclMat& src3, double beta, oclMat& dst, int flags = 0)

-    :param src1: First multiplied input matrix that should be ``CV_32FC1`` type.
+    :param src1: first multiplied input matrix that should be ``CV_32FC1`` type.

-    :param src2: Second multiplied input matrix of the same type as  ``src1`` .
+    :param src2: second multiplied input matrix of the same type as ``src1``.

-    :param alpha: Weight of the matrix product.
+    :param alpha: weight of the matrix product.

-    :param src3: Third optional delta matrix added to the matrix product. It should have the same type as  ``src1``  and  ``src2`` .
+    :param src3: third optional delta matrix added to the matrix product. It should have the same type as ``src1`` and ``src2``.

-    :param beta: Weight of  ``src3`` .
+    :param beta: weight of ``src3``.

-    :param dst: Destination matrix. It has the proper size and the same type as input matrices.
+    :param dst: destination matrix. It has the proper size and the same type as input matrices.

-    :param flags: Operation flags:
+    :param flags: operation flags:

-            * **GEMM_1_T** transpose  ``src1``
-            * **GEMM_2_T** transpose  ``src2``
+            * **GEMM_1_T** transpose ``src1``.
+            * **GEMM_2_T** transpose ``src2``.

 .. seealso:: :ocv:func:`gemm`

@ -489,28 +509,29 @@ Returns void

 .. ocv:function:: void ocl::sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false)

-    :param keys:   The keys to be used as sorting indices.
+    :param keys: the keys to be used as sorting indices.

-    :param values: The array of values.
+    :param values: the array of values.

-    :param isGreaterThan: Determine sorting order.
+    :param isGreaterThan: determine sorting order.

    :param method: supported sorting methods:
-            * **SORT_BITONIC**   bitonic sort, only support power-of-2 buffer size
-            * **SORT_SELECTION** selection sort, currently cannot sort duplicate keys
-            * **SORT_MERGE**     merge sort
-            * **SORT_RADIX**     radix sort, only support signed int/float keys(``CV_32S``/``CV_32F``)
+
+            * **SORT_BITONIC**   bitonic sort, only supports power-of-2 buffer sizes.
+            * **SORT_SELECTION** selection sort, currently cannot sort duplicate keys.
+            * **SORT_MERGE**     merge sort.
+            * **SORT_RADIX**     radix sort, only supports signed int/float keys (``CV_32S``/``CV_32F``).

 Returns the sorted result of all the elements in values based on equivalent keys.

-The element unit in the values to be sorted is determined from the data type,
-i.e., a ``CV_32FC2`` input ``{a1a2, b1b2}`` will be considered as two elements, regardless its matrix dimension.
+The element unit in the values to be sorted is determined from the data type, i.e., a ``CV_32FC2`` input ``{a1a2, b1b2}`` will be considered as two elements, regardless of its matrix dimension.

 Both keys and values will be sorted inplace.

-Keys needs to be a **single** channel `oclMat`.
+Keys needs to be a **single** channel ``oclMat``.

 Example::
+
    input -
    keys   = {2,    3,   1}   (CV_8UC1)
    values = {10,5, 4,3, 6,2} (CV_8UC2)
--- a/modules/ocl/doc/structures_and_utility_functions.rst
+++ b/modules/ocl/doc/structures_and_utility_functions.rst
@ -4,7 +4,7 @@ Data Structures and Utility Functions
 .. highlight:: cpp

 ocl::Info
---------
+-------------
 .. ocv:class:: ocl::Info

 this class should be maintained by the user and be passed to getDevice
@ -42,7 +42,7 @@ Returns void
 If you call this function and set a valid path, the OCL module will save the compiled kernel to the address in the first time and reload the binary since that. It can save compilation time at the runtime.

 ocl::getoclContext
------------------
+----------------------
 Returns the pointer to the opencl context

 .. ocv:function:: void* ocl::getoclContext()
--- a/modules/ocl/doc/video_analysis.rst
+++ b/modules/ocl/doc/video_analysis.rst
@ -0,0 +1,570 @@
+Video Analysis
+=============================
+
+.. highlight:: cpp
+
+ocl::GoodFeaturesToTrackDetector_OCL
+----------------------------------------
+.. ocv:class:: ocl::GoodFeaturesToTrackDetector_OCL
+
+Class used for strong corners detection on an image. ::
+
+    class GoodFeaturesToTrackDetector_OCL
+    {
+    public:
+        explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
+            int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
+
+        //! return 1 rows matrix with CV_32FC2 type
+        void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat());
+        //! download points of type Point2f to a vector. the vector's content will be erased
+        void downloadPoints(const oclMat &points, std::vector<Point2f> &points_v);
+
+        int maxCorners;
+        double qualityLevel;
+        double minDistance;
+
+        int blockSize;
+        bool useHarrisDetector;
+        double harrisK;
+        void releaseMemory()
+        {
+            Dx_.release();
+            Dy_.release();
+            eig_.release();
+            minMaxbuf_.release();
+            tmpCorners_.release();
+        }
+    };
+
+The class finds the most prominent corners in the image.
+
+.. seealso:: :ocv:func:`goodFeaturesToTrack()`
+
+ocl::GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL
+-------------------------------------------------------------------------
+Constructor.
+
+.. ocv:function:: ocl::GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0, int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04)
+
+    :param maxCorners: Maximum number of corners to return. If more corners are found than ``maxCorners``, only the strongest of them are returned.
+
+    :param qualityLevel: Parameter characterizing the minimal accepted quality of image corners. The parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue (see  :ocv:func:`ocl::cornerMinEigenVal` ) or the Harris function response (see  :ocv:func:`ocl::cornerHarris` ). The corners with the quality measure less than the product are rejected. For example, if the best corner has the quality measure = 1500, and the  ``qualityLevel=0.01`` , then all the corners with the quality measure less than 15 are rejected.
+
+    :param minDistance: Minimum possible Euclidean distance between the returned corners.
+
+    :param blockSize: Size of an average block for computing a derivative covariation matrix over each pixel neighborhood. See  :ocv:func:`cornerEigenValsAndVecs` .
+
+    :param useHarrisDetector: Parameter indicating whether to use a Harris detector (see :ocv:func:`ocl::cornerHarris`) or :ocv:func:`ocl::cornerMinEigenVal`.
+
+    :param harrisK: Free parameter of the Harris detector.
+
+ocl::GoodFeaturesToTrackDetector_OCL::operator ()
+-------------------------------------------------------
+Finds the most prominent corners in the image.
+
+.. ocv:function:: void ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat())
+
+    :param image: Input 8-bit, single-channel image.
+
+    :param corners: Output vector of detected corners (it will be one row matrix with CV_32FC2 type).
+
+    :param mask: Optional region of interest. If the mask is not empty (it needs to have the type ``CV_8UC1`` and the same size as ``image``), it specifies the region in which the corners are detected.
+
+.. seealso:: :ocv:func:`goodFeaturesToTrack`
+
+ocl::GoodFeaturesToTrackDetector_OCL::releaseMemory
+--------------------------------------------------------
+Releases inner buffers memory.
+
+.. ocv:function:: void ocl::GoodFeaturesToTrackDetector_OCL::releaseMemory()
+
+ocl::FarnebackOpticalFlow
+-------------------------------
+.. ocv:class:: ocl::FarnebackOpticalFlow
+
+Class computing a dense optical flow using Gunnar Farneback's algorithm. ::
+
+    class CV_EXPORTS FarnebackOpticalFlow
+    {
+    public:
+        FarnebackOpticalFlow();
+
+        int numLevels;
+        double pyrScale;
+        bool fastPyramids;
+        int winSize;
+        int numIters;
+        int polyN;
+        double polySigma;
+        int flags;
+
+        void operator ()(const oclMat &frame0, const oclMat &frame1, oclMat &flowx, oclMat &flowy);
+
+        void releaseMemory();
+
+    private:
+        /* hidden */
+    };
+
+ocl::FarnebackOpticalFlow::operator ()
+------------------------------------------
+Computes a dense optical flow using Gunnar Farneback's algorithm.
+
+.. ocv:function:: void ocl::FarnebackOpticalFlow::operator ()(const oclMat &frame0, const oclMat &frame1, oclMat &flowx, oclMat &flowy)
+
+    :param frame0: First 8-bit gray-scale input image
+    :param frame1: Second 8-bit gray-scale input image
+    :param flowx: Flow horizontal component
+    :param flowy: Flow vertical component
+    :param s: Stream
+
+.. seealso:: :ocv:func:`calcOpticalFlowFarneback`
+
+ocl::FarnebackOpticalFlow::releaseMemory
+--------------------------------------------
+Releases unused auxiliary memory buffers.
+
+.. ocv:function:: void ocl::FarnebackOpticalFlow::releaseMemory()
+
+
+ocl::PyrLKOpticalFlow
+-------------------------
+.. ocv:class:: ocl::PyrLKOpticalFlow
+
+Class used for calculating an optical flow. ::
+
+    class PyrLKOpticalFlow
+    {
+    public:
+        PyrLKOpticalFlow();
+
+        void sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts,
+            oclMat& status, oclMat* err = 0);
+
+        void dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0);
+
+        Size winSize;
+        int maxLevel;
+        int iters;
+        double derivLambda;
+        bool useInitialFlow;
+        float minEigThreshold;
+        bool getMinEigenVals;
+
+        void releaseMemory();
+
+    private:
+        /* hidden */
+    };
+
+The class can calculate an optical flow for a sparse feature set or dense optical flow using the iterative Lucas-Kanade method with pyramids.
+
+.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
+
+ocl::PyrLKOpticalFlow::sparse
+---------------------------------
+Calculates an optical flow for a sparse feature set.
+
+.. ocv:function:: void ocl::PyrLKOpticalFlow::sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts, oclMat& status, oclMat* err = 0)
+
+    :param prevImg: First 8-bit input image (supports both grayscale and color images).
+
+    :param nextImg: Second input image of the same size and the same type as  ``prevImg`` .
+
+    :param prevPts: Vector of 2D points for which the flow needs to be found. It must be one row matrix with CV_32FC2 type.
+
+    :param nextPts: Output vector of 2D points (with single-precision floating-point coordinates) containing the calculated new positions of input features in the second image. When ``useInitialFlow`` is true, the vector must have the same size as in the input.
+
+    :param status: Output status vector (CV_8UC1 type). Each element of the vector is set to 1 if the flow for the corresponding features has been found. Otherwise, it is set to 0.
+
+    :param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
+
+.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
+
+
+ocl::PyrLKOpticalFlow::dense
+---------------------------------
+Calculates a dense optical flow.
+
+.. ocv:function:: void ocl::PyrLKOpticalFlow::dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0)
+
+    :param prevImg: First 8-bit grayscale input image.
+
+    :param nextImg: Second input image of the same size and the same type as  ``prevImg`` .
+
+    :param u: Horizontal component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
+
+    :param v: Vertical component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
+
+    :param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
+
+
+ocl::PyrLKOpticalFlow::releaseMemory
+----------------------------------------
+Releases inner buffers memory.
+
+.. ocv:function:: void ocl::PyrLKOpticalFlow::releaseMemory()
+
+ocl::interpolateFrames
+--------------------------
+Interpolates frames (images) using provided optical flow (displacement field).
+
+.. ocv:function:: void ocl::interpolateFrames(const oclMat& frame0, const oclMat& frame1, const oclMat& fu, const oclMat& fv, const oclMat& bu, const oclMat& bv, float pos, oclMat& newFrame, oclMat& buf)
+
+    :param frame0: First frame (32-bit floating point images, single channel).
+
+    :param frame1: Second frame. Must have the same type and size as ``frame0`` .
+
+    :param fu: Forward horizontal displacement.
+
+    :param fv: Forward vertical displacement.
+
+    :param bu: Backward horizontal displacement.
+
+    :param bv: Backward vertical displacement.
+
+    :param pos: New frame position.
+
+    :param newFrame: Output image.
+
+    :param buf: Temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 oclMat: occlusion masks for first frame, occlusion masks for second, interpolated forward horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow, interpolated backward vertical flow.
+
+    :param stream: Stream for the asynchronous version.
+
+ocl::KalmanFilter
+--------------------
+.. ocv:class:: ocl::KalmanFilter
+
+Kalman filter class. ::
+
+    class CV_EXPORTS KalmanFilter
+    {
+    public:
+        KalmanFilter();
+        //! the full constructor taking the dimensionality of the state, of the measurement and of the control vector
+        KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
+        //! re-initializes Kalman filter. The previous content is destroyed.
+        void init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
+
+        const oclMat& predict(const oclMat& control=oclMat());
+        const oclMat& correct(const oclMat& measurement);
+
+        oclMat statePre; //!< predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k)
+        oclMat statePost; //!< corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
+        oclMat transitionMatrix; //!< state transition matrix (A)
+        oclMat controlMatrix; //!< control matrix (B) (not used if there is no control)
+        oclMat measurementMatrix; //!< measurement matrix (H)
+        oclMat processNoiseCov; //!< process noise covariance matrix (Q)
+        oclMat measurementNoiseCov;//!< measurement noise covariance matrix (R)
+        oclMat errorCovPre; //!< priori error estimate covariance matrix (P'(k)): P'(k)=A*P(k-1)*At + Q
+        oclMat gain; //!< Kalman gain matrix (K(k)): K(k)=P'(k)*Ht*inv(H*P'(k)*Ht+R)
+        oclMat errorCovPost; //!< posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k)
+    private:
+        /* hidden */
+    };
+
+ocl::KalmanFilter::KalmanFilter
+----------------------------------
+The constructors.
+
+.. ocv:function:: ocl::KalmanFilter::KalmanFilter()
+
+.. ocv:function:: ocl::KalmanFilter::KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F)
+
+    The full constructor.
+
+    :param dynamParams: Dimensionality of the state.
+
+    :param measureParams: Dimensionality of the measurement.
+
+    :param controlParams: Dimensionality of the control vector.
+
+    :param type: Type of the created matrices that should be ``CV_32F`` or ``CV_64F``.
+
+
+ocl::KalmanFilter::init
+---------------------------
+Re-initializes Kalman filter. The previous content is destroyed.
+
+.. ocv:function:: void ocl::KalmanFilter::init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F)
+
+    :param dynamParams: Dimensionality of the state.
+
+    :param measureParams: Dimensionality of the measurement.
+
+    :param controlParams: Dimensionality of the control vector.
+
+    :param type: Type of the created matrices that should be ``CV_32F`` or ``CV_64F``.
+
+
+ocl::KalmanFilter::predict
+------------------------------
+Computes a predicted state.
+
+.. ocv:function:: const oclMat& ocl::KalmanFilter::predict(const oclMat& control=oclMat())
+
+    :param control: The optional input control
+
+
+ocl::KalmanFilter::correct
+-----------------------------
+Updates the predicted state from the measurement.
+
+.. ocv:function:: const oclMat& ocl::KalmanFilter::correct(const oclMat& measurement)
+
+    :param measurement: The measured system parameters
+
+
+ocl::BackgroundSubtractor
+----------------------------
+.. ocv:class:: ocl::BackgroundSubtractor
+
+Base class for background/foreground segmentation. ::
+
+    class CV_EXPORTS BackgroundSubtractor
+    {
+    public:
+        //! the virtual destructor
+        virtual ~BackgroundSubtractor();
+        //! the update operator that takes the next video frame and returns the current foreground mask as 8-bit binary image.
+        virtual void operator()(const oclMat& image, oclMat& fgmask, float learningRate);
+
+        //! computes a background image
+        virtual void getBackgroundImage(oclMat& backgroundImage) const = 0;
+    };
+
+
+The class is only used to define the common interface for the whole family of background/foreground segmentation algorithms.
+
+
+ocl::BackgroundSubtractor::operator()
+-----------------------------------------
+Computes a foreground mask.
+
+.. ocv:function:: void ocl::BackgroundSubtractor::operator()(const oclMat& image, oclMat& fgmask, float learningRate)
+
+    :param image: Next video frame.
+
+    :param fgmask: The output foreground mask as an 8-bit binary image.
+
+    :param learningRate: Learning rate used to update the background model.
+
+
+ocl::BackgroundSubtractor::getBackgroundImage
+-------------------------------------------------
+Computes a background image.
+
+.. ocv:function:: void ocl::BackgroundSubtractor::getBackgroundImage(oclMat& backgroundImage) const
+
+    :param backgroundImage: The output background image.
+
+.. note:: Sometimes the background image can be very blurry, as it contains the average background statistics.
+
+ocl::MOG
+------------
+.. ocv:class:: ocl::MOG : public ocl::BackgroundSubtractor
+
+Gaussian Mixture-based Background/Foreground Segmentation Algorithm. ::
+
+    class CV_EXPORTS MOG: public cv::ocl::BackgroundSubtractor
+    {
+    public:
+        //! the default constructor
+        MOG(int nmixtures = -1);
+
+        //! re-initialization method
+        void initialize(Size frameSize, int frameType);
+
+        //! the update operator
+        void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = 0.f);
+
+        //! computes a background image which is the mean of all background gaussians
+        void getBackgroundImage(oclMat& backgroundImage) const;
+
+        //! releases all inner buffers
+        void release();
+
+        int history;
+        float varThreshold;
+        float backgroundRatio;
+        float noiseSigma;
+
+    private:
+        /* hidden */
+    };
+
+The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements the algorithm described in [MOG2001]_.
+
+.. seealso:: :ocv:class:`BackgroundSubtractorMOG`
+
+
+ocl::MOG::MOG
+---------------------
+The constructor.
+
+.. ocv:function:: ocl::MOG::MOG(int nmixtures = -1)
+
+    :param nmixtures: Number of Gaussian mixtures.
+
+Default constructor sets all parameters to default values.
+
+
+ocl::MOG::operator()
+------------------------
+Updates the background model and returns the foreground mask.
+
+.. ocv:function:: void ocl::MOG::operator()(const oclMat& frame, oclMat& fgmask, float learningRate = 0.f)
+
+    :param frame: Next video frame.
+
+    :param fgmask: The output foreground mask as an 8-bit binary image.
+
+    :param learningRate: Learning rate used to update the background model.
+
+
+ocl::MOG::getBackgroundImage
+--------------------------------
+Computes a background image.
+
+.. ocv:function:: void ocl::MOG::getBackgroundImage(oclMat& backgroundImage) const
+
+    :param backgroundImage: The output background image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+ocl::MOG::release
+---------------------
+Releases the memory of all inner buffers.
+
+.. ocv:function:: void ocl::MOG::release()
+
+
+ocl::MOG2
+-------------
+.. ocv:class:: ocl::MOG2 : public ocl::BackgroundSubtractor
+
+Gaussian Mixture-based Background/Foreground Segmentation Algorithm. ::
+
+    class CV_EXPORTS MOG2: public cv::ocl::BackgroundSubtractor
+    {
+    public:
+        //! the default constructor
+        MOG2(int nmixtures = -1);
+
+        //! re-initialization method
+        void initialize(Size frameSize, int frameType);
+
+        //! the update operator
+        void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = -1.0f);
+
+        //! computes a background image which is the mean of all background gaussians
+        void getBackgroundImage(oclMat& backgroundImage) const;
+
+        //! releases all inner buffers
+        void release();
+
+        int history;
+
+        float varThreshold;
+
+        float backgroundRatio;
+
+        float varThresholdGen;
+
+        float fVarInit;
+        float fVarMin;
+        float fVarMax;
+
+        float fCT;
+
+        bool bShadowDetection;
+        unsigned char nShadowDetection;
+        float fTau;
+
+    private:
+        /* hidden */
+    };
+
+  The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements the algorithm described in [MOG2004]_.
+
+  Here are important members of the class that control the algorithm, which you can set after constructing the class instance:
+
+    .. ocv:member:: float backgroundRatio
+
+        Threshold defining whether the component is significant enough to be included into the background model. ``cf=0.1 => TB=0.9`` is default. For ``alpha=0.001``, it means that the mode should exist for approximately 105 frames before it is considered foreground.
+
+    .. ocv:member:: float varThreshold
+
+        Threshold for the squared Mahalanobis distance that helps decide when a sample is close to the existing components (corresponds to ``Tg``). If it is not close to any component, a new component is generated. ``3 sigma => Tg=3*3=9`` is default. A smaller ``Tg`` value generates more components. A higher ``Tg`` value may result in a small number of components but they can grow too large.
+
+    .. ocv:member:: float fVarInit
+
+        Initial variance for the newly generated components. It affects the speed of adaptation. The parameter value is based on your estimate of the typical standard deviation from the images. OpenCV uses 15 as a reasonable value.
+
+    .. ocv:member:: float fVarMin
+
+        Parameter used to further control the variance.
+
+    .. ocv:member:: float fVarMax
+
+        Parameter used to further control the variance.
+
+    .. ocv:member:: float fCT
+
+        Complexity reduction parameter. This parameter defines the number of samples needed to accept to prove the component exists. ``CT=0.05`` is a default value for all the samples. By setting ``CT=0`` you get an algorithm very similar to the standard Stauffer&Grimson algorithm.
+
+    .. ocv:member:: uchar nShadowDetection
+
+        The value for marking shadow pixels in the output foreground mask. Default value is 127.
+
+    .. ocv:member:: float fTau
+
+        Shadow threshold. The shadow is detected if the pixel is a darker version of the background. ``Tau`` is a threshold defining how much darker the shadow can be. ``Tau= 0.5`` means that if a pixel is more than twice darker then it is not shadow. See [ShadowDetect2003]_.
+
+    .. ocv:member:: bool bShadowDetection
+
+        Parameter defining whether shadow detection should be enabled.
+
+.. seealso:: :ocv:class:`BackgroundSubtractorMOG2`
+
+
+ocl::MOG2::MOG2
+-----------------------
+The constructor.
+
+.. ocv:function:: ocl::MOG2::MOG2(int nmixtures = -1)
+
+    :param nmixtures: Number of Gaussian mixtures.
+
+Default constructor sets all parameters to default values.
+
+
+ocl::MOG2::operator()
+-------------------------
+Updates the background model and returns the foreground mask.
+
+.. ocv:function:: void ocl::MOG2::operator()( const oclMat& frame, oclMat& fgmask, float learningRate=-1.0f)
+
+    :param frame: Next video frame.
+
+    :param fgmask: The output foreground mask as an 8-bit binary image.
+
+    :param learningRate: Learning rate used to update the background model.
+
+
+ocl::MOG2::getBackgroundImage
+---------------------------------
+Computes a background image.
+
+.. ocv:function:: void ocl::MOG2::getBackgroundImage(oclMat& backgroundImage) const
+
+    :param backgroundImage: The output background image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+ocl::MOG2::release
+----------------------
+Releases the memory of all inner buffers.
+
+.. ocv:function:: void ocl::MOG2::release()
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@ -50,6 +50,7 @@
 #include "opencv2/core.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/objdetect.hpp"
+#include "opencv2/ml.hpp"

 namespace cv
 {
@ -153,8 +154,8 @@ namespace cv
            static void setContext(Info &oclinfo);

            enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_VER_1_2};
-            bool supportsFeature(int ftype);
-            size_t computeUnits();
+            bool supportsFeature(int ftype) const;
+            size_t computeUnits() const;
            size_t maxWorkGroupSize();
            void* oclContext();
            void* oclCommandQueue();
@ -264,13 +265,12 @@ namespace cv

            //! returns deep copy of the oclMatrix, i.e. the data is copied
            oclMat clone() const;
-            //! copies the oclMatrix content to "m".
+
+            //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
            // It calls m.create(this->size(), this->type()).
            // It supports any data type
-            void copyTo( oclMat &m ) const;
-            //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
-            //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
-            void copyTo( oclMat &m, const oclMat &mask ) const;
+            void copyTo( oclMat &m, const oclMat &mask = oclMat()) const;
+
            //! converts oclMatrix to another datatype with optional scalng. See cvConvertScale.
            //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
            void convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const;
@ -407,61 +407,52 @@ namespace cv
        CV_EXPORTS void split(const oclMat &src, std::vector<oclMat> &dst);

        ////////////////////////////// Arithmetics ///////////////////////////////////
-        //#if defined DOUBLE_SUPPORT
-        //typedef double F;
-        //#else
-        //typedef float F;
-        //#endif
-        //	CV_EXPORTS void addWeighted(const oclMat& a,F  alpha, const oclMat& b,F beta,F gama, oclMat& c);
-        CV_EXPORTS void addWeighted(const oclMat &a, double  alpha, const oclMat &b, double beta, double gama, oclMat &c);
-        //! adds one matrix to another (c = a + b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c);
-        //! adds one matrix to another (c = a + b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
-        //! adds scalar to a matrix (c = a + s)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void add(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
-        //! subtracts one matrix from another (c = a - b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c);
-        //! subtracts one matrix from another (c = a - b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
-        //! subtracts scalar from a matrix (c = a - s)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void subtract(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
-        //! subtracts scalar from a matrix (c = a - s)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void subtract(const Scalar &sc, const oclMat &a, oclMat &c, const oclMat &mask = oclMat());
-        //! computes element-wise product of the two arrays (c = a * b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
+
+        //! adds one matrix to another with scale (dst = src1 * alpha + src2 * beta + gama)
+        CV_EXPORTS void addWeighted(const oclMat &src1, double  alpha, const oclMat &src2, double beta, double gama, oclMat &dst);
+
+        //! adds one matrix to another (dst = src1 + src2)
+        // supports all data types
+        CV_EXPORTS void add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
+        //! adds scalar to a matrix (dst = src1 + s)
+        // supports all data types
+        CV_EXPORTS void add(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+
+        //! subtracts one matrix from another (dst = src1 - src2)
+        // supports all data types
+        CV_EXPORTS void subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
+        //! subtracts scalar from a matrix (dst = src1 - s)
+        // supports all data types
+        CV_EXPORTS void subtract(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+
+        //! computes element-wise product of the two arrays (dst = src1 * scale * src2)
+        // supports all data types
+        CV_EXPORTS void multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
        //! multiplies matrix to a number (dst = scalar * src)
-        // supports CV_32FC1 only
+        // supports all data types
        CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
-        //! computes element-wise quotient of the two arrays (c = a / b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
-        //! computes element-wise quotient of the two arrays (c = a / b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void divide(double scale, const oclMat &b, oclMat &c);

-        //! compares elements of two arrays (c = a <cmpop> b)
-        // supports except CV_8SC1,CV_8SC2,CV8SC3,CV_8SC4 types
-        CV_EXPORTS void compare(const oclMat &a, const oclMat &b, oclMat &c, int cmpop);
+        //! computes element-wise quotient of the two arrays (dst = src1 * scale / src2)
+        // supports all data types
+        CV_EXPORTS void divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
+        //! computes element-wise quotient of the two arrays (dst = scale / src)
+        // supports all data types
+        CV_EXPORTS void divide(double scale, const oclMat &src1, oclMat &dst);
+
+        //! compares elements of two arrays (dst = src1 <cmpop> src2)
+        // supports all data types
+        CV_EXPORTS void compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop);

        //! transposes the matrix
-        // supports  CV_8UC1, 8UC4, 8SC4, 16UC2, 16SC2, 32SC1 and 32FC1.(the same as cuda)
+        // supports all data types
        CV_EXPORTS void transpose(const oclMat &src, oclMat &dst);

-        //! computes element-wise absolute difference of two arrays (c = abs(a - b))
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void absdiff(const oclMat &a, const oclMat &b, oclMat &c);
-        //! computes element-wise absolute difference of array and scalar (c = abs(a - s))
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void absdiff(const oclMat &a, const Scalar &s, oclMat &c);
+        //! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2))
+        // supports all data types
+        CV_EXPORTS void absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst);
+        //! computes element-wise absolute difference of array and scalar (dst = abs(src1 - s))
+        // supports all data types
+        CV_EXPORTS void absdiff(const oclMat &src1, const Scalar &s, oclMat &dst);

        //! computes mean value and standard deviation of all or selected array elements
        // supports except CV_32F,CV_64F
@ -479,7 +470,7 @@ namespace cv

        //! reverses the order of the rows, columns or both in a matrix
        // supports all types
-        CV_EXPORTS void flip(const oclMat &a, oclMat &b, int flipCode);
+        CV_EXPORTS void flip(const oclMat &src, oclMat &dst, int flipCode);

        //! computes sum of array elements
        // disabled until fix crash
@ -490,13 +481,11 @@ namespace cv

        //! finds global minimum and maximum array elements and returns their values
        // support all C1 types
-
        CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
        CV_EXPORTS void minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat& buf);

        //! finds global minimum and maximum array elements and returns their values with locations
        // support all C1 types
-
        CV_EXPORTS void minMaxLoc(const oclMat &src, double *minVal, double *maxVal = 0, Point *minLoc = 0, Point *maxLoc = 0,
                                  const oclMat &mask = oclMat());

@ -525,30 +514,27 @@ namespace cv
        //  This is not truly a bilateral filter. Instead of using user provided fixed parameters,
        //  the function calculates a constant at each window based on local standard deviation,
        //  and use this constant to do filtering.
-        //  supports 8UC1 8UC3
+        //  supports 8UC1, 8UC3
        CV_EXPORTS void adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, Point anchor = Point(-1, -1), int borderType=BORDER_DEFAULT);

-        //! computes exponent of each matrix element (b = e**a)
-        // supports only CV_32FC1 type
-        CV_EXPORTS void exp(const oclMat &a, oclMat &b);
+        //! computes exponent of each matrix element (dst = e**src)
+        // supports only CV_32FC1, CV_64FC1 type
+        CV_EXPORTS void exp(const oclMat &src, oclMat &dst);

-        //! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
-        // supports only CV_32FC1 type
-        CV_EXPORTS void log(const oclMat &a, oclMat &b);
+        //! computes natural logarithm of absolute value of each matrix element: dst = log(abs(src))
+        // supports only CV_32FC1, CV_64FC1 type
+        CV_EXPORTS void log(const oclMat &src, oclMat &dst);

        //! computes magnitude of each (x(i), y(i)) vector
-        // supports only CV_32F CV_64F type
+        // supports only CV_32F, CV_64F type
        CV_EXPORTS void magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude);
-        CV_EXPORTS void magnitudeSqr(const oclMat &x, const oclMat &y, oclMat &magnitude);
-
-        CV_EXPORTS void magnitudeSqr(const oclMat &x, oclMat &magnitude);

        //! computes angle (angle(i)) of each (x(i), y(i)) vector
-        // supports only CV_32F CV_64F type
+        // supports only CV_32F, CV_64F type
        CV_EXPORTS void phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false);

        //! the function raises every element of tne input array to p
-        //! support only CV_32F CV_64F type
+        // support only CV_32F, CV_64F type
        CV_EXPORTS void pow(const oclMat &x, double p, oclMat &y);

        //! converts Cartesian coordinates to polar
@ -562,14 +548,17 @@ namespace cv
        //! perfroms per-elements bit-wise inversion
        // supports all types
        CV_EXPORTS void bitwise_not(const oclMat &src, oclMat &dst);
+
        //! calculates per-element bit-wise disjunction of two arrays
        // supports all types
        CV_EXPORTS void bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
        CV_EXPORTS void bitwise_or(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+
        //! calculates per-element bit-wise conjunction of two arrays
        // supports all types
        CV_EXPORTS void bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
        CV_EXPORTS void bitwise_and(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+
        //! calculates per-element bit-wise "exclusive or" operation
        // supports all types
        CV_EXPORTS void bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
@ -603,7 +592,7 @@ namespace cv
        };

        //! computes convolution of two images, may use discrete Fourier transform
-        //! support only CV_32FC1 type
+        // support only CV_32FC1 type
        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr = false);
        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr, ConvolveBuf& buf);

@ -614,6 +603,8 @@ namespace cv

        CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code , int dcn = 0);

+        CV_EXPORTS void setIdentity(oclMat& src, double val);
+
        //////////////////////////////// Filter Engine ////////////////////////////////

        /*!
@ -982,7 +973,7 @@ namespace cv
        // real to complex dft requires at least v1.8 clAmdFft
        // real to complex dft output is not the same with cpu version
        // real to complex and complex to real does not support DFT_ROWS
-        CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(0, 0), int flags = 0);
+        CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(), int flags = 0);

        //! implements generalized matrix product algorithm GEMM from BLAS
        // The functionality requires clAmdBlas library
@ -1954,6 +1945,80 @@ namespace cv

            oclMat bgmodelUsedModes_; //keep track of number of modes per pixel
        };
+
+        /*!***************Kalman Filter*************!*/
+        class CV_EXPORTS KalmanFilter // Kalman filter whose state, model and covariance matrices are stored as oclMat
+        {
+        public:
+            KalmanFilter(); //!< the default constructor
+            //! the full constructor taking the dimensionality of the state, of the measurement and of the control vector; type is the depth of the created matrices (CV_32F or CV_64F)
+            KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
+            //! re-initializes Kalman filter. The previous content is destroyed. Parameters have the same meaning as in the full constructor.
+            void init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
+
+            const oclMat& predict(const oclMat& control=oclMat()); //!< computes a predicted state; control is the optional input control
+            const oclMat& correct(const oclMat& measurement);      //!< updates the predicted state from the measurement
+
+            oclMat statePre;           //!< predicted state (x'(k)): x'(k)=A*x(k-1)+B*u(k)
+            oclMat statePost;          //!< corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
+            oclMat transitionMatrix;   //!< state transition matrix (A)
+            oclMat controlMatrix;      //!< control matrix (B) (not used if there is no control)
+            oclMat measurementMatrix;  //!< measurement matrix (H)
+            oclMat processNoiseCov;    //!< process noise covariance matrix (Q)
+            oclMat measurementNoiseCov;//!< measurement noise covariance matrix (R)
+            oclMat errorCovPre;        //!< priori error estimate covariance matrix (P'(k)): P'(k)=A*P(k-1)*At + Q
+            oclMat gain;               //!< Kalman gain matrix (K(k)): K(k)=P'(k)*Ht*inv(H*P'(k)*Ht+R)
+            oclMat errorCovPost;       //!< posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k)
+        private: // temp1..temp5 below: their use is not visible in this header; presumably scratch matrices for predict()/correct() - confirm in the implementation
+            oclMat temp1;
+            oclMat temp2;
+            oclMat temp3;
+            oclMat temp4;
+            oclMat temp5;
+        };
+
+        static inline size_t divUp(size_t total, size_t grain) // returns total / grain rounded up to the next integer
+        {
+            return (total + grain - 1) / grain; // grain must be non-zero; total + grain - 1 wraps around if it exceeds the size_t range
+        }
+
+        /*!***************K Nearest Neighbour*************!*/
+        class CV_EXPORTS KNearestNeighbour: public CvKNearest // CvKNearest subclass that adds a find_nearest() overload working on oclMat
+        {
+        public:
+            KNearestNeighbour();
+            ~KNearestNeighbour();
+
+            bool train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)), // NOTE(review): the sampleIdx default binds a non-const reference to a default-constructed temporary Mat
+                bool isRegression = false, int max_k = 32, bool updateBase = false);
+
+            void clear();
+
+            void find_nearest(const oclMat& samples, int k, oclMat& lables); // 'lables' (sic) is presumably the output labels for the k-nearest search - confirm against the implementation
+
+        private:
+            oclMat samples_ocl; // presumably the device-side copy of the samples set up by train() - confirm
+        };
+        /*!***************  SVM  *************!*/
+        class CV_EXPORTS CvSVM_OCL : public CvSVM // CvSVM subclass declaring its own predict() overloads and create_kernel()/create_solver()
+        {
+        public:
+            CvSVM_OCL(); // the default constructor
+
+            CvSVM_OCL(const cv::Mat& trainData, const cv::Mat& responses, // constructor taking training data, responses, optional index masks and SVM parameters
+                      const cv::Mat& varIdx=cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(),
+                      CvSVMParams params=CvSVMParams());
+            CV_WRAP float predict( const int row_index, Mat& src, bool returnDFVal=false ) const; // presumably predicts for row row_index of src - confirm
+            CV_WRAP void predict( cv::InputArray samples, cv::OutputArray results ) const; // batch overload: samples in, results out
+            CV_WRAP float predict( const cv::Mat& sample, bool returnDFVal=false ) const; // single-sample overload
+            float predict( const CvMat* samples, CV_OUT CvMat* results ) const; // CvMat overload; results is an output argument (CV_OUT)
+
+        protected:
+            float predict( const int row_index, int row_len, Mat& src, bool returnDFVal=false ) const; // protected variant that additionally takes row_len
+            void create_kernel();
+            void create_solver();
+        };
+        /*!***************  END  *************!*/
    }
 }
 #if defined _MSC_VER && _MSC_VER >= 1200
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@ -174,6 +174,8 @@ namespace cv
        size_t CV_EXPORTS queryDeviceInfo<WAVEFRONT_SIZE, size_t>(cl_kernel kernel);
        template<>
        bool CV_EXPORTS queryDeviceInfo<IS_CPU_DEVICE, bool>(cl_kernel kernel);
+
+        unsigned long CV_EXPORTS queryLocalMemInfo();
    }//namespace ocl

 }//namespace cv
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
@ -842,54 +842,6 @@ PERF_TEST_P(PowFixture, pow, OCL_TYPICAL_MAT_SIZES)
        OCL_PERF_ELSE
 }

-///////////// MagnitudeSqr////////////////////////
-
-typedef TestBaseWithParam<Size> MagnitudeSqrFixture;
-
-PERF_TEST_P(MagnitudeSqrFixture, MagnitudeSqr, OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-
-    Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
-            dst(srcSize, CV_32FC1);
-    declare.in(src1, src2, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type());
-
-        OCL_TEST_CYCLE() cv::ocl::magnitudeSqr(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        ASSERT_EQ(1, src1.channels());
-
-        TEST_CYCLE()
-        {
-            for (int y = 0; y < srcSize.height; ++y)
-            {
-                const float * const src1Data = reinterpret_cast<float *>(src1.data + src1.step * y);
-                const float * const src2Data = reinterpret_cast<float *>(src2.data + src2.step * y);
-                float * const dstData = reinterpret_cast<float *>(dst.data + dst.step * y);
-                for (int x = 0; x < srcSize.width; ++x)
-                {
-                    float t0 = src1Data[x] * src1Data[x];
-                    float t1 = src2Data[x] * src2Data[x];
-                    dstData[x] = t0 + t1;
-                }
-            }
-        }
-
-        SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
 ///////////// AddWeighted////////////////////////

 typedef Size_MatType AddWeightedFixture;
--- a/modules/ocl/perf/perf_bgfg.cpp
+++ b/modules/ocl/perf/perf_bgfg.cpp
@ -44,12 +44,14 @@
 //
 //M*/
 #include "perf_precomp.hpp"
+
 using namespace perf;
 using namespace std;
 using namespace cv::ocl;
 using namespace cv;
 using std::tr1::tuple;
 using std::tr1::get;
+
 #if defined(HAVE_XINE)         || \
    defined(HAVE_GSTREAMER)    || \
    defined(HAVE_QUICKTIME)    || \
@ -63,6 +65,7 @@ using std::tr1::get;
 #endif

 #if BUILD_WITH_VIDEO_INPUT_SUPPORT
+
 static void cvtFrameFmt(vector<Mat>& input, vector<Mat>& output)
 {
    for(int i = 0; i< (int)(input.size()); i++)
@ -70,6 +73,7 @@ static void cvtFrameFmt(vector<Mat>& input, vector<Mat>& output)
        cvtColor(input[i], output[i], COLOR_RGB2GRAY);
    }
 }
+
 //prepare data for CPU
 static void prepareData(VideoCapture& cap, int cn, vector<Mat>& frame_buffer)
 {
@ -88,15 +92,15 @@ static void prepareData(VideoCapture& cap, int cn, vector<Mat>& frame_buffer)
    else
        frame_buffer = frame_buffer_init;
 }
+
 //copy CPU data to GPU
 static void prepareData(vector<Mat>& frame_buffer, vector<oclMat>& frame_buffer_ocl)
 {
    for(int i = 0; i < (int)frame_buffer.size(); i++)
        frame_buffer_ocl.push_back(cv::ocl::oclMat(frame_buffer[i]));
 }
-#endif
+
 ///////////// MOG ////////////////////////
-#if BUILD_WITH_VIDEO_INPUT_SUPPORT

 typedef tuple<string, int, double> VideoMOGParamType;
 typedef TestBaseWithParam<VideoMOGParamType> VideoMOGFixture;
@ -137,7 +141,8 @@ PERF_TEST_P(VideoMOGFixture, MOG,
            }
        }
        SANITY_CHECK(foreground);
-    }else if(RUN_OCL_IMPL)
+    }
+    else if(RUN_OCL_IMPL)
    {
        prepareData(frame_buffer, frame_buffer_ocl);
        CV_Assert((int)(frame_buffer_ocl.size()) == nFrame);
@ -152,13 +157,12 @@ PERF_TEST_P(VideoMOGFixture, MOG,
        }
        foreground_d.download(foreground);
        SANITY_CHECK(foreground);
-    }else
+    }
+    else
        OCL_PERF_ELSE
 }
-#endif

 ///////////// MOG2 ////////////////////////
-#if BUILD_WITH_VIDEO_INPUT_SUPPORT

 typedef tuple<string, int> VideoMOG2ParamType;
 typedef TestBaseWithParam<VideoMOG2ParamType> VideoMOG2Fixture;
@ -196,7 +200,8 @@ PERF_TEST_P(VideoMOG2Fixture, MOG2,
            }
        }
        SANITY_CHECK(foreground);
-    }else if(RUN_OCL_IMPL)
+    }
+    else if(RUN_OCL_IMPL)
    {
        prepareData(frame_buffer, frame_buffer_ocl);
        CV_Assert((int)(frame_buffer_ocl.size()) == nFrame);
@ -211,13 +216,12 @@ PERF_TEST_P(VideoMOG2Fixture, MOG2,
        }
        foreground_d.download(foreground);
        SANITY_CHECK(foreground);
-    }else
+    }
+    else
        OCL_PERF_ELSE
 }
-#endif

 ///////////// MOG2_GetBackgroundImage //////////////////
-#if BUILD_WITH_VIDEO_INPUT_SUPPORT

 typedef TestBaseWithParam<VideoMOG2ParamType> Video_MOG2GetBackgroundImage;

@ -259,7 +263,8 @@ PERF_TEST_P(Video_MOG2GetBackgroundImage, MOG2,
            mog2->getBackgroundImage(background);
        }
        SANITY_CHECK(background);
-    }else if(RUN_OCL_IMPL)
+    }
+    else if(RUN_OCL_IMPL)
    {
        prepareData(frame_buffer, frame_buffer_ocl);
        CV_Assert((int)(frame_buffer_ocl.size()) == nFrame);
@ -276,7 +281,9 @@ PERF_TEST_P(Video_MOG2GetBackgroundImage, MOG2,
        }
        background_d.download(background);
        SANITY_CHECK(background);
-    }else
+    }
+    else
        OCL_PERF_ELSE
 }
+
 #endif
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@ -333,13 +333,13 @@ PERF_TEST_P(BilateralFixture, Bilateral,
    const Size_MatType_t params = GetParam();
    const Size srcSize = get<0>(params);
    const int type = get<1>(params), d = 7;
-    double sigmacolor = 50.0, sigmaspace = 50.0;
+    const double sigmacolor = 50.0, sigmaspace = 50.0;

    Mat src(srcSize, type), dst(srcSize, type);
    declare.in(src, WARMUP_RNG).out(dst);

-    if (srcSize == OCL_SIZE_4000 && type == CV_8UC3)
-        declare.time(8);
+    if (srcSize == OCL_SIZE_4000)
+        declare.time(type == CV_8UC3 ? 8 : 4.5);

    if (RUN_OCL_IMPL)
    {
@ -372,14 +372,16 @@ PERF_TEST_P(adaptiveBilateralFixture, adaptiveBilateral,
    const Size_MatType_t params = GetParam();
    const Size srcSize = get<0>(params);
    const int type = get<1>(params);
-    double sigmaspace = 10.0;
-    Size ksize(9,9);
+    const double sigmaspace = 10.0;
+    Size ksize(9, 9);

    Mat src(srcSize, type), dst(srcSize, type);
    declare.in(src, WARMUP_RNG).out(dst);

    if (srcSize == OCL_SIZE_4000)
-        declare.time(15);
+        declare.time(type == CV_8UC3 ? 46 : 28);
+    else if (srcSize == OCL_SIZE_2000)
+        declare.time(type == CV_8UC3 ? 11 : 7);

    if (RUN_OCL_IMPL)
    {
@ -389,7 +391,7 @@ PERF_TEST_P(adaptiveBilateralFixture, adaptiveBilateral,

        oclDst.download(dst);

-        SANITY_CHECK(dst, 1.);
+        SANITY_CHECK(dst, 1.0);
    }
    else if (RUN_PLAIN_IMPL)
    {
--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@ -49,6 +49,23 @@ using namespace perf;

 ///////////// HOG////////////////////////

+// Strict weak ordering for cv::Rect, comparing x, then y, then width, then
+// height. Intended as the comparator for std::sort. It deliberately has no
+// std::binary_function base: that adaptor is removed in C++17 and unused here.
+struct RectLess
+{
+    bool operator()(const cv::Rect& a, const cv::Rect& b) const
+    {
+        if (a.x != b.x)
+            return a.x < b.x;
+        if (a.y != b.y)
+            return a.y < b.y;
+        if (a.width != b.width)
+            return a.width < b.width;
+        return a.height < b.height;
+    }
+};
+
 PERF_TEST(HOGFixture, HOG)
 {
    Mat src = imread(getDataPath("gpu/hog/road.png"), cv::IMREAD_GRAYSCALE);
@ -64,6 +81,7 @@ PERF_TEST(HOGFixture, HOG)

        TEST_CYCLE() hog.detectMultiScale(src, found_locations);

+        std::sort(found_locations.begin(), found_locations.end(), RectLess());
        SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
    }
    else if (RUN_OCL_IMPL)
@ -74,6 +92,7 @@ PERF_TEST(HOGFixture, HOG)

        OCL_TEST_CYCLE() ocl_hog.detectMultiScale(oclSrc, found_locations);

+        std::sort(found_locations.begin(), found_locations.end(), RectLess());
        SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
    }
    else
--- a/modules/ocl/perf/perf_kalman.cpp
+++ b/modules/ocl/perf/perf_kalman.cpp
@ -0,0 +1,93 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "perf_precomp.hpp"
+using namespace perf;
+using namespace std;
+using namespace cv::ocl;
+using namespace cv;
+using std::tr1::tuple;
+using std::tr1::get;
+///////////// Kalman Filter ////////////////////////
+
+typedef tuple<int> KalmanFilterType;
+typedef TestBaseWithParam<KalmanFilterType> KalmanFilterFixture;
+
+// Performance test for Kalman filtering, parameterized by the dimension that
+// is passed to init() for both of its arguments (1000 and 1500). Every cycle
+// runs init(), correct() and predict(); statePre of the implementation that
+// ran is then handed to SANITY_CHECK.
+PERF_TEST_P(KalmanFilterFixture, KalmanFilter,
+            ::testing::Values(1000, 1500))
+{
+    KalmanFilterType params = GetParam();
+    const int dim = get<0>(params);
+
+    // dim x 1 single-channel float vector that is fed to correct().
+    // NOTE(review): dresult is declared here but never used in this test.
+    cv::Mat sample(dim, 1, CV_32FC1), dresult;
+    randu(sample, -1, 1);
+
+    // Host-side Mat that receives statePre from whichever branch runs.
+    cv::Mat statePre_;
+
+    if(RUN_PLAIN_IMPL)
+    {
+        cv::KalmanFilter kalman;
+        // init() sits inside the cycle body, so it is repeated on every iteration.
+        TEST_CYCLE()
+        {
+            kalman.init(dim, dim);
+            kalman.correct(sample);
+            kalman.predict();
+        }
+        statePre_ = kalman.statePre;
+    }else if(RUN_OCL_IMPL)
+    {
+        // Device-side copy of sample, created once outside the cycle.
+        cv::ocl::oclMat dsample(sample);
+        cv::ocl::KalmanFilter kalman_ocl;
+        OCL_TEST_CYCLE()
+        {
+            kalman_ocl.init(dim, dim);
+            kalman_ocl.correct(dsample);
+            kalman_ocl.predict();
+        }
+        // Copy the device-side state into the host Mat for the check below.
+        kalman_ocl.statePre.download(statePre_);
+    }else
+        OCL_PERF_ELSE
+    SANITY_CHECK(statePre_);
+}
--- a/modules/ocl/perf/perf_matrix_operation.cpp
+++ b/modules/ocl/perf/perf_matrix_operation.cpp
@ -155,3 +155,78 @@ PERF_TEST_P(setToFixture, setTo,
    else
        OCL_PERF_ELSE
 }
+
+/////////////////// upload ///////////////////////////
+
+typedef tuple<Size, int, int> uploadParams;
+typedef TestBaseWithParam<uploadParams> uploadFixture;
+
+// Performance test for oclMat::upload over combinations of matrix size,
+// depth and channel count. The DISABLED_ name prefix is the GoogleTest
+// convention for tests that are compiled but not run by default.
+// NOTE(review): testing::Range excludes its end value per the GoogleTest
+// docs, so CV_64F and a channel count of 5 are presumably not generated.
+PERF_TEST_P(uploadFixture, DISABLED_upload,
+            testing::Combine(
+                OCL_TYPICAL_MAT_SIZES,
+                testing::Range(CV_8U, CV_64F),
+                testing::Range(1, 5)))
+{
+    const uploadParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int depth = get<1>(params), cn = get<2>(params);
+    const int type = CV_MAKE_TYPE(depth, cn);
+
+    // Only src is declared as an input; dst is used by the CPU branch only.
+    Mat src(srcSize, type), dst;
+    declare.in(src, WARMUP_RNG);
+
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclDst;
+
+        // Hand-written cycle: ocl::finish() runs before stopTimer(), and
+        // oclDst is released after the timer stops, so each upload starts
+        // from a released oclMat.
+        for(; startTimer(), next(); ocl::finish(), stopTimer(), oclDst.release())
+            oclDst.upload(src);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        // CPU counterpart: a clone of src into dst.
+        // NOTE(review): ocl::finish() is also called before stopTimer() in
+        // this branch -- confirm that it is meant to be part of the timing.
+        for(; startTimer(), next(); ocl::finish(), stopTimer(), dst.release())
+            dst = src.clone();
+    }
+    else
+        OCL_PERF_ELSE
+
+    // No transfer result is verified; a constant is passed to SANITY_CHECK.
+    int value = 0;
+    SANITY_CHECK(value);
+}
+
+/////////////////// download ///////////////////////////
+
+typedef TestBaseWithParam<uploadParams> downloadFixture;
+
+// Performance test for oclMat::download over combinations of matrix size,
+// depth and channel count. The DISABLED_ name prefix is the GoogleTest
+// convention for tests that are compiled but not run by default.
+// NOTE(review): testing::Range excludes its end value per the GoogleTest
+// docs, so CV_64F and a channel count of 5 are presumably not generated.
+PERF_TEST_P(downloadFixture, DISABLED_download,
+            testing::Combine(
+                OCL_TYPICAL_MAT_SIZES,
+                testing::Range(CV_8U, CV_64F),
+                testing::Range(1, 5)))
+{
+    const uploadParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int depth = get<1>(params), cn = get<2>(params);
+    const int type = CV_MAKE_TYPE(depth, cn);
+
+    // src is the declared input; dst receives the result in both branches.
+    Mat src(srcSize, type), dst;
+    declare.in(src, WARMUP_RNG);
+
+    if (RUN_OCL_IMPL)
+    {
+        // Device-side source, constructed from src once outside the cycle.
+        ocl::oclMat oclSrc(src);
+
+        // Hand-written cycle: ocl::finish() runs before stopTimer(), and
+        // dst is released after the timer stops, so each download starts
+        // from a released Mat.
+        for(; startTimer(), next(); ocl::finish(), stopTimer(), dst.release())
+            oclSrc.download(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        // CPU counterpart: a clone of src into dst.
+        // NOTE(review): ocl::finish() is also called before stopTimer() in
+        // this branch -- confirm that it is meant to be part of the timing.
+        for(; startTimer(), next(); ocl::finish(), stopTimer(), dst.release())
+            dst = src.clone();
+    }
+    else
+        OCL_PERF_ELSE
+
+    // No transfer result is verified; a constant is passed to SANITY_CHECK.
+    int value = 0;
+    SANITY_CHECK(value);
+}
--- a/modules/ocl/perf/perf_ml.cpp
+++ b/modules/ocl/perf/perf_ml.cpp
@ -0,0 +1,109 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jin Ma, jin@multicorewareinc.com
+//    Xiaopeng Fu, fuxiaopeng2222@163.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "perf_precomp.hpp"
+using namespace perf;
+using namespace std;
+using namespace cv::ocl;
+using namespace cv;
+using std::tr1::tuple;
+using std::tr1::get;
+////////////////////////////////// K-NEAREST NEIGHBOR ////////////////////////////////////
+// Allocates trainData as a CV_32FC1 matrix of the given size and fills it via
+// randu(1.0, 100.0). When nClasses is non-zero, trainLabel is additionally
+// created as a size.height x 1 column, filled via randu(0, nClasses - 1) and
+// converted to CV_32FC1. With nClasses == 0 trainLabel is left untouched, so
+// callers that omit it only ever bind the default temporary for the call.
+// NOTE(review): if randu's upper bound is exclusive, the generated labels
+// never reach nClasses - 1 -- confirm that this is intended.
+static void genData(Mat& trainData, Size size, Mat& trainLabel = Mat().setTo(Scalar::all(0)), int nClasses = 0)
+{
+    trainData.create(size, CV_32FC1);
+    randu(trainData, 1.0, 100.0);
+
+    // Nothing more to do when no labels were requested.
+    if(nClasses == 0)
+        return;
+
+    // Labels are drawn as 8-bit integers first, then widened to float.
+    trainLabel.create(size.height, 1, CV_8UC1);
+    randu(trainLabel, 0, nClasses - 1);
+    trainLabel.convertTo(trainLabel, CV_32FC1);
+}
+
+typedef tuple<int> KNNParamType;
+typedef TestBaseWithParam<KNNParamType> KNNFixture;
+
+// Performance test for k-nearest-neighbour classification, parameterized by
+// the number of sample rows (1000, 2000, 4000). Each cycle constructs a
+// classifier, trains it and runs find_nearest; the resulting labels are
+// handed to SANITY_CHECK.
+PERF_TEST_P(KNNFixture, KNN,
+            testing::Values(1000, 2000, 4000))
+{
+    KNNParamType params = GetParam();
+    const int rows = get<0>(params);
+    int columns = 100;
+    // k scales with the sample count: 4, 8 or 16 for the row counts above.
+    int k = rows/250;
+
+    // Training set: Size(columns, rows) samples, labels requested with nClasses = 3.
+    Mat trainData, trainLabels;
+    Size size(columns, rows);
+    genData(trainData, size, trainLabels, 3);
+
+    // Query set of the same size; no labels are generated for it.
+    Mat testData;
+    genData(testData, size);
+    Mat best_label;
+
+    if(RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE()
+        {
+            // A new classifier is constructed and trained on every iteration.
+            CvKNearest knn_cpu;
+            knn_cpu.train(trainData, trainLabels);
+            knn_cpu.find_nearest(testData, k, &best_label);
+        }
+    }else if(RUN_OCL_IMPL)
+    {
+        // Only the query data is uploaded up front; train() below is given
+        // the host-side Mats.
+        cv::ocl::oclMat best_label_ocl;
+        cv::ocl::oclMat testdata;
+        testdata.upload(testData);
+
+        OCL_TEST_CYCLE()
+        {
+            // A new classifier is constructed and trained on every iteration.
+            cv::ocl::KNearestNeighbour knn_ocl;
+            knn_ocl.train(trainData, trainLabels);
+            knn_ocl.find_nearest(testdata, k, best_label_ocl);
+        }
+        // Copy the device-side labels into the host Mat for the check below.
+        best_label_ocl.download(best_label);
+    }else
+        OCL_PERF_ELSE
+    SANITY_CHECK(best_label);
+}
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
--- a/modules/ocl/src/canny.cpp
+++ b/modules/ocl/src/canny.cpp
@ -357,14 +357,13 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
    std::vector< std::pair<size_t, const void *> > args;
    size_t localThreads[3]  = {128, 1, 1};

-#define DIVUP(a, b) ((a)+(b)-1)/(b)
    int count_i[1] = {0};
    while(count > 0)
    {
        openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));

        args.clear();
-        size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1};
+        size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1};
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st2.data));
@ -379,7 +378,6 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
        openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
        std::swap(st1, st2);
    }
-#undef DIVUP
 }

 void canny::getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols)
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@ -67,22 +67,12 @@ extern const char *filtering_adaptive_bilateral;
 }
 }

-namespace
-{
-inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
-}
-
 namespace
 {
 inline void normalizeAnchor(int &anchor, int ksize)
 {
    if (anchor < 0)
-    {
        anchor = ksize >> 1;
-    }

    CV_Assert(0 <= anchor && anchor < ksize);
 }
@ -96,9 +86,7 @@ inline void normalizeAnchor(Point &anchor, const Size &ksize)
 inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, const Size &src_size)
 {
    if (roi == Rect(0, 0, -1, -1))
-    {
        roi = Rect(0, 0, src_size.width, src_size.height);
-    }

    CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
    CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1));
@ -111,10 +99,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8
    int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1;

    if (nDivisor)
-    {
        *nDivisor = scale;
-    }
-
    Mat temp(kernel.size(), type);
    kernel.convertTo(temp, type, scale);
    Mat cont_krnl = temp.reshape(1, 1);
@ -124,9 +109,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8
        int count = cont_krnl.cols >> 1;

        for (int i = 0; i < count; ++i)
-        {
            std::swap(cont_krnl.at<int>(0, i), cont_krnl.at<int>(0, cont_krnl.cols - 1 - i));
-        }
    }

    gpu_krnl.upload(cont_krnl);
@ -146,7 +129,7 @@ public:
    {
        Size src_size = src.size();

-        // Delete those two clause below which exist before, However, the result is alos correct
+        // The two statements below used to be executed here; they are disabled, and the result is still correct without them
        // dst.create(src_size, src.type());
        // dst = Scalar(0.0);

@ -411,23 +394,8 @@ public:
    {
        Filter2DEngine_GPU::apply(src, dst);

-        //if (iters > 1)
-        //{
-        // Size wholesize;
-        // Point ofs;
-        // dst.locateROI(wholesize,ofs);
-        // int rows = dst.rows, cols = dst.cols;
-        // dst.adjustROI(ofs.y,-ofs.y-rows+dst.wholerows,ofs.x,-ofs.x-cols+dst.wholecols);
-        // dst.copyTo(morfBuf);
-        // dst.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols);
-        // morfBuf.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols);
-        // //morfBuf.create(src.size(),src.type());
-        // //Filter2DEngine_GPU::apply(dst, morfBuf);
-        // //morfBuf.copyTo(dst);
-        //}
        for (int i = 1; i < iters; ++i)
        {
-            //dst.swap(morfBuf);
            Size wholesize;
            Point ofs;
            dst.locateROI(wholesize, ofs);
@ -627,8 +595,6 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel
    int localWidth = localThreads[0] + paddingPixels;
    int localHeight = localThreads[1] + paddingPixels;

-    // 260 = divup((localThreads[0] + filterWidth * 2), 4) * 4
-    // 6   = (ROWS_PER_GROUP_WHICH_IS_4 + filterWidth * 2)
    size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize();

    int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4},
@ -739,24 +705,16 @@ public:
    virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
    {
        Size src_size = src.size();
-        //int src_type = src.type();

        int cn = src.oclchannels();
-        //dst.create(src_size, src_type);
-        //dst = Scalar(0.0);
-        //dstBuf.create(src_size, src_type);
        dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn));
-        //dstBuf = Scalar(0.0);

        normalizeROI(roi, ksize, anchor, src_size);

        srcROI = src(roi);
        dstROI = dst(roi);
-        //dstBufROI = dstBuf(roi);

        (*rowFilter)(srcROI, dstBuf);
-        //Mat rm(dstBufROI);
-        //std::cout << "rm " << rm << endl;
        (*columnFilter)(dstBuf, dstROI);
    }

@ -1343,11 +1301,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
    CV_Assert(src.oclchannels() == dst.oclchannels());
    CV_Assert(ksize == (anchor << 1) + 1);
    int src_pix_per_row, dst_pix_per_row;
-    //int src_offset_x, src_offset_y;
    int dst_offset_in_pixel;
    src_pix_per_row = src.step / src.elemSize();
-    //src_offset_x = (src.offset % src.step) / src.elemSize();
-    //src_offset_y = src.offset / src.step;
    dst_pix_per_row = dst.step / dst.elemSize();
    dst_offset_in_pixel = dst.offset / dst.elemSize();

@ -1359,8 +1314,6 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_pix_per_row));
-    //args.push_back(std::make_pair(sizeof(cl_int),(void*)&src_offset_x));
-    //args.push_back(std::make_pair(sizeof(cl_int),(void*)&src_offset_y));
    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_pix_per_row));
    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_offset_in_pixel));
    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
@ -1379,23 +1332,11 @@ Ptr<BaseColumnFilter_GPU> cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in
        linearColumnFilter_gpu<int>,
        linearColumnFilter_gpu<float>
    };
-    /*
-    CV_Assert(dstType == CV_8UC4 || dstType == CV_8SC4 || dstType == CV_16UC2 ||
-    dstType == CV_16SC2 || dstType == CV_32SC1 || dstType == CV_32FC1);
-    CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 ||
-    bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1);
-
-    Mat temp(columnKernel.size(), CV_32SC1);
-    columnKernel.convertTo(temp, CV_32SC1);
-    Mat cont_krnl = temp.reshape(1, 1);
-    */
+
    Mat temp = columnKernel.reshape(1, 1);
    oclMat mat_kernel(temp);

    int ksize = temp.cols;
-
-    //CV_Assert(ksize < 16);
-
    normalizeAnchor(anchor, ksize);

    return makePtr<GpuLinearColumnFilter>(ksize, anchor, mat_kernel,
@ -1433,11 +1374,8 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat
    }

    if (ddepth < 0)
-    {
        ddepth = src.depth();
-    }

-    //CV_Assert(ddepth == src.depth());
    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));

    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype);
@ -1464,19 +1402,11 @@ void cv::ocl::Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
        // usually the smoothing part is the slowest to compute,
        // so try to scale it instead of the faster differenciating part
        if (dx == 0)
-        {
            kx *= scale;
-        }
        else
-        {
            ky *= scale;
-        }
    }

-    // Mat kx_, ky_;
-    //ky.convertTo(ky_,CV_32S,1<<8);
-    //kx.convertTo(kx_,CV_32S,1<<8);
-
    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType);
 }

@ -1490,19 +1420,11 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
        // usually the smoothing part is the slowest to compute,
        // so try to scale it instead of the faster differenciating part
        if (dx == 0)
-        {
            kx *= scale;
-        }
        else
-        {
            ky *= scale;
-        }
    }

-    // Mat kx_, ky_;
-    //ky.convertTo(ky_,CV_32S,1<<8);
-    //kx.convertTo(kx_,CV_32S,1<<8);
-
    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, bordertype);
 }

@ -1524,9 +1446,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d
    Mat kernel(3, 3, CV_32S, (void *)K[ksize == 3]);

    if (scale != 1)
-    {
        kernel *= scale;
-    }

    filter2D(src, dst, ddepth, kernel, Point(-1, -1));
 }
@ -1545,14 +1465,10 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do

    // automatic detection of kernel size from sigma
    if (ksize.width <= 0 && sigma1 > 0)
-    {
        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
-    }

    if (ksize.height <= 0 && sigma2 > 0)
-    {
        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
-    }

    CV_Assert(ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1);

@ -1563,17 +1479,10 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
    Mat ky;

    if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
-    {
        ky = kx;
-    }
    else
-    {
        ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F));
-    }

-    //Mat kx_, ky_;
-    //kx.convertTo(kx_,CV_32S,1<<8);
-    //ky.convertTo(ky_,CV_32S,1<<8);
    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype);
 }

@ -1604,14 +1513,10 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si
    if (bordertype != BORDER_CONSTANT)
    {
        if (src.rows == 1)
-        {
            ksize.height = 1;
-        }

        if (src.cols == 1)
-        {
            ksize.width = 1;
-        }
    }

    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
@ -1637,6 +1542,7 @@ void cv::ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize
    {
        lut.at<float>(idx++) = sigma2 / (sigma2 + x * x + y * y);
    }
+
    oclMat dlut(lut);
    int depth = src.depth();
    int cn = src.oclchannels();
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@ -124,11 +124,6 @@ namespace cv

 using namespace ::cv::ocl::device;

-static inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
-
 cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_,
                                      Size cell_size_, int nbins_, double win_sigma_,
                                      double threshold_L2hys_, bool gamma_correction_, int nlevels_)
@ -1671,7 +1666,8 @@ void cv::ocl::device::hog::compute_hists(int nbins,
    {
        openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
            localThreads, args, -1, -1, "-D CPU");
-    }else
+    }
+    else
    {
        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
        int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@ -245,9 +245,6 @@ namespace cv
                    kernelName = "remapNNF1Constant";
            }

-            //int channels = dst.oclchannels();
-            //int depth = dst.depth();
-            //int type = src.type();
            size_t blkSizeX = 16, blkSizeY = 16;
            size_t glbSizeX;
            int cols = dst.cols;
@ -501,21 +498,13 @@ namespace cv
                openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
            }
            else
-            {
                CV_Error(Error::StsUnsupportedFormat, "Non-supported filter length");
-                //String kernelName = "medianFilter";
-                //args.push_back( std::make_pair( sizeof(cl_int),(void*)&m));
-
-                //openCLExecuteKernel(clCxt,&imgproc_median,kernelName,globalThreads,localThreads,args,src.oclchannels(),-1);
-            }
-
        }

        ////////////////////////////////////////////////////////////////////////
        // copyMakeBorder
        void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int bordertype, const Scalar &scalar)
        {
-            //CV_Assert(src.oclchannels() != 2);
            CV_Assert(top >= 0 && bottom >= 0 && left >= 0 && right >= 0);
            if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
            {
@ -531,10 +520,12 @@ namespace cv
            {
                CV_Assert((src.cols >= left) && (src.cols >= right) && (src.rows >= top) && (src.rows >= bottom));
            }
+
            if(bordertype == cv::BORDER_REFLECT_101)
            {
                CV_Assert((src.cols > left) && (src.cols > right) && (src.rows > top) && (src.rows > bottom));
            }
+
            dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
            int srcStep = src.step1() / src.oclchannels();
            int dstStep = dst.step1() / dst.oclchannels();
@ -734,19 +725,6 @@ namespace cv
            }

            openCLExecuteKernel(src.clCxt, &imgproc_copymakeboder, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
-            //uchar* cputemp=new uchar[32*dst.wholerows];
-            ////int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
-            //openCLSafeCall(clEnqueueReadBuffer(src.clCxt->impl->clCmdQueue, (cl_mem)dst.data, CL_TRUE,
-            //						0, 32*dst.wholerows, cputemp, 0, NULL, NULL));
-            //for(int i=0;i<dst.wholerows;i++)
-            //{
-            //	for(int j=0;j<dst.wholecols;j++)
-            //	{
-            //		std::cout<< (int)cputemp[i*32+j]<<" ";
-            //	}
-            //	std::cout<<std::endl;
-            //}
-            //delete []cputemp;
        }

        ////////////////////////////////////////////////////////////////////////
@ -1512,11 +1490,6 @@ namespace cv
        // CLAHE
        namespace clahe
        {
-            inline int divUp(int total, int grain)
-            {
-                return (total + grain - 1) / grain * grain;
-            }
-
            static void calcLut(const oclMat &src, oclMat &dst,
                const int tilesX, const int tilesY, const cv::Size tileSize,
                const int clipLimit, const float lutScale)
@ -1540,9 +1513,7 @@ namespace cv
                size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
                bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
                if (is_cpu)
-                {
                    openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU");
-                }
                else
                {
                    cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
@ -1577,7 +1548,7 @@ namespace cv

                String kernelName = "transform";
                size_t localThreads[3]  = { 32, 8, 1 };
-                size_t globalThreads[3] = { divUp(src.cols, localThreads[0]), divUp(src.rows, localThreads[1]), 1 };
+                size_t globalThreads[3] = { src.cols, src.rows, 1 };

                openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1);
            }
@ -1819,11 +1790,6 @@ void cv::ocl::mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int /*fl
    openCLExecuteKernel(clCxt, &imgproc_mulAndScaleSpectrums, kernelName, gt, lt, args, -1, -1);
 }
 //////////////////////////////////convolve////////////////////////////////////////////////////
-inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
-
 // ported from CUDA module
 void cv::ocl::ConvolveBuf::create(Size image_size, Size templ_size)
 {
@ -1938,6 +1904,7 @@ static void convolve_run_fft(const oclMat &image, const oclMat &templ, oclMat &r
 #undef UNUSED
 #endif
 }
+
 static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, String kernelName, const char **kernelString)
 {
    CV_Assert(src.depth() == CV_32FC1);
@ -1959,10 +1926,7 @@ static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, St
    int rows = dst.rows;

    size_t localThreads[3]  = { 16, 16, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { cols, rows, 1 };

    std::vector<std::pair<size_t , const void *> > args;
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@ -282,11 +282,6 @@ namespace cv
            return 0;
        }

-        inline int divUp(int total, int grain)
-        {
-            return (total + grain - 1) / grain;
-        }
-
        int getDevice(std::vector<Info> &oclinfo, int devicetype)
        {
            //TODO: cache oclinfo vector
@ -687,6 +682,16 @@ namespace cv
            CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= clCxt->impl->maxWorkGroupSize );
        }

+        // Rounds sz up to the nearest multiple of n, i.e. divUp(sz, n) * n.
+        // n is not required to be a power of two (compare alignSize), but it
+        // must be non-zero because it is used as a modulus.
+        static inline size_t roundUp(size_t sz, size_t n)
+        {
+            const size_t padded = sz + n - 1;
+            return padded - padded % n;
+        }
+
+
 #ifdef PRINT_KERNEL_RUN_TIME
        static double total_execute_time = 0;
        static double total_kernel_time = 0;
@ -710,11 +715,10 @@ namespace cv

            if ( localThreads != NULL)
            {
-                globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
-                globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
-                globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
+                globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
+                globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
+                globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);

-                //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
                cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
            }
            for(size_t i = 0; i < args.size(); i ++)
@ -745,10 +749,6 @@ namespace cv
            execute_time = (double)(end_time - start_time) / (1000 * 1000);
            total_time = (double)(end_time - queue_time) / (1000 * 1000);

-            //	std::cout << setiosflags(ios::left) << setw(15) << execute_time;
-            //	std::cout << setiosflags(ios::left) << setw(15) << total_time - execute_time;
-            //	std::cout << setiosflags(ios::left) << setw(15) << total_time << std::endl;
-
            total_execute_time += execute_time;
            total_kernel_time += total_time;
            clReleaseEvent(event);
@ -1016,7 +1016,7 @@ namespace cv
            programCache->releaseProgram();
        }

-        bool Context::supportsFeature(int ftype)
+        bool Context::supportsFeature(int ftype) const
        {
            switch(ftype)
            {
@ -1031,7 +1031,7 @@ namespace cv
            }
        }

-        size_t Context::computeUnits()
+        size_t Context::computeUnits() const
        {
            return impl->maxComputeUnits;
        }
@ -1041,6 +1041,14 @@ namespace cv
            return impl->maxWorkGroupSize;
        }

+        // Queries CL_DEVICE_LOCAL_MEM_SIZE for the device at index impl->devnum
+        // of the current context and returns the reported value.
+        // The clGetDeviceInfo status is not inspected; the result keeps its
+        // initial value 0 if the call does not write to it.
+        unsigned long queryLocalMemInfo()
+        {
+            cl_ulong localMemBytes = 0;
+            Info::Impl* ctxImpl = Context::getContext()->impl;
+            clGetDeviceInfo(ctxImpl->devices[ctxImpl->devnum], CL_DEVICE_LOCAL_MEM_SIZE,
+                            sizeof(cl_ulong), (void*)&localMemBytes, 0);
+            return localMemBytes;
+        }
+
        void* Context::oclContext()
        {
            return impl->oclcontext;
--- a/modules/ocl/src/kalman.cpp
+++ b/modules/ocl/src/kalman.cpp
@ -0,0 +1,135 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//     Jin Ma, jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::ocl;
+
+// Default constructor: performs no setup; init() must be called before use.
+KalmanFilter::KalmanFilter()
+{
+}
+
+// Convenience constructor: forwards all four arguments unchanged to init().
+KalmanFilter::KalmanFilter(int dynamParams, int measureParams, int controlParams, int type)
+{
+    this->init(dynamParams, measureParams, controlParams, type);
+}
+
+// (Re)creates every matrix used by the filter.
+//   DP   - size of the state vector, must be > 0
+//   MP   - size of the measurement vector, must be > 0
+//   CP   - size of the control vector; negative values are treated as 0,
+//          and 0 releases controlMatrix instead of allocating it
+//   type - element type of all matrices, CV_32F or CV_64F
+void KalmanFilter::init(int DP, int MP, int CP, int type)
+{
+    CV_Assert( DP > 0 && MP > 0 );
+    CV_Assert( type == CV_32F || type == CV_64F );
+    if( CP < 0 )
+        CP = 0;
+
+    const Scalar zeros = Scalar::all(0);
+
+    // State vectors: DP x 1, zero-filled.
+    statePre.create(DP, 1, type);
+    statePre.setTo(zeros);
+
+    statePost.create(DP, 1, type);
+    statePost.setTo(zeros);
+
+    // Transition and noise covariances start out as identity matrices.
+    transitionMatrix.create(DP, DP, type);
+    setIdentity(transitionMatrix, 1);
+
+    processNoiseCov.create(DP, DP, type);
+    setIdentity(processNoiseCov, 1);
+
+    measurementNoiseCov.create(MP, MP, type);
+    setIdentity(measurementNoiseCov, 1);
+
+    // Measurement matrix, error covariances and gain start out as zeros.
+    measurementMatrix.create(MP, DP, type);
+    measurementMatrix.setTo(zeros);
+
+    errorCovPre.create(DP, DP, type);
+    errorCovPre.setTo(zeros);
+
+    errorCovPost.create(DP, DP, type);
+    errorCovPost.setTo(zeros);
+
+    gain.create(DP, MP, type);
+    gain.setTo(zeros);
+
+    // The control matrix only exists when a control vector is in use.
+    if( CP == 0 )
+        controlMatrix.release();
+    else
+    {
+        controlMatrix.create(DP, CP, type);
+        controlMatrix.setTo(zeros);
+    }
+
+    // Scratch buffers used by predict() and correct().
+    temp1.create(DP, DP, type);
+    temp2.create(MP, DP, type);
+    temp3.create(MP, MP, type);
+    temp4.create(MP, DP, type);
+    temp5.create(MP, 1, type);
+}
+
+// Computes the predicted state and predicted error covariance.
+//   control - control vector; it is only applied when control.data is non-null
+// Returns a reference to statePre. statePost is overwritten with the same
+// values, so a following predict() (which reads statePost) continues from the
+// prediction when no correct() happens in between.
+CV_EXPORTS const oclMat& KalmanFilter::predict(const oclMat& control)
+{
+    // statePre = transitionMatrix * statePost
+    gemm(transitionMatrix, statePost, 1, oclMat(), 0, statePre);
+
+    // statePre += controlMatrix * control
+    if(control.data)
+        gemm(controlMatrix, control, 1, statePre, 1, statePre);
+
+    // temp1 = transitionMatrix * errorCovPost
+    gemm(transitionMatrix, errorCovPost, 1, oclMat(), 0, temp1);
+
+    // errorCovPre = temp1 * transitionMatrix^T + processNoiseCov
+    gemm(temp1, transitionMatrix, 1, processNoiseCov, 1, errorCovPre, GEMM_2_T);
+
+    statePre.copyTo(statePost);
+    return statePre;
+}
+
+// Updates the predicted state with a measurement.
+//   measurement - measurement vector, must not be empty
+// Returns a reference to statePost. The linear solve for the gain is done on
+// host matrices (Mat) and the result is uploaded back into temp4.
+CV_EXPORTS const oclMat& KalmanFilter::correct(const oclMat& measurement)
+{
+    CV_Assert(measurement.empty() == false);
+
+    // temp2 = measurementMatrix * errorCovPre
+    gemm(measurementMatrix, errorCovPre, 1, oclMat(), 0, temp2);
+
+    // temp3 = temp2 * measurementMatrix^T + measurementNoiseCov
+    gemm(temp2, measurementMatrix, 1, measurementNoiseCov, 1, temp3, GEMM_2_T);
+
+    // Solve temp3 * X = temp2 with SVD; gain is the transpose of X.
+    const Mat hostLhs = Mat(temp3);
+    const Mat hostRhs = Mat(temp2);
+    Mat hostSolution;
+    solve(hostLhs, hostRhs, hostSolution, DECOMP_SVD);
+    temp4.upload(hostSolution);
+    gain = temp4.t();
+
+    // temp5 = measurement - measurementMatrix * statePre
+    gemm(measurementMatrix, statePre, -1, measurement, 1, temp5);
+
+    // statePost = statePre + gain * temp5
+    gemm(gain, temp5, 1, statePre, 1, statePost);
+
+    // errorCovPost = errorCovPre - gain * temp2
+    gemm(gain, temp2, -1, errorCovPre, 1, errorCovPost);
+
+    return statePost;
+}
--- a/modules/ocl/src/knearest.cpp
+++ b/modules/ocl/src/knearest.cpp
@ -0,0 +1,157 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jin Ma, jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+using namespace cv;
+using namespace cv::ocl;
+
+namespace cv
+{
+    namespace ocl
+    {
+        // Declared here and defined outside this file. Its address is passed to
+        // openCLExecuteKernel() as the program source for the
+        // "knn_find_nearest" kernel in find_nearest().
+        extern const char* knearest;
+    }
+}
+
+// Constructs the object and immediately calls clear().
+KNearestNeighbour::KNearestNeighbour()
+{
+    this->clear();
+}
+
+// Calls clear() first, then releases the device-side sample matrix.
+KNearestNeighbour::~KNearestNeighbour()
+{
+    this->clear();
+    this->samples_ocl.release();
+}
+
+// Delegates to CvKNearest::clear(); samples_ocl is not touched here.
+void KNearestNeighbour::clear()
+{
+    this->CvKNearest::clear();
+}
+
+// Trains the host model through CvKNearest::train() and mirrors the stored
+// samples into samples_ocl.
+//
+// samples_ocl gets one row per sample of the CvVectors block that
+// CvKNearest::samples points to: var_count feature values followed by one
+// extra column holding that sample's response.
+//
+// Returns the value returned by CvKNearest::train(). If no sample block is
+// available afterwards, samples_ocl is released so that find_nearest() fails
+// its "not empty" assertion instead of using stale data.
+bool KNearestNeighbour::train(const Mat& trainData, Mat& labels, Mat& sampleIdx,
+                              bool isRegression, int _max_k, bool updateBase)
+{
+    max_k = _max_k;
+    bool cv_knn_train = CvKNearest::train(trainData, labels, sampleIdx, isRegression, max_k, updateBase);
+
+    CvVectors* s = CvKNearest::samples;
+    if(s == NULL)
+    {
+        // Nothing to upload; avoid dereferencing a null sample block.
+        samples_ocl.release();
+        return cv_knn_train;
+    }
+
+    const int featureCount = CvKNearest::var_count;
+    cv::Mat samples_mat(s->count, featureCount + 1, s->type);
+
+    // Responses are read as floats from the memory directly behind the
+    // CvVectors header.
+    const float* responses = (const float*)(s + 1);
+    for(int i = 0; i < s->count; i++)
+    {
+        const float* features = s->data.fl[i];
+        float* row = samples_mat.ptr<float>(i);
+
+        for(int j = 0; j < featureCount; j++)
+            row[j] = features[j];
+
+        // Last column carries the response of sample i.
+        row[featureCount] = responses[i];
+    }
+
+    samples_ocl = samples_mat;
+    return cv_knn_train;
+}
+
+// Runs the "knn_find_nearest" kernel for every row of `samples`.
+//   samples - query matrix, CV_32FC1 with CvKNearest::var_count columns
+//   k       - number of neighbours, 1 <= k <= max_k
+//   lables  - output, (re)created as samples.rows x 1, CV_32FC1
+// Requires a previous successful train() (samples_ocl must not be empty).
+void KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& lables)
+{
+    CV_Assert(!samples_ocl.empty());
+    lables.create(samples.rows, 1, CV_32FC1);
+
+    CV_Assert(samples.cols == CvKNearest::var_count);
+    CV_Assert(samples.type() == CV_32FC1);
+    CV_Assert(k >= 1 && k <= max_k);
+
+    int k1 = KNearest::get_sample_count();
+    k1 = MIN( k1, k );
+
+    String kernel_name = "knn_find_nearest";
+
+    // The kernel is given nThreads * k * 4 * 2 bytes of local memory, so use
+    // as many work-items as the device's local memory allows, capped at 256.
+    const cl_ulong local_memory_size = queryLocalMemInfo();
+    const cl_ulong bytes_per_thread = (cl_ulong)2 * k * 4;
+    cl_ulong max_threads = local_memory_size / bytes_per_thread;
+    if(max_threads > 256)
+        max_threads = 256;
+    int nThreads = (int)max_threads;
+
+    // With too little local memory for even one work-item (or a failed device
+    // query returning 0) the work-group size would be 0, which is invalid and
+    // would later be used as a divisor when the global size is rounded up.
+    CV_Assert(nThreads > 0);
+
+    int smem_size = nThreads * k * 4 * 2;
+    size_t local_thread[] = {1, (size_t)nThreads, 1};
+    size_t global_thread[] = {1, (size_t)samples.rows, 1};
+
+    // Enable the double-precision path in the kernel only when supported.
+    const char* build_option =
+        Context::getContext()->supportsFeature(Context::CL_DOUBLE) ? "-D DOUBLE_SUPPORT" : " ";
+
+    std::vector< std::pair<size_t, const void*> > args;
+
+    // Row strides expressed in elements rather than bytes.
+    int samples_ocl_step = samples_ocl.step/samples_ocl.elemSize();
+    int samples_step = samples.step/samples.elemSize();
+    int lables_step = lables.step/lables.elemSize();
+
+    int _regression = 0;
+    if(CvKNearest::regression)
+        _regression = 1;
+
+    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&samples.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples.rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples.cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_step));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&k));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&samples_ocl.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_ocl.rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_ocl_step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&lables.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&lables_step));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&_regression));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&k1));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_ocl.cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&nThreads));
+    // Local-memory argument: size only, no host pointer.
+    args.push_back(std::make_pair((size_t)smem_size, (const void*)NULL));
+    openCLExecuteKernel(Context::getContext(), &knearest, kernel_name, global_thread, local_thread, args, -1, -1, build_option);
+}
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@ -295,11 +295,6 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
    m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols);
 }

-/////////////////////common//////////////////////////////////////
-inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
 ///////////////////////////////////////////////////////////////////////////
 ////////////////////////////////// CopyTo /////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////
@ -319,11 +314,7 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask
    char compile_option[32];
    sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
    size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3];
-
-    globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0];
-    globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
-    globalThreads[2] = 1;
+    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };

    int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
    int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
@ -344,19 +335,14 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask
                        localThreads, args, -1, -1, compile_option);
 }

-void cv::ocl::oclMat::copyTo( oclMat &m ) const
-{
-    CV_DbgAssert(!this->empty());
-    m.create(size(), type());
-    openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset,
-                       data, step, cols * elemSize(), rows, offset);
-}
-
 void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
 {
    if (mask.empty())
    {
-        copyTo(mat);
+        CV_DbgAssert(!this->empty());
+        mat.create(size(), type());
+        openCLCopyBuffer2D(clCxt, mat.data, mat.step, mat.offset,
+                           data, step, cols * elemSize(), rows, offset);
    }
    else
    {
@ -370,40 +356,50 @@ void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
 ///////////////////////////////////////////////////////////////////////////
 static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
 {
-    String kernelName = "convert_to_S";
-    std::stringstream idxStr;
-    idxStr << src.depth();
-    kernelName = kernelName + idxStr.str().c_str();
+    String kernelName = "convert_to";
    float alpha_f = alpha, beta_f = beta;
+    int sdepth = src.depth(), ddepth = dst.depth();
+    int sstep1 = (int)src.step1(), dstep1 = (int)dst.step1();
+    int cols1 = src.cols * src.oclchannels();
+
+    char buildOptions[150], convertString[50];
+    const char * typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    sprintf(convertString, "convert_%s_sat_rte", typeMap[ddepth]);
+    sprintf(buildOptions, "-D srcT=%s -D dstT=%s -D convertToDstType=%s", typeMap[sdepth],
+            typeMap[ddepth], CV_32F == ddepth || ddepth == CV_64F ? "" : convertString);
+
    CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
    std::vector<std::pair<size_t , const void *> > args;
-    size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3];
-    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
-    globalThreads[2] = 1;
-    int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
-    int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
-    if(dst.type() == CV_8UC1)
-    {
-        globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
-    }
+
+    size_t localThreads[3] = { 16, 16, 1 };
+    size_t globalThreads[3] = { divUp(cols1, localThreads[0]) * localThreads[0],
+                                divUp(dst.rows, localThreads[1]) * localThreads[1], 1 };
+
+    int doffset1 = dst.offset / dst.elemSize1();
+    int soffset1 = src.offset / src.elemSize1();
+
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols1 ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sstep1 ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&soffset1 ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstep1 ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&doffset1 ));
    args.push_back( std::make_pair( sizeof(cl_float) , (void *)&alpha_f ));
    args.push_back( std::make_pair( sizeof(cl_float) , (void *)&beta_f ));
+
    openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
-                        localThreads, args, dst.oclchannels(), dst.depth());
+                        localThreads, args, -1, -1, buildOptions);
 }
 void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
 {
-    //cout << "cv::ocl::oclMat::convertTo()" << endl;
+    if (!clCxt->supportsFeature(Context::CL_DOUBLE) &&
+            (depth() == CV_64F || dst.depth() == CV_64F))
+    {
+        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
+        return;
+    }

    bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
                   && fabs(beta) < std::numeric_limits<double>::epsilon();
@ -413,7 +409,6 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
    else
        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());

-    //int scn = channels();
    int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
    if( sdepth == ddepth && noScale )
    {
@ -433,201 +428,62 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
 ///////////////////////////////////////////////////////////////////////////
 //////////////////////////////// setTo ////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////
+
 oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
 {
-    //cout << "cv::ocl::oclMat::=" << endl;
    setTo(s);
    return *this;
 }
+
 static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, String kernelName)
 {
    std::vector<std::pair<size_t , const void *> > args;

    size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3];
-    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
-    globalThreads[2] = 1;
+    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
    int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
-    if(dst.type() == CV_8UC1)
-    {
+
+    if (dst.type() == CV_8UC1)
        globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-    }
-    char compile_option[32];
-    union sc
-    {
-        cl_uchar4 uval;
-        cl_char4  cval;
-        cl_ushort4 usval;
-        cl_short4 shval;
-        cl_int4 ival;
-        cl_float4 fval;
-        cl_double4 dval;
-    } val;
-    switch(dst.depth())
-    {
-    case CV_8U:
-        val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
-        val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
-        val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
-        val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=uchar");
-            args.push_back( std::make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=uchar4");
-            args.push_back( std::make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_8S:
-        val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
-        val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
-        val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
-        val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=char");
-            args.push_back( std::make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=char4");
-            args.push_back( std::make_pair( sizeof(cl_char4) , (void *)&val.cval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_16U:
-        val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
-        val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
-        val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
-        val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=ushort");
-            args.push_back( std::make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=ushort4");
-            args.push_back( std::make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_16S:
-        val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
-        val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
-        val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
-        val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=short");
-            args.push_back( std::make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=short4");
-            args.push_back( std::make_pair( sizeof(cl_short4) , (void *)&val.shval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_32S:
-        val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
-        val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
-        val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
-        val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=int");
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
-            break;
-        case 2:
-            sprintf(compile_option, "-D GENTYPE=int2");
-            cl_int2 i2val;
-            i2val.s[0] = val.ival.s[0];
-            i2val.s[1] = val.ival.s[1];
-            args.push_back( std::make_pair( sizeof(cl_int2) , (void *)&i2val ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=int4");
-            args.push_back( std::make_pair( sizeof(cl_int4) , (void *)&val.ival ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_32F:
-        val.fval.s[0] = scalar.val[0];
-        val.fval.s[1] = scalar.val[1];
-        val.fval.s[2] = scalar.val[2];
-        val.fval.s[3] = scalar.val[3];
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=float");
-            args.push_back( std::make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=float4");
-            args.push_back( std::make_pair( sizeof(cl_float4) , (void *)&val.fval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_64F:
-        val.dval.s[0] = scalar.val[0];
-        val.dval.s[1] = scalar.val[1];
-        val.dval.s[2] = scalar.val[2];
-        val.dval.s[3] = scalar.val[3];
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=double");
-            args.push_back( std::make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=double4");
-            args.push_back( std::make_pair( sizeof(cl_double4) , (void *)&val.dval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    default:
-        CV_Error(Error::StsUnsupportedFormat, "unknown depth");
-    }
+
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    const char channelMap[] = { ' ', ' ', '2', '4', '4' };
+    std::string buildOptions = format("-D GENTYPE=%s%c", typeMap[dst.depth()], channelMap[dst.channels()]);
+
+    Mat mat(1, 1, dst.type(), scalar);
+
 #ifdef CL_VERSION_1_2
-    //this enables backwards portability to
-    //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
-    if(Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
+    // this enables backwards portability to
+    // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
+    if (Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
        dst.offset == 0 && dst.cols == dst.wholecols)
    {
+        const int sizeofMap[][7] =
+            {
+                { sizeof(cl_uchar) , sizeof(cl_char) , sizeof(cl_ushort) , sizeof(cl_short) , sizeof(cl_int) , sizeof(cl_float) , sizeof(cl_double)  },
+                { sizeof(cl_uchar2), sizeof(cl_char2), sizeof(cl_ushort2), sizeof(cl_short2), sizeof(cl_int2), sizeof(cl_float2), sizeof(cl_double2) },
+                { 0                , 0               , 0                 , 0                , 0              , 0                ,  0                 },
+                { sizeof(cl_uchar4), sizeof(cl_char4), sizeof(cl_ushort4), sizeof(cl_short4), sizeof(cl_int4), sizeof(cl_float4), sizeof(cl_double4) },
+            };
+        int sizeofGeneric = sizeofMap[dst.oclchannels() - 1][dst.depth()];
+
        clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(),
-            (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
+                            (cl_mem)dst.data, (void*)mat.data, sizeofGeneric,
+                            0, dst.step * dst.rows, 0, NULL, NULL);
    }
    else
 #endif
    {
+        oclMat m(mat);
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void*)&m.data ));
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
+        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
+
        openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
-            localThreads, args, -1, -1, compile_option);
+            localThreads, args, -1, -1, buildOptions.c_str());
    }
 }

@ -635,161 +491,16 @@ static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const o
 {
    CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols);
    std::vector<std::pair<size_t , const void *> > args;
-    size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3];
-    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
-    globalThreads[2] = 1;
+    size_t localThreads[3] = { 16, 16, 1 };
+    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
    int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
-    char compile_option[32];
-    union sc
-    {
-        cl_uchar4 uval;
-        cl_char4  cval;
-        cl_ushort4 usval;
-        cl_short4 shval;
-        cl_int4 ival;
-        cl_float4 fval;
-        cl_double4 dval;
-    } val;
-    switch(dst.depth())
-    {
-    case CV_8U:
-        val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
-        val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
-        val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
-        val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=uchar");
-            args.push_back( std::make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=uchar4");
-            args.push_back( std::make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_8S:
-        val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
-        val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
-        val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
-        val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=char");
-            args.push_back( std::make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=char4");
-            args.push_back( std::make_pair( sizeof(cl_char4) , (void *)&val.cval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_16U:
-        val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
-        val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
-        val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
-        val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=ushort");
-            args.push_back( std::make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=ushort4");
-            args.push_back( std::make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_16S:
-        val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
-        val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
-        val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
-        val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=short");
-            args.push_back( std::make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=short4");
-            args.push_back( std::make_pair( sizeof(cl_short4) , (void *)&val.shval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_32S:
-        val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
-        val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
-        val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
-        val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=int");
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=int4");
-            args.push_back( std::make_pair( sizeof(cl_int4) , (void *)&val.ival ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_32F:
-        val.fval.s[0] = scalar.val[0];
-        val.fval.s[1] = scalar.val[1];
-        val.fval.s[2] = scalar.val[2];
-        val.fval.s[3] = scalar.val[3];
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=float");
-            args.push_back( std::make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=float4");
-            args.push_back( std::make_pair( sizeof(cl_float4) , (void *)&val.fval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_64F:
-        val.dval.s[0] = scalar.val[0];
-        val.dval.s[1] = scalar.val[1];
-        val.dval.s[2] = scalar.val[2];
-        val.dval.s[3] = scalar.val[3];
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=double");
-            args.push_back( std::make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=double4");
-            args.push_back( std::make_pair( sizeof(cl_double4) , (void *)&val.dval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    default:
-        CV_Error(Error::StsUnsupportedFormat, "unknown depth");
-    }
+
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    const char channelMap[] = { ' ', ' ', '2', '4', '4' };
+    std::string buildOptions = format("-D GENTYPE=%s%c", typeMap[dst.depth()], channelMap[dst.channels()]);
+
+    oclMat m(Mat(1, 1, dst.type(), scalar));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&m.data ));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
@ -799,38 +510,21 @@ static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const o
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
    openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads,
-                        localThreads, args, -1, -1, compile_option);
+                        localThreads, args, -1, -1, buildOptions.c_str());
 }

+// Fills this matrix with `scalar`, optionally restricted by `mask`.
+// An empty `mask` routes to set_to_withoutmask_run; a non-empty one routes to
+// set_to_withmask_run. `mask` must be of type CV_8UC1 and this matrix's depth
+// must lie in [0, 6]. Returns a reference to this matrix.
 oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
 {
-    //cout << "cv::ocl::oclMat::setTo()" << endl;
    CV_Assert(mask.type() == CV_8UC1);
    CV_Assert( this->depth() >= 0 && this->depth() <= 6 );
    CV_DbgAssert( !this->empty());
-    //cl_int status;
-    //cl_mem mem;
-    //mem = clCreateBuffer(this->clCxt->clContext,CL_MEM_READ_WRITE,
-    //                   sizeof(double)*4,NULL,&status);
-    //openCLVerifyCall(status);
-    //double* s =  (double *)scalar.val;
-    //openCLSafeCall(clEnqueueWriteBuffer(this->clCxt->clCmdQueue,
-    //                   (cl_mem)mem,1,0,sizeof(double)*4,s,0,0,0));
    if (mask.empty())
    {
-        if(type() == CV_8UC1)
-        {
-            set_to_withoutmask_run(*this, scalar, "set_to_without_mask_C1_D0");
-        }
-        else
-        {
-            set_to_withoutmask_run(*this, scalar, "set_to_without_mask");
-        }
+        // CV_8UC1 matrices use a dedicated kernel name; every other type
+        // uses the generic "set_to_without_mask" kernel.
+        set_to_withoutmask_run(*this, scalar, type() == CV_8UC1 ?
+                                   "set_to_without_mask_C1_D0" : "set_to_without_mask");
    }
    else
-    {
        set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
-    }

    return *this;
 }
@ -845,79 +539,38 @@ oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
    oclMat hdr = *this;

    int cn = oclchannels();
-
    if (new_cn == 0)
-
        new_cn = cn;

-
-
    int total_width = cols * cn;
-
-
-
    if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
-
        new_rows = rows * total_width / new_cn;

-
-
    if (new_rows != 0 && new_rows != rows)
-
    {
-
        int total_size = total_width * rows;

-
-
        if (!isContinuous())
-
            CV_Error(Error::BadStep, "The matrix is not continuous, thus its number of rows can not be changed");

-
-
        if ((unsigned)new_rows > (unsigned)total_size)
-
            CV_Error(Error::StsOutOfRange, "Bad new number of rows");

-
-
        total_width = total_size / new_rows;
-
-
-
        if (total_width * new_rows != total_size)
-
            CV_Error(Error::StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");

-
-
        hdr.rows = new_rows;
-
        hdr.step = total_width * elemSize1();
-
    }

-
-
    int new_width = total_width / new_cn;
-
-
-
    if (new_width * new_cn != total_width)
-
        CV_Error(Error::BadNumChannels, "The total width is not divisible by the new number of channels");

-
-
    hdr.cols = new_width;
-
    hdr.wholecols = new_width;
-
    hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
-
-
-
    return hdr;

 }
@ -973,7 +626,6 @@ void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,

 void cv::ocl::oclMat::release()
 {
-    //cout << "cv::ocl::oclMat::release()" << endl;
    if( refcount && CV_XADD(refcount, -1) == 1 )
    {
        fastFree(refcount);
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@ -71,12 +71,6 @@ namespace cv
 {
    namespace ocl
    {
-
-        inline int divUp(int total, int grain)
-        {
-            return (total + grain - 1) / grain;
-        }
-
        // provide additional methods for the user to interact with the command queue after a task is fired
        static void openCLExecuteKernel_2(Context *clCxt , const char **source, String kernelName, size_t globalThreads[3],
                                   size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
--- a/modules/ocl/src/opencl/arithm_2_mat.cl
+++ b/modules/ocl/src/opencl/arithm_2_mat.cl
@ -1,158 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-/**************************************PUBLICFUNC*************************************/
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
-#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
-#define CV_PI   3.1415926535897932384626433832795
-
-char round_char(double v){
-    char v1=(char)v;
-    return convert_char_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-unsigned char round_uchar(double v){
-    unsigned char v1=(unsigned char)v;
-    return convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-short round_short(double v){
-    short v1=(short)v;
-    return convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-unsigned short round_ushort(double v){
-    unsigned short v1=(unsigned short)v;
-    return convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-int round_int(double v){
-    int v1=(int)v;
-    return convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-
-char round2_char(double v){
-    char v1=(char)v;
-    if((v-v1)==0.5&&v1%2==0)
-        return v1;
-    else
-        return convert_char_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-unsigned char round2_uchar(double v){
-    unsigned char v1=(unsigned char)v;
-    if((v-v1)==0.5&&v1%2==0)
-        return v1;
-    else
-        return convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-short round2_short(double v){
-    short v1=(short)v;
-    if((v-v1)==0.5&&v1%2==0)
-        return v1;
-    else
-        return convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-unsigned short round2_ushort(double v){
-    unsigned short v1=(unsigned short)v;
-    if((v-v1)==0.5&&v1%2==0)
-        return v1;
-    else
-        return convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-int round2_int(double v){
-    int v1=(int)v;
-    if((v-v1)==0.5&&v1%2==0)
-        return v1;
-    else
-        return convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-
-/*****************************************EXP***************************************/
-__kernel void arithm_op_exp_5 (int rows,int cols,int srcStep,__global float *src1Mat,
-                             __global float * dstMat,int channels)
-{
-    size_t x = get_global_id(0);
-    size_t y = get_global_id(1);
-    if (x < cols && y < rows)
-    {
-        size_t idx = y * ( srcStep >> 2 ) + x;
-        dstMat[idx] = (float)exp((float)src1Mat[idx]);
-    }
-}
-__kernel void arithm_op_exp_6 (int rows,int cols,int srcStep,__global double *src1Mat,
-                             __global double * dstMat,int channels)
-{
-    size_t x = get_global_id(0);
-    size_t y = get_global_id(1);
-    if (x < cols && y < rows)
-    {
-        size_t idx = y * ( srcStep >> 3 ) + x;
-        dstMat[idx] = exp(src1Mat[idx]);
-    }
-}
-
-/*****************************************LOG***************************************/
-__kernel void arithm_op_log_5 (int rows,int cols,int srcStep,__global float *src1Mat,
-                             __global float * dstMat,int channels)
-{
-    size_t x = get_global_id(0);
-    size_t y = get_global_id(1);
-    if (x < cols && y < rows)
-    {
-        size_t idx = y * ( srcStep >> 2 ) + x;
-        dstMat[idx] =(float) log((float)src1Mat[idx]);
-    }
-}
-__kernel void arithm_op_log_6 (int rows,int cols,int srcStep,__global double *src1Mat,
-                             __global double * dstMat,int channels)
-{
-    size_t x = get_global_id(0);
-    size_t y = get_global_id(1);
-    if (x < cols && y < rows)
-    {
-        size_t idx = y * ( srcStep >> 3 ) + x;
-        dstMat[idx] = log(src1Mat[idx]);
-    }
-}
--- a/modules/ocl/src/opencl/arithm_LUT.cl
+++ b/modules/ocl/src/opencl/arithm_LUT.cl
@ -38,125 +38,66 @@
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif

-__kernel
-void LUT_C1_D0( __global uchar *dst,
-      __global const uchar *src,
-      __constant uchar *table,
-      int rows,
-      int cols,
-      int channels,
-      int whole_rows,
-      int whole_cols,
-      int src_offset,
-      int dst_offset,
-      int lut_offset,
-      int src_step,
-      int dst_step)
+// Table lookup, one element per work-item: for every (x1, y) with
+// x1 < cols1 and y < rows, dst[...] = lut[lut_offset1 + src[...]].
+// The offset and step arguments are used directly as element indices into
+// the typed src/dst/lut pointers (no byte scaling is applied here).
+// srcT and dstT are not defined in this kernel source; presumably they are
+// supplied as -D build options by the host -- confirm against the caller.
+__kernel void LUT_C1( __global const srcT * src, __global const dstT *lut,
+      __global dstT *dst,
+      int cols1, int rows,
+      int src_offset1,
+      int lut_offset1,
+      int dst_offset1,
+      int src_step1, int dst_step1)
 {
-    int gidx = get_global_id(0)<<2;
-    int gidy = get_global_id(1);
-    int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
+    int x1 = get_global_id(0);
+    int y = get_global_id(1);

-    __local uchar l[256];
-    l[(lidy<<4)+lidx] = table[(lidy<<4)+lidx+lut_offset];
-    //mem_fence(CLK_LOCAL_MEM_FENCE);
-
-
-    //clamp(gidx,mask,cols-1);
-    gidx = gidx >= cols-4?cols-4:gidx;
-    gidy = gidy >= rows?rows-1:gidy;
-
-    int src_index = src_offset + mad24(gidy,src_step,gidx);
-    int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
-    uchar4 p,q;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    p.x = src[src_index];
-    p.y = src[src_index+1];
-    p.z = src[src_index+2];
-    p.w = src[src_index+3];
+    // Work-items outside the cols1 x rows range do nothing.
+    if (x1 < cols1 && y < rows)
+    {
+        int src_index = mad24(y, src_step1, src_offset1 + x1);
+        int dst_index = mad24(y, dst_step1, dst_offset1 + x1);

-    q.x = l[p.x];
-    q.y = l[p.y];
-    q.z = l[p.z];
-    q.w = l[p.w];
-    *(__global uchar4*)(dst + dst_index) = q;
+        dst[dst_index] = lut[lut_offset1 + src[src_index]];
+    }
 }

-__kernel
-void LUT2_C1_D0( __global uchar *dst,
-      __global const uchar *src,
-      __constant uchar *table,
-      int rows,
-      int precols,
-      int channels,
-      int whole_rows,
-      int cols,
-      int src_offset,
-      int dst_offset,
-      int lut_offset,
-      int src_step,
-      int dst_step)
+// Table lookup, two consecutive elements per work-item, starting at
+// x1 = get_global_id(0) * 2. Element k (k = 0, 1) is looked up at
+// lut[lut_offset1 + (value << 1) + k], i.e. the table is indexed with two
+// entries per source value (presumably an interleaved 2-channel table --
+// confirm against the host code).
+// srcT and dstT are not defined in this kernel source; presumably they are
+// supplied as -D build options by the host -- confirm against the caller.
+__kernel void LUT_C2( __global const srcT * src, __global const dstT *lut,
+      __global dstT *dst,
+      int cols1, int rows,
+      int src_offset1,
+      int lut_offset1,
+      int dst_offset1,
+      int src_step1, int dst_step1)
 {
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    //int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
-
-    __local uchar l[256];
-    l[lidy] = table[lidy+lut_offset];
-    //mem_fence(CLK_LOCAL_MEM_FENCE);
+    int x1 = get_global_id(0) << 1;
+    int y = get_global_id(1);

+    // Work-items whose first element lies outside the range do nothing.
+    if (x1 < cols1 && y < rows)
+    {
+        int src_index = mad24(y, src_step1, src_offset1 + x1);
+        int dst_index = mad24(y, dst_step1, dst_offset1 + x1);

-    //clamp(gidx,mask,cols-1);
-    gidx = gidx >= precols ? cols+gidx : gidx;
-    gidy = gidy >= rows?rows-1:gidy;
-
-    int src_index = src_offset + mad24(gidy,src_step,gidx);
-    int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
-    //uchar4 p,q;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    uchar p = src[src_index];
-    uchar q = l[p];
-    dst[dst_index] = q;
+        // The second element is only looked up when x1 + 1 < cols1; otherwise
+        // the existing dst value is read and written back unchanged.
+        // NOTE(review): that fallback still touches dst[dst_index + 1] --
+        // confirm cols1 is always even so the branch is never taken.
+        dst[dst_index    ] =                  lut[lut_offset1 + (src[src_index    ] << 1)    ];
+        dst[dst_index + 1] = x1 + 1 < cols1 ? lut[lut_offset1 + (src[src_index + 1] << 1) + 1] : dst[dst_index + 1];
+    }
 }

-__kernel
-void LUT_C4_D0( __global uchar4 *dst,
-      __global uchar4 *src,
-      __constant uchar *table,
-      int rows,
-      int cols,
-      int channels,
-      int whole_rows,
-      int whole_cols,
-      int src_offset,
-      int dst_offset,
-      int lut_offset,
-      int src_step,
-      int dst_step)
+// Table lookup, four consecutive elements per work-item, starting at
+// x1 = get_global_id(0) * 4. Element k (k = 0..3) is looked up at
+// lut[lut_offset1 + (value << 2) + k], i.e. the table is indexed with four
+// entries per source value (presumably an interleaved 4-channel table --
+// confirm against the host code).
+// srcT and dstT are not defined in this kernel source; presumably they are
+// supplied as -D build options by the host -- confirm against the caller.
+__kernel void LUT_C4( __global const srcT * src, __global const dstT *lut,
+      __global dstT *dst,
+      int cols1, int rows,
+      int src_offset1,
+      int lut_offset1,
+      int dst_offset1,
+      int src_step1, int dst_step1)
 {
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
+    int x1 = get_global_id(0) << 2;
+    int y = get_global_id(1);

-    int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
-
-    int src_index = mad24(gidy,src_step,gidx+src_offset);
-    int dst_index = mad24(gidy,dst_step,gidx+dst_offset);
-    __local uchar l[256];
-    l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
-    //mem_fence(CLK_LOCAL_MEM_FENCE);
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(gidx<cols && gidy<rows)
+    // Work-items whose first element lies outside the range do nothing.
+    if (x1 < cols1 && y < rows)
    {
-        uchar4 p = src[src_index];
-        uchar4 q;
-        q.x = l[p.x];
-        q.y = l[p.y];
-        q.z = l[p.z];
-        q.w = l[p.w];
-        dst[dst_index] = q;
+        int src_index = mad24(y, src_step1, src_offset1 + x1);
+        int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
+
+        // Elements 1..3 are only looked up when they fall below cols1;
+        // otherwise the existing dst value is read and written back unchanged.
+        // NOTE(review): that fallback still touches dst past cols1 -- confirm
+        // cols1 is always a multiple of 4 so the branch is never taken.
+        dst[dst_index    ] =                  lut[lut_offset1 + (src[src_index    ] << 2)    ];
+        dst[dst_index + 1] = x1 + 1 < cols1 ? lut[lut_offset1 + (src[src_index + 1] << 2) + 1] : dst[dst_index + 1];
+        dst[dst_index + 2] = x1 + 2 < cols1 ? lut[lut_offset1 + (src[src_index + 2] << 2) + 2] : dst[dst_index + 2];
+        dst[dst_index + 3] = x1 + 3 < cols1 ? lut[lut_offset1 + (src[src_index + 3] << 2) + 3] : dst[dst_index + 3];
    }
 }
--- a/modules/ocl/src/opencl/arithm_absdiff.cl
+++ b/modules/ocl/src/opencl/arithm_absdiff.cl
@ -1,970 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////absdiff////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-/**************************************adddiff *************************************/
-__kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                 __global uchar *src2, int src2_step, int src2_offset,
-                                 __global uchar *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = abs_diff(src1_data, src2_data);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-__kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                 __global ushort *src2, int src2_step, int src2_offset,
-                                 __global ushort *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        ushort4 tmp_data = abs_diff(src1_data, src2_data);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-__kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_offset,
-                                 __global short *src2, int src2_step, int src2_offset,
-                                 __global short *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-
-        short4  dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        ushort4 tmp = abs_diff(src1_data, src2_data);
-        short4  tmp_data = convert_short4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_absdiff_D4 (__global int *src1, int src1_step, int src1_offset,
-                                 __global int *src2, int src2_step, int src2_offset,
-                                 __global int *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int data2 = *((__global int *)((__global char *)src2 + src2_index));
-        uint tmp = abs_diff(data1, data2);
-        int  tmp_data = convert_int_sat(tmp);
-
-        *((__global int *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-__kernel void arithm_absdiff_D5 (__global float *src1, int src1_step, int src1_offset,
-                                 __global float *src2, int src2_step, int src2_offset,
-                                 __global float *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float tmp = fabs(data1 - data2);
-
-        *((__global float *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_absdiff_D6 (__global double *src1, int src1_step, int src1_offset,
-                                 __global double *src2, int src2_step, int src2_offset,
-                                 __global double *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        double data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double data2 = *((__global double *)((__global char *)src2 + src2_index));
-        double tmp = fabs(data1-data2);
-
-        *((__global double *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-#endif
-
-/**************************************absdiff with scalar**************************************/
-__kernel void arithm_s_absdiff_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                      __global   uchar *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));
-
-        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                      __global   ushort *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        ushort2 tmp_data = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data), src2_data));
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                      __global   short *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-
-        ushort2 tmp = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data), src2_data));
-        short2 tmp_data = convert_short2_sat(tmp);
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C1_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                      __global   int *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = src2.x;
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        uint tmp_data = abs_diff(src_data1, src_data2);
-        int  data = convert_int_sat(tmp_data);
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C1_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                      __global   float *dst,  int dst_step,  int dst_offset,
-                                      float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float src_data2 = src2.x;
-        float dst_data  = *((__global float *)((__global char *)dst  + dst_index));
-
-        float data = fabs(src_data1 - src_data2);
-
-        *((__global float *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_absdiff_C1_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                      __global   double *dst,  int dst_step,  int dst_offset,
-                                      double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double src2_data = src2.x;
-        double dst_data  = *((__global double *)((__global char *)dst  + dst_index));
-
-        double data = fabs(src_data1 - src2_data);
-
-        *((__global double *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_s_absdiff_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                      __global   uchar *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));
-
-        data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
-        data.zw = (dst_index + 2 <  dst_end  ) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                      __global   ushort *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        ushort2 data = convert_ushort2_sat( abs_diff(convert_int2_sat(src_data1), src_data2));
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                      __global   short *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
-
-        ushort2 tmp = convert_ushort2_sat(abs_diff(convert_int2_sat(src_data1), src_data2));
-        short2 data = convert_short2_sat(tmp);
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                      __global   int *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = convert_int2_sat(abs_diff(src_data1, src_data2));
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C2_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                      __global   float *dst,  int dst_step,  int dst_offset,
-                                      float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
-        float2 src_data2 = (float2)(src2.x, src2.y);
-        float2 dst_data  = *((__global float2 *)((__global char *)dst  + dst_index));
-
-        float2 data = fabs(src_data1 - src_data2);
-        *((__global float2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_absdiff_C2_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                      __global   double *dst,  int dst_step,  int dst_offset,
-                                      double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
-        double2 src_data2 = (double2)(src2.x, src2.y);
-        double2 dst_data  = *((__global double2 *)((__global char *)dst  + dst_index));
-
-        double2 data = fabs(src_data1 - src_data2);
-
-        *((__global double2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-__kernel void arithm_s_absdiff_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                      __global   uchar *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
-        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
-        uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
-        uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
-        uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
-        int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
-        int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
-        int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
-
-        uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
-        uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
-        uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
-        uchar4 tmp_data_0 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_0), src2_data_0));
-        uchar4 tmp_data_1 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_1), src2_data_1));
-        uchar4 tmp_data_2 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_2), src2_data_2));
-
-        data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
-        data_0.w   = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
-                     ? tmp_data_0.w : data_0.w;
-
-        data_1.xy  = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
-                     ? tmp_data_1.xy : data_1.xy;
-        data_1.zw  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.zw : data_1.zw;
-
-        data_2.x   = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.x : data_2.x;
-        data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
-                     ? tmp_data_2.yzw : data_2.yzw;
-
-        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
-        *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
-        *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
-    }
-}
-__kernel void arithm_s_absdiff_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                      __global   ushort *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
-        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
-        ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
-        ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
-        ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
-        int2 src2_data_0 = (int2)(src2.x, src2.y);
-        int2 src2_data_1 = (int2)(src2.z, src2.x);
-        int2 src2_data_2 = (int2)(src2.y, src2.z);
-
-        ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
-        ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
-        ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
-        ushort2 tmp_data_0 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_0), src2_data_0));
-        ushort2 tmp_data_1 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_1), src2_data_1));
-        ushort2 tmp_data_2 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_2), src2_data_2));
-
-        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
-        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                    ? tmp_data_1.x : data_1.x;
-        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_1.y : data_1.y;
-
-        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_2.xy : data_2.xy;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
-    }
-}
-__kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                      __global   short *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
-        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
-        short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
-        short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
-        short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
-        int2 src2_data_0 = (int2)(src2.x, src2.y);
-        int2 src2_data_1 = (int2)(src2.z, src2.x);
-        int2 src2_data_2 = (int2)(src2.y, src2.z);
-
-        short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
-        short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
-        short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
-        short2 tmp_data_0 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_0), src2_data_0));
-        short2 tmp_data_1 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_1), src2_data_1));
-        short2 tmp_data_2 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_2), src2_data_2));
-
-        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
-        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                    ? tmp_data_1.x : data_1.x;
-        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_1.y : data_1.y;
-
-        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_2.xy : data_2.xy;
-
-        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
-    }
-}
-__kernel void arithm_s_absdiff_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                      __global   int *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));
-
-        int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
-        int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
-        int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
-        int src2_data_0 = src2.x;
-        int src2_data_1 = src2.y;
-        int src2_data_2 = src2.z;
-
-        int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
-        int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
-        int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
-        int tmp_data_0 = convert_int_sat(abs_diff(src1_data_0, src2_data_0));
-        int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1));
-        int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2));
-
-        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
-    }
-}
-__kernel void arithm_s_absdiff_C3_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                      __global   float *dst,  int dst_step,  int dst_offset,
-                                      float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));
-
-        float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
-        float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
-        float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
-
-        float src2_data_0 = src2.x;
-        float src2_data_1 = src2.y;
-        float src2_data_2 = src2.z;
-
-        float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
-        float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
-        float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
-
-        float tmp_data_0 = fabs(src1_data_0 - src2_data_0);
-        float tmp_data_1 = fabs(src1_data_1 - src2_data_1);
-        float tmp_data_2 = fabs(src1_data_2 - src2_data_2);
-
-        *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-        *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-        *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_absdiff_C3_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                      __global   double *dst,  int dst_step,  int dst_offset,
-                                      double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 24));
-
-        double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
-        double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
-        double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
-
-        double src2_data_0 = src2.x;
-        double src2_data_1 = src2.y;
-        double src2_data_2 = src2.z;
-
-        double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
-        double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
-        double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
-
-        double tmp_data_0 = fabs(src1_data_0 - src2_data_0);
-        double tmp_data_1 = fabs(src1_data_1 - src2_data_1);
-        double tmp_data_2 = fabs(src1_data_2 - src2_data_2);
-
-        *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-        *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-        *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
-    }
-}
-#endif
-__kernel void arithm_s_absdiff_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                      __global   uchar *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-
-        uchar4 data = convert_uchar4_sat(abs_diff(convert_int4_sat(src_data1), src2));
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                      __global   ushort *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-
-        ushort4 data = convert_ushort4_sat(abs_diff(convert_int4_sat(src_data1), src2));
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                      __global   short *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-
-        short4 data = convert_short4_sat(abs_diff(convert_int4_sat(src_data1), src2));
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                      __global   int *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-
-        int4 data = convert_int4_sat(abs_diff(src_data1, src2));
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C4_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                      __global   float *dst,  int dst_step,  int dst_offset,
-                                      float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
-
-        float4 data = fabs(src_data1 - src2);
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_absdiff_C4_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                      __global   double *dst,  int dst_step,  int dst_offset,
-                                      double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
-
-        double4 data = fabs(src_data1 - src2);
-
-        *((__global double4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
--- a/modules/ocl/src/opencl/arithm_add.cl
+++ b/modules/ocl/src/opencl/arithm_add.cl
@ -52,809 +52,105 @@
 #endif
 #endif

-#ifdef ARITHM_ADD
-  #define ARITHM_OP(A,B) ((A)+(B))
-#elif defined ARITHM_SUB
-  #define ARITHM_OP(A,B) ((A)-(B))
-#endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////ADD////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-/**************************************add without mask**************************************/
-__kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        short4 tmp      = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-__kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        int4    tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
-        ushort4 tmp_data = convert_ushort4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-__kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        int4   tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
-        short4 tmp_data = convert_short4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_add_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int data2 = *((__global int *)((__global char *)src2 + src2_index));
-        long tmp  = ARITHM_OP((long)(data1), (long)(data2));
-
-        *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
-    }
-}
-__kernel void arithm_add_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *src2, int src2_step, int src2_offset,
-                             __global float *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float tmp = ARITHM_OP(data1, data2);
-
-        *((__global float *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_add_D6 (__global double *src1, int src1_step, int src1_offset,
-                             __global double *src2, int src2_step, int src2_offset,
-                             __global double *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        double data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double data2 = *((__global double *)((__global char *)src2 + src2_index));
-
-        *((__global double *)((__global char *)dst + dst_index)) = ARITHM_OP(data1, data2);
-    }
-}
-#endif
-
-/**************************************add with mask**************************************/
-__kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        int mask_index_fix = mask_index < 0 ? 0 : mask_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        uchar4 mask_data = vload4(0, mask + mask_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        if(mask_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
-            mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index));
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
-        ushort2 tmp_data = convert_ushort2_sat(tmp);
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
-        short2 tmp_data = convert_short2_sat(tmp);
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C1_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
-        data = mask_data ? data : dst_data;
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset,
-                                          __global float *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global float *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float src_data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float dst_data  = *((__global float *)((__global char *)dst  + dst_index));
-
-        float data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset,
-                                          __global double *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global double *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double src_data2 = *((__global double *)((__global char *)src2 + src2_index));
-        double dst_data  = *((__global double *)((__global char *)dst  + dst_index));
-
-        double data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global double *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        short4   tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
-        ushort2 data = convert_ushort2_sat(tmp);
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
-        short2 data = convert_short2_sat(tmp);
-        data = mask_data ? data : dst_data;
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C2_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int    *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset,
-                                          __global float *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global float *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
-        float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index));
-        float2 dst_data  = *((__global float2 *)((__global char *)dst  + dst_index));
-
-        float2 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
+///////////////////////////////////////////// ADD ////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////

-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset,
-                                          __global double *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global double *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_offset,
+                         __global T *src2, int src2_step, int src2_offset,
+                         __global T *dst, int dst_step, int dst_offset,
+                         int cols, int rows)
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
+        int src1_index = mad24(y, src1_step, x + src1_offset);
+        int src2_index = mad24(y, src2_step, x + src2_offset);
+        int dst_index  = mad24(y, dst_step, x + dst_offset);

-        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
-        double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index));
-        double2 dst_data  = *((__global double2 *)((__global char *)dst  + dst_index));
-
-        double2 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global double2 *)((__global char *)dst + dst_index)) = data;
+        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index]));
    }
 }
-#endif

-__kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_binary_op_mat_div(__global T *src1, int src1_step, int src1_offset, // element-wise division: dst = src1 / src2, with 0 written where src2 is 0
+                         __global T *src2, int src2_step, int src2_offset, // T, WT, convertToT and convertToWT are not defined in this hunk; presumably supplied at kernel build time - TODO confirm
+                         __global T *dst, int dst_step, int dst_offset, // steps and offsets are added to indices of T-typed pointers, so they count elements of T
+                         int cols, int rows) // work-items with x >= cols or y >= rows do nothing
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
+        int src1_index = mad24(y, src1_step, x + src1_offset); // mad24(a, b, c) = a * b + c, i.e. y * step + x + offset
+        int src2_index = mad24(y, src2_step, x + src2_offset);
+        int dst_index  = mad24(y, dst_step, x + dst_offset);

-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-        uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-
-        uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_short4_sat(src_data1), convert_short4_sat(src_data2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
+        T zero = (T)(0); // typed zero: used for the divisor test and as the result of a division by zero
+        dst[dst_index] = src2[src2_index] == zero ? zero : convertToT(convertToWT(src1[src1_index]) / convertToWT(src2[src2_index])); // divide in WT, then convert the quotient back to T
    }
 }
-__kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

-        uchar mask_data = *(mask + mask_index);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-        ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
-        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst  + dst_index));
-
-        ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_absdiff_mat(__global T *src1, int src1_step, int src1_offset, // element-wise absolute difference: dst = |src1 - src2|
+                         __global T *src2, int src2_step, int src2_offset, // T, WT, convertToT and convertToWT are not defined in this hunk; presumably supplied at kernel build time - TODO confirm
+                         __global T *dst, int dst_step, int dst_offset, // steps and offsets are added to indices of T-typed pointers, so they count elements of T
+                         int cols, int rows) // work-items with x >= cols or y >= rows do nothing
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
-        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));
-
-        short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
-        data = mask_data ? data : dst_data;
+        int src1_index = mad24(y, src1_step, x + src1_offset); // mad24(a, b, c) = a * b + c, i.e. y * step + x + offset
+        int src2_index = mad24(y, src2_step, x + src2_offset);
+        int dst_index  = mad24(y, dst_step, x + dst_offset);

-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
+        WT value = convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]); // subtract in WT; the difference may be negative here
+        value = value > (WT)(0) ? value : -value; // take the magnitude by negating non-positive differences
+        dst[dst_index] = convertToT(value); // convert the magnitude back to the storage type T
    }
 }
-__kernel void arithm_add_with_mask_C4_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);

-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-        int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
-        int4 dst_data  = *((__global int4 *)((__global char *)dst  + dst_index));
-
-        int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src_data2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
-                                          __global float *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global float *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+// element-wise multiply of two mats with a scale factor: dst = src1 * scalar[0] * src2
+__kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int src1_offset, // T, WT, convertToT and convertToWT are not defined in this hunk; presumably supplied at kernel build time - TODO confirm
+                                __global T *src2, int src2_step, int src2_offset, // steps and offsets are added to indices of T-typed pointers, so they count elements of T
+                               __global WT *scalar, // only scalar[0] is read
+                               __global T *dst, int dst_step,  int dst_offset,
+                               int cols, int rows) // work-items with x >= cols or y >= rows do nothing
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
+        int src1_index = mad24(y, src1_step, x + src1_offset); // mad24(a, b, c) = a * b + c, i.e. y * step + x + offset
+        int src2_index = mad24(y, src2_step, x + src2_offset);
+        int dst_index = mad24(y, dst_step, x + dst_offset);

-        uchar mask_data = *(mask + mask_index);
-
-        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
-        float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index));
-        float4 dst_data  = *((__global float4 *)((__global char *)dst  + dst_index));
-
-        float4 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = data;
+        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0] * convertToWT(src2[src2_index])); // evaluated left to right in WT: (src1 * scale) * src2, then converted back to T
    }
 }

-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
-                                          __global double *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global double *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+// element-wise divide of two mats with a scale factor: dst = src1 * scalar[0] / src2 (0 where src2 is 0)
+__kernel void arithm_binary_op_mat_scalar_div(__global T *src1, int src1_step, int src1_offset, // T, WT, convertToT and convertToWT are not defined in this hunk; presumably supplied at kernel build time - TODO confirm
+                                __global T *src2, int src2_step, int src2_offset, // steps and offsets are added to indices of T-typed pointers, so they count elements of T
+                               __global WT *scalar, // only scalar[0] is read
+                               __global T *dst, int dst_step,  int dst_offset,
+                               int cols, int rows) // work-items with x >= cols or y >= rows do nothing
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 5) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
+        int src1_index = mad24(y, src1_step, x + src1_offset); // mad24(a, b, c) = a * b + c, i.e. y * step + x + offset
+        int src2_index = mad24(y, src2_step, x + src2_offset);
+        int dst_index = mad24(y, dst_step, x + dst_offset);

-        uchar mask_data = *(mask + mask_index);
-
-        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
-        double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index));
-        double4 dst_data  = *((__global double4 *)((__global char *)dst  + dst_index));
-
-        double4 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global double4 *)((__global char *)dst + dst_index)) = data;
+        T zero = (T)(0); // typed zero: used for the divisor test and as the result of a division by zero
+        dst[dst_index] = src2[src2_index] == zero ? zero : // the divisor is tested in its stored type T, before any conversion
+            convertToT(convertToWT(src1[src1_index]) * scalar[0] / convertToWT(src2[src2_index])); // evaluated left to right in WT: (src1 * scale) / src2, then converted back to T
    }
 }
-#endif
--- a/modules/ocl/src/opencl/arithm_addWeighted.cl
+++ b/modules/ocl/src/opencl/arithm_addWeighted.cl
@ -42,392 +42,34 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #if defined (DOUBLE_SUPPORT)
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #elif defined (cl_amd_fp64)
 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif
-typedef double F;
-#else
-typedef float F;
 #endif
+
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////addWeighted//////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
-                              __global uchar *src2, int src2_step,int src2_offset,
-                              F alpha,F beta,F gama,
-                              __global uchar *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data ,src2_data;
-
-        src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
-        src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
-        src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
-        src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
-
-        src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
-        src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
-        src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
-        src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-//        short4 tmp      = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
-        short4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-        // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
-    }
-
-}
-
-
-
-__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
-                              __global ushort *src2, int src2_step,int src2_offset,
-                              F alpha,F beta,F gama,
-                              __global ushort *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-        int4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        ushort4 tmp_data = convert_ushort4_sat(tmp);
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-
-}
-
-
-__kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offset,
-                              __global short *src2,  int src2_step,int src2_offset,
-                              F alpha,F beta,F gama,
-                              __global short *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
-
-        if(src1_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-        int4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        short4 tmp_data = convert_short4_sat(tmp);
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-

-__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
-                              __global int *src2, int src2_step,int src2_offset,
-                              F alpha,F beta, F gama,
-                              __global int *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
+__kernel void addWeighted(__global T * src1, int src1_step1, int src1_offset1, // weighted sum: dst = convertToT(src1 * alpha + src2 * beta + gama)
+                              __global T * src2, int src2_step1, int src2_offset1, // T, WT and convertToT are not defined in this hunk; presumably supplied at kernel build time - TODO confirm
+                              __global T * dst, int dst_step1, int dst_offset1, // steps and offsets are added to indices of T-typed pointers, so they count elements of T
+                              WT alpha, WT beta, WT gama, // weights for src1 and src2, plus a constant added to every element
+                              int cols1, int rows) // work-items with x >= cols1 or y >= rows do nothing
 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if (x < cols && y < rows)
-
+    if (x < cols1 && y < rows) // one output element per in-range work-item
    {
+        int src1_index = mad24(y, src1_step1, x + src1_offset1); // mad24(a, b, c) = a * b + c, i.e. y * step + x + offset
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index = mad24(y, dst_step1, x + dst_offset1);

-        x = x << 2;
-
-#define bitOfInt  (sizeof(int)== 4 ? 2: 3)
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> bitOfInt) & 3)
-
-        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
-        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
-        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
-
-        if(src1_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
-        // double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-        float4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        int4 tmp_data = convert_int4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = dst_data;
+        dst[dst_index] = convertToT(src1[src1_index]*alpha + src2[src2_index]*beta + gama); // T operands are multiplied directly by the WT weights (no explicit convertToWT); only the final sum is converted to T
    }
-
 }
-
-
-__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
-                              __global float *src2, int src2_step,int src2_offset,
-                              F alpha,F beta, F gama,
-                              __global float *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2) & 3)
-
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        float4 src1_data = vload4(0, (__global float  *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
-        if(src1_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        //    double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-
-        // float4   tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
-        float4 tmp_data;
-        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        // float4 tmp_data = convert_float4(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
-                              __global double *src2, int src2_step,int src2_offset,
-                              F alpha,F beta, F gama,
-                              __global double *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 3) & 3)
-
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        double4 src1_data = vload4(0, (__global double  *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double  *)((__global char *)src2 + src2_index_fix));
-        double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
-        if(src1_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        //  double4   tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
-        double4 tmp_data;
-        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 16 >= dst_start) && (dst_index + 16 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 24 >= dst_start) && (dst_index + 24 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global double4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-#endif
--- a/modules/ocl/src/opencl/arithm_add_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_mask.cl
@ -0,0 +1,79 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////// add with mask //////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////
+
+__kernel void arithm_binary_op_mat_mask(__global T * src1, int src1_step, int src1_offset, // masked binary op: dst = src1 Operation src2, written only where mask is non-zero
+                              __global T * src2, int src2_step, int src2_offset, // T, WT, convertToT, convertToWT and Operation are not defined in this file; presumably supplied at kernel build time - TODO confirm
+                              __global uchar * mask, int mask_step, int mask_offset, // one uchar of mask is read per (x, y)
+                              __global T * dst, int dst_step, int dst_offset, // steps and offsets are added to indices of typed pointers, so they count elements
+                              int cols, int rows) // work-items with x >= cols or y >= rows do nothing
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int mask_index = mad24(y, mask_step, x + mask_offset); // mad24(a, b, c) = a * b + c, i.e. y * step + x + offset
+        if (mask[mask_index]) // where the mask is zero, the dst element is left untouched
+        {
+            int src1_index = mad24(y, src1_step, x + src1_offset);
+            int src2_index = mad24(y, src2_step, x + src2_offset);
+            int dst_index  = mad24(y, dst_step, dst_offset + x);
+
+            dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index])); // apply Operation in WT, then convert the result back to T
+        }
+    }
+}
--- a/modules/ocl/src/opencl/arithm_add_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar.cl
@ -51,463 +51,61 @@
 #endif
 #endif

-#ifdef ARITHM_ADD
-  #define ARITHM_OP(A,B) ((A)+(B))
-#elif defined ARITHM_SUB
-  #define ARITHM_OP(A,B) ((A)-(B))
-#endif
-/**************************************add with scalar without mask**************************************/
-__kernel void arithm_s_add_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
-        ushort2 tmp_data = convert_ushort2_sat(tmp);
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
-        short2 tmp_data = convert_short2_sat(tmp);
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C1_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = src2.x;
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C1_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                  __global   float *dst,  int dst_step,  int dst_offset,
-                                  float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float src_data2 = src2.x;
-        float dst_data  = *((__global float *)((__global char *)dst  + dst_index));
-
-        float data = ARITHM_OP(src_data1, src_data2);
-
-        *((__global float *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_C1_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                  __global   double *dst,  int dst_step,  int dst_offset,
-                                  double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double src2_data = src2.x;
-        double dst_data  = *((__global double *)((__global char *)dst  + dst_index));
-
-        double data = ARITHM_OP(src_data1, src2_data);
-
-        *((__global double *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_s_add_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
-        data.zw = (dst_index + 2 <  dst_end  ) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
-        ushort2 data = convert_ushort2_sat(tmp);
+// Element-wise binary operation between a matrix and a scalar: one work-item handles one (x, y) element.
+// dst = convertToT(convertToWT(src1) Operation scalar[0]); only element 0 of the scalar buffer is read.
+// T, WT, Operation and the convert helpers are not defined in this hunk - presumably build-time macros; TODO confirm.

-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1_offset,
+                                 __global WT *scalar,
+                                 __global T *dst,  int dst_step,  int dst_offset,
+                                 int cols, int rows)
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
+        int src1_index = mad24(y, src1_step, x + src1_offset); // used directly as an index on a T pointer
+        int dst_index = mad24(y, dst_step, x + dst_offset);

-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
-        short2 data = convert_short2_sat(tmp);
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
+        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]); // scalar[0] is used as-is, without convertToWT
    }
 }
-__kernel void arithm_s_add_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{

-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C2_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                  __global   float *dst,  int dst_step,  int dst_offset,
-                                  float4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_absdiff_scalar(__global T *src1, int src1_step, int src1_offset,
+                         __global WT *src2, // only src2[0] is read
+                         __global T *dst, int dst_step, int dst_offset,
+                         int cols, int rows) // per in-range (x, y): dst = convertToT(|convertToWT(src1) - src2[0]|)
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
-        float2 src_data2 = (float2)(src2.x, src2.y);
-        float2 dst_data  = *((__global float2 *)((__global char *)dst  + dst_index));
+        int src1_index = mad24(y, src1_step, x + src1_offset); // used directly as an index on a T pointer
+        int dst_index  = mad24(y, dst_step, x + dst_offset);

-        float2 data = ARITHM_OP(src_data1, src_data2);
-        *((__global float2 *)((__global char *)dst + dst_index)) = data;
+        WT value = convertToWT(src1[src1_index]) - src2[0]; // the difference is formed in the WT type
+        value = value > (WT)(0) ? value : -value; // keep a positive difference, negate anything else
+        dst[dst_index] = convertToT(value); // T, WT and the convert helpers are not defined in this hunk - presumably build-time macros; TODO confirm
    }
 }

-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_C2_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                  __global   double *dst,  int dst_step,  int dst_offset,
-                                  double4 src2, int rows, int cols, int dst_step1)
+// Divides a scalar by a matrix element-wise: dst = convertToT(scalar[0] / convertToWT(src1)); a source element equal to zero produces zero.
+__kernel void arithm_binary_op_scalar_div(__global T *src1, int src1_step, int src1_offset,
+                               __global WT *scalar,
+                               __global T *dst,  int dst_step,  int dst_offset,
+                               int cols, int rows)
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
-        double2 src_data2 = (double2)(src2.x, src2.y);
-        double2 dst_data  = *((__global double2 *)((__global char *)dst  + dst_index));
+        int src1_index = mad24(y, src1_step, x + src1_offset); // used directly as an index on a T pointer
+        int dst_index = mad24(y, dst_step, x + dst_offset);

-        double2 data = ARITHM_OP(src_data1, src_data2);
-
-        *((__global double2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_s_add_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-
-        uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
+        T zero = (T)(0); // zero in the element type, used both as the test value and as the fallback result
+        dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar[0] / convertToWT(src1[src1_index])); // the zero check is made on the raw T value, before any conversion
    }
 }
-__kernel void arithm_s_add_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-
-        ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-
-        short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-
-        int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2)));
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C4_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                  __global   float *dst,  int dst_step,  int dst_offset,
-                                  float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
-
-        float4 data = ARITHM_OP(src_data1, src2);
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_C4_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                  __global   double *dst,  int dst_step,  int dst_offset,
-                                  double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
-
-        double4 data = ARITHM_OP(src_data1, src2);
-
-        *((__global double4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
@ -51,561 +51,28 @@
 #endif
 #endif

-#ifdef ARITHM_ADD
-  #define ARITHM_OP(A,B) ((A)+(B))
-#elif defined ARITHM_SUB
-  #define ARITHM_OP(A,B) ((A)-(B))
-#endif
-/**************************************add with scalar with mask**************************************/
-__kernel void arithm_s_add_with_mask_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int mask_index_fix = mask_index < 0 ? 0 : mask_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
-        uchar4 mask_data = vload4(0, mask + mask_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(mask_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
-            mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar  *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
-        ushort2 tmp_data = convert_ushort2_sat(tmp);
+///////////////////////////////////////////////////////////////////////////////////
+//////////////////////////// Add with scalar with mask ////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////

-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
-        short2 tmp_data = convert_short2_sat(tmp);
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C1_D4 (__global   int   *src1, int src1_step, int src1_offset,
-                                            __global   int   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_binary_op_scalar_mask(__global T *src1, int src1_step, int src1_offset,
+                                     __global WT *scalar,
+                                     __global uchar *mask, int mask_step, int mask_offset,
+                                     __global T *dst,  int dst_step,  int dst_offset,
+                                     int cols, int rows)
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = src2.x;
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
-        data = mask_data ? data : dst_data;
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_add_with_mask_C1_D5 (__global   float   *src1, int src1_step, int src1_offset,
-                                            __global   float   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float src_data2 = src2.x;
-        float dst_data  = *((__global float *)((__global char *)dst  + dst_index));
-
-        float data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_with_mask_C1_D6 (__global   double   *src1, int src1_step, int src1_offset,
-                                            __global   double   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double src_data2 = src2.x;
-        double dst_data  = *((__global double *)((__global char *)dst  + dst_index));
-
-        double data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global double *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-__kernel void arithm_s_add_with_mask_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        int4   src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        int4  tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
-        ushort2 data = convert_ushort2_sat(tmp);
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
-        short2 data = convert_short2_sat(tmp);
-        data = mask_data ? data : dst_data;
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C2_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                            __global   float *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
-        float2 src_data2 = (float2)(src2.x, src2.y);
-        float2 dst_data  = *((__global float2 *)((__global char *)dst  + dst_index));
-
-        float2 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_with_mask_C2_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                            __global   double *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
-        double2 src_data2 = (double2)(src2.x, src2.y);
-        double2 dst_data  = *((__global double2 *)((__global char *)dst  + dst_index));
-
-        double2 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global double2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_s_add_with_mask_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-
-        uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-        data = mask_data ? data : dst_data;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst  + dst_index));
-
-        ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));
-
-        short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-        data = mask_data ? data : dst_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-        int4 dst_data  = *((__global int4 *)((__global char *)dst  + dst_index));
-
-        int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C4_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                            __global   float *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
-        float4 dst_data  = *((__global float4 *)((__global char *)dst  + dst_index));
-
-        float4 data = ARITHM_OP(src_data1, src2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_with_mask_C4_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                            __global   double *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
-        double4 dst_data  = *((__global double4 *)((__global char *)dst  + dst_index));
-
-        double4 data = ARITHM_OP(src_data1, src2);
-        data = mask_data ? data : dst_data;
+        int mask_index = mad24(y, mask_step, x + mask_offset);
+        if (mask[mask_index])
+        {
+            int src1_index = mad24(y, src1_step, x + src1_offset);
+            int dst_index = mad24(y, dst_step, dst_offset + x);

-        *((__global double4 *)((__global char *)dst + dst_index)) = data;
+            dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]);
+        }
    }
 }
-#endif
--- a/modules/ocl/src/opencl/arithm_bitwise_binary.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary.cl
@ -43,303 +43,25 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-//bitwise_binary without mask for and, or, xor operators

 /////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////bitwise_binary///////////////////////////////////////////
+/////////////////////////////////////////// bitwise_binary //////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////////////////

-#ifndef OP_BINARY
-#define OP_BINARY &
-#endif
-
-__kernel void arithm_bitwise_binary_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                     __global uchar *src2, int src2_step, int src2_offset,
-                                     __global uchar *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-__kernel void arithm_bitwise_binary_D1 (__global char *src1, int src1_step, int src1_offset,
-                                     __global char *src2, int src2_step, int src2_offset,
-                                     __global char *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        char4 src1_data = vload4(0, src1 + src1_index_fix);
-        char4 src2_data = vload4(0, src2 + src2_index_fix);
-
-        if(src1_index < 0)
-        {
-            char4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            char4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        char4 dst_data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global char4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-__kernel void arithm_bitwise_binary_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                     __global ushort *src2, int src2_step, int src2_offset,
-                                     __global ushort *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-
+__kernel void arithm_bitwise_binary(__global uchar * src1, int src1_step, int src1_offset, // Element-wise "src1 Operation src2" -> dst; each work-item handles the single uchar at (x, y).
+                                    __global uchar * src2, int src2_step, int src2_offset, // src2: second operand; each *_step is multiplied by y and each *_offset is added to x in the index math.
+                                    __global uchar * dst, int dst_step, int dst_offset, // dst: output buffer, indexed with the same y * step + x + offset scheme as the sources.
+                                    int cols1, int rows) // cols1 / rows: exclusive upper bounds for x / y. NOTE(review): cols1 is presumably the row width in bytes since all buffers are uchar - confirm against the host caller.
 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if (x < cols && y < rows)
+    if (x < cols1 && y < rows) // bounds guard: work-items outside the cols1 x rows range do nothing
    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
-
-        if(src1_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        ushort4 tmp_data = src1_data OP_BINARY src2_data;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_D3 (__global short *src1, int src1_step, int src1_offset,
-                                     __global short *src2, int src2_step, int src2_offset,
-                                     __global short *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
-
-        if(src1_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        short4 tmp_data = src1_data OP_BINARY src2_data;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_D4 (__global int *src1, int src1_step, int src1_offset,
-                                     __global int *src2, int src2_step, int src2_offset,
-                                     __global int *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int data2 = *((__global int *)((__global char *)src2 + src2_index));
-        int tmp  = data1 OP_BINARY data2;
-
-        *((__global int *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-
-__kernel void arithm_bitwise_binary_D5 (__global char *src1, int src1_step, int src1_offset,
-                                     __global char *src2, int src2_step, int src2_offset,
-                                     __global char *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index));
-        char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index));
-        char4 tmp = data1 OP_BINARY data2;
-
-        *((__global char4 *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_binary_D6 (__global char *src1, int src1_step, int src1_offset,
-                                     __global char *src2, int src2_step, int src2_offset,
-                                     __global char *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index));
-        char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index));
+        int src1_index = mad24(y, src1_step, x + src1_offset); // index = y * src1_step + (x + src1_offset), computed with the mad24 built-in
+        int src2_index = mad24(y, src2_step, x + src2_offset); // same scheme for the second operand
+        int dst_index = mad24(y, dst_step, dst_offset + x); // same scheme for the destination

-        *((__global char8 *)((__global char *)dst + dst_index)) = data1 OP_BINARY data2;
+        dst[dst_index] = src1[src1_index] Operation src2[src2_index]; // NOTE(review): `Operation` is not defined in this hunk - presumably a macro supplied through the kernel build options; confirm in the host code.
    }
 }
-#endif
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
@ -43,767 +43,31 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-#ifndef OP_BINARY
-#define OP_BINARY &
-#endif

 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////////////////////////////
-/**************************************bitwise_binary with mask**************************************/
-__kernel void arithm_bitwise_binary_with_mask_C1_D0 (
-        __global uchar *src1, int src1_step, int src1_offset,
-        __global uchar *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global uchar *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-        uchar4 mask_data = vload4(0, mask + mask_index);

-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D1 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, int src1_offset,
+                                    __global uchar * src2, int src2_step, int src2_offset,
+                                    __global uchar * mask, int mask_step, int mask_offset, int elemSize,
+                                    __global uchar * dst, int dst_step, int dst_offset,
+                                    int cols1, int rows)
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if (x < cols && y < rows)
+    if (x < cols1 && y < rows)
    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = vload4(0, src2 + src2_index);
-        uchar4 mask_data = vload4(0, mask + mask_index);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = convert_char((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = convert_char((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = convert_char((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = convert_char((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D2 (
-        __global ushort *src1, int src1_step, int src1_offset,
-        __global ushort *src2, int src2_step, int src2_offset,
-        __global uchar  *mask, int mask_step, int mask_offset,
-        __global ushort *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index));
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        ushort2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = convert_ushort((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = convert_ushort((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D3 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-        short2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D4 (
-        __global int   *src1, int src1_step, int src1_offset,
-        __global int   *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global int   *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        int data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D5 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char4 src_data1 = *((__global char4 *)((__global char *)src1 + src1_index));
-        char4 src_data2 = *((__global char4 *)((__global char *)src2 + src2_index));
-        char4 dst_data  = *((__global char4 *)((__global char *)dst  + dst_index));
-
-        char4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D6 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
-        char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index));
-        char8 dst_data  = *((__global char8 *)((__global char *)dst  + dst_index));
-
-        char8 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char8 *)((__global char *)dst + dst_index)) = data;
-    }
-
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C2_D0 (
-        __global uchar *src1, int src1_step, int src1_offset,
-        __global uchar *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global uchar *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_bitwise_binary_with_mask_C2_D1 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = vload4(0, src2 + src2_index);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_bitwise_binary_with_mask_C2_D2 (
-        __global ushort *src1, int src1_step, int src1_offset,
-        __global ushort *src2, int src2_step, int src2_offset,
-        __global uchar  *mask, int mask_step, int mask_offset,
-        __global ushort *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        ushort2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C2_D3 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
-
-        short2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C2_D4 (
-        __global int   *src1, int src1_step, int src1_offset,
-        __global int   *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global int    *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C2_D5 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
-        char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index));
-        char8 dst_data  = *((__global char8 *)((__global char *)dst  + dst_index));
-
-        char8 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char8 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_bitwise_binary_with_mask_C2_D6 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index));
-        char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index));
-        char16 dst_data  = *((__global char16 *)((__global char *)dst  + dst_index));
-
-        char16 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char16 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_bitwise_binary_with_mask_C4_D0 (
-        __global uchar *src1, int src1_step, int src1_offset,
-        __global uchar *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global uchar *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-        uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-
-        uchar4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_bitwise_binary_with_mask_C4_D1 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char4 src_data1 = *((__global char4 *)(src1 + src1_index));
-        char4 src_data2 = *((__global char4 *)(src2 + src2_index));
-        char4 dst_data  = *((__global char4 *)(dst  + dst_index));
-
-        char4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_bitwise_binary_with_mask_C4_D2 (
-        __global ushort *src1, int src1_step, int src1_offset,
-        __global ushort *src2, int src2_step, int src2_offset,
-        __global uchar  *mask, int mask_step, int mask_offset,
-        __global ushort *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-        ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
-        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst  + dst_index));
-
-        ushort4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C4_D3 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
-        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));
-
-        short4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C4_D4 (
-        __global int   *src1, int src1_step, int src1_offset,
-        __global int   *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global int   *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-        int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
-        int4 dst_data  = *((__global int4 *)((__global char *)dst  + dst_index));
-
-        int4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C4_D5 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index));
-        char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index));
-        char16 dst_data  = *((__global char16 *)((__global char *)dst  + dst_index));
-
-        char16 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char16 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_binary_with_mask_C4_D6 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar  *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 5) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char8 src_data1_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0));
-        char8 src_data1_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8));
-        char8 src_data1_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
-        char8 src_data1_3 = *((__global char8 *)((__global char *)src1 + src1_index + 24));
-
-        char8 src_data2_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0));
-        char8 src_data2_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8));
-        char8 src_data2_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
-        char8 src_data2_3 = *((__global char8 *)((__global char *)src2 + src2_index + 24));
-
-        char8 dst_data_0  = *((__global char8 *)((__global char *)dst  + dst_index + 0));
-        char8 dst_data_1  = *((__global char8 *)((__global char *)dst  + dst_index + 8));
-        char8 dst_data_2  = *((__global char8 *)((__global char *)dst  + dst_index + 16));
-        char8 dst_data_3  = *((__global char8 *)((__global char *)dst  + dst_index + 24));
-
-        char8 data_0 = src_data1_0 OP_BINARY src_data2_0;
-        char8 data_1 = src_data1_1 OP_BINARY src_data2_1;
-        char8 data_2 = src_data1_2 OP_BINARY src_data2_2;
-        char8 data_3 = src_data1_3 OP_BINARY src_data2_3;
+        int mask_index = mad24(y, mask_step, mask_offset + (x / elemSize));

-        data_0 = mask_data ? data_0 : dst_data_0;
-        data_1 = mask_data ? data_1 : dst_data_1;
-        data_2 = mask_data ? data_2 : dst_data_2;
-        data_3 = mask_data ? data_3 : dst_data_3;
+        if (mask[mask_index])
+        {
+            int src1_index = mad24(y, src1_step, x + src1_offset);
+            int src2_index = mad24(y, src2_step, x + src2_offset);
+            int dst_index = mad24(y, dst_step, x + dst_offset);

-        *((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
-        *((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;
-        *((__global char8 *)((__global char *)dst + dst_index + 16)) = data_2;
-        *((__global char8 *)((__global char *)dst + dst_index + 24)) = data_3;
+            dst[dst_index] = src1[src1_index] Operation src2[src2_index];
+        }
    }
 }
-#endif
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
@ -43,596 +43,26 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-#ifndef OP_BINARY
-#define OP_BINARY &
-#endif

 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////bitwise_binary/////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-/******************************bitwise binary with scalar without mask********************************/
-__kernel void arithm_s_bitwise_binary_C1_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_s_bitwise_binary_C1_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_C1_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort2 src2_data = (ushort2)(src2.x, src2.x);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        ushort2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C1_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);

-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        short2 src2_data = (short2)(src2.x, src2.x);
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-
-        short2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C1_D4 (
-        __global   int *src1, int src1_step, int src1_offset,
-        __global   int *dst,  int dst_step,  int dst_offset,
-        int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_binary_scalar(
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int elemSize,
+        __global uchar *dst, int dst_step, int dst_offset,
+        int cols1, int rows)
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if (x < cols && y < rows)
+    if (x < cols1 && y < rows)
    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = src2.x;
-
-        int data = src_data1 OP_BINARY src_data2;
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C1_D5 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index));
-        char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
-
-        char4 data  = *((__global char4 *)((__global char *)dst  + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global char4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_C1_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        short16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
-
-        short4 tmp_data = src1_data OP_BINARY src2_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-#endif
-__kernel void arithm_s_bitwise_binary_C2_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-
-        data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
-        data.zw = (dst_index + 2 <  dst_end  ) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_s_bitwise_binary_C2_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
-        data.zw = (dst_index + 2 <  dst_end  ) ? tmp_data.zw : data.zw;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_C2_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        ushort2 src_data2 = (ushort2)(src2.x, src2.y);
-
-        ushort2 data = src_data1 OP_BINARY src_data2;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C2_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        short2 src_data2 = (short2)(src2.x, src2.y);
-
-        short2 data = src_data1 OP_BINARY src_data2;
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C2_D4 (
-        __global   int *src1, int src1_step, int src1_offset,
-        __global   int *dst,  int dst_step,  int dst_offset,
-        int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-
-        int2 data = src_data1 OP_BINARY src_data2;
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C2_D5 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index));
-        char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
-
-        char8 tmp_data = src1_data OP_BINARY src2_data;
-
-        *((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_C2_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        short16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
-        short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
-
-        short8 tmp_data = src1_data OP_BINARY src2_data;
-
-        *((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-#endif
-
-__kernel void arithm_s_bitwise_binary_C4_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-
-        uchar4 data = src_data1 OP_BINARY src2;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_s_bitwise_binary_C4_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        char4 src_data1 = *((__global char4 *)(src1 + src1_index));
-
-        char4 data = src_data1 OP_BINARY src2;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_C4_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-
-        ushort4 data = src_data1 OP_BINARY src2;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C4_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-
-        short4 data = src_data1 OP_BINARY src2;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C4_D4 (
-        __global   int *src1, int src1_step, int src1_offset,
-        __global   int *dst,  int dst_step,  int dst_offset,
-        int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-
-        int4 data = src_data1 OP_BINARY src2;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C4_D5 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index));
-        char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
-                                    src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
-
-        char16 tmp_data = src1_data OP_BINARY src2_data;
-
-        *((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_C4_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        short16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0));
-        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8));
-        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
-        short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24));
-
-        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
-        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
-        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
-        short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
-
-        short4 tmp_data_0 = src1_data_0 OP_BINARY src2_data_0;
-        short4 tmp_data_1 = src1_data_1 OP_BINARY src2_data_1;
-        short4 tmp_data_2 = src1_data_2 OP_BINARY src2_data_2;
-        short4 tmp_data_3 = src1_data_3 OP_BINARY src2_data_3;
-
-        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
-        *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
+        int src1_index = mad24(y, src1_step, src1_offset + x);
+        int src2_index = x % elemSize;
+        int dst_index  = mad24(y, dst_step, dst_offset + x);

+        dst[dst_index] = src1[src1_index] Operation src2[src2_index];
    }
 }
-#endif
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
@ -42,6 +42,7 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #if defined (DOUBLE_SUPPORT)
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
@ -50,698 +51,29 @@
 #endif
 #endif

-#ifndef OP_BINARY
-#define OP_BINARY &
-#endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////////////////////////////
-/**************************************bitwise_binary with scalar with mask**************************************/
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
-        uchar4 mask_data = vload4(0, mask + mask_index);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
-        uchar4 mask_data = vload4(0, mask + mask_index);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        __global   uchar  *mask, int mask_step, int mask_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort2 src2_data = (ushort2)(src2.x, src2.x);
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        ushort2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        short2 src2_data = (short2)(src2.x, src2.x);
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-        short2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D4 (
-        __global   int   *src1, int src1_step, int src1_offset,
-        __global   int   *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = src2.x;
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        int data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D5 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index));
-        char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
-        char4 dst_data  = *((__global char4 *)((__global char *)dst  + dst_index));
-
-        char4 data = src1_data OP_BINARY src2_data;
-        data = mask_data ? data : dst_data;
-
-        *((__global char4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        short16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
-        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));
-
-        short4 data = src1_data OP_BINARY src2_data;
-        data = mask_data ? data : dst_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        ushort2 src_data2 = (ushort2)(src2.x, src2.y);
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        ushort2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

-        uchar mask_data = *(mask + mask_index);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        short2 src_data2 = (short2)(src2.x, src2.y);
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
-
-        short2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D4 (
-        __global   int *src1, int src1_step, int src1_offset,
-        __global   int *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D5 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global  char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index));
-        char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
-        char8 dst_data = *((__global char8 *)((__global char *)dst  + dst_index));
-
-        char8 data = src1_data OP_BINARY src2_data;
-
-        data = mask_data ? data : dst_data;
-
-        *((__global char8 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
+__kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int elemSize, // src2 is read at indices 0..elemSize-1 (see src2_index below)
        __global uchar *mask, int mask_step, int mask_offset,
-        short16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
-        short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
-        short8 dst_data = *((__global short8 *)((__global char *)dst  + dst_index));
-
-        short8 data = src1_data OP_BINARY src2_data;
-        data = mask_data ? data : dst_data;
-
-        *((__global short8 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int cols, int rows) // NOTE(review): cols is compared against the byte index x, so it is presumably a byte count -- confirm with the host code
 {
-
    int x = get_global_id(0); // used directly as a byte offset into the uchar buffers src1 and dst
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
+        int mask_index = mad24(y, mask_step, (x / elemSize) + mask_offset); // one mask entry per group of elemSize bytes
+        if (mask[mask_index]) // when the mask entry is zero, dst is not written at all
+        {
+            int src1_index = mad24(y, src1_step, x + src1_offset);
+            int src2_index = x % elemSize; // byte position inside the current elemSize-byte group
+            int dst_index = mad24(y, dst_step, x + dst_offset);

-        uchar mask_data = *(mask + mask_index);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-
-        uchar4 data = src_data1 OP_BINARY src2;
-        data = mask_data ? data : dst_data;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
+            dst[dst_index] = src1[src1_index] Operation src2[src2_index]; // Operation is not defined in this hunk; presumably a build-time macro
+        }
    }
 }
-
-
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char4 src_data1 = *((__global char4 *)(src1 + src1_index));
-        char4 dst_data  = *((__global char4 *)(dst  + dst_index));
-
-        char4 data = src_data1 OP_BINARY src2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst  + dst_index));
-
-        ushort4 data = src_data1 OP_BINARY src2;
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));
-
-        short4 data = src_data1 OP_BINARY src2;
-        data = mask_data ? data : dst_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D4 (
-        __global   int *src1, int src1_step, int src1_offset,
-        __global   int *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-        int4 dst_data  = *((__global int4 *)((__global char *)dst  + dst_index));
-
-        int4 data = src_data1 OP_BINARY src2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D5 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index));
-        char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
-                                    src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
-        char16 dst_data  = *((__global char16 *)((__global char *)dst  + dst_index));
-
-        char16 data = src1_data OP_BINARY src2_data;
-        data = mask_data ? data : dst_data;
-
-        *((__global char16 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        short16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0));
-        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8));
-        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
-        short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24));
-
-        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
-        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
-        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
-        short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
-
-        short4 dst_data_0  = *((__global short4 *)((__global char *)dst  + dst_index + 0));
-        short4 dst_data_1  = *((__global short4 *)((__global char *)dst  + dst_index + 8));
-        short4 dst_data_2  = *((__global short4 *)((__global char *)dst  + dst_index + 16));
-        short4 dst_data_3  = *((__global short4 *)((__global char *)dst  + dst_index + 24));
-
-        short4 data_0 = src1_data_0 OP_BINARY src2_data_0;
-        short4 data_1 = src1_data_1 OP_BINARY src2_data_1;
-        short4 data_2 = src1_data_2 OP_BINARY src2_data_2;
-        short4 data_3 = src1_data_3 OP_BINARY src2_data_3;
-
-        data_0 = mask_data ? data_0 : dst_data_0;
-        data_1 = mask_data ? data_1 : dst_data_1;
-        data_2 = mask_data ? data_2 : dst_data_2;
-        data_3 = mask_data ? data_3 : dst_data_3;
-
-        *((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;
-        *((__global short4 *)((__global char *)dst + dst_index + 8)) = data_1;
-        *((__global short4 *)((__global char *)dst + dst_index + 16)) = data_2;
-        *((__global short4 *)((__global char *)dst + dst_index + 24)) = data_3;
-    }
-}
-#endif
--- a/modules/ocl/src/opencl/arithm_compare.cl
+++ b/modules/ocl/src/opencl/arithm_compare.cl
@ -0,0 +1,74 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////compare////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Element-wise comparison kernel: each work-item handles one (x, y) position and
+// stores 255 in dst when `src1 Operation src2` holds there, 0 otherwise.
+// T (element type) and Operation (comparison operator) are not defined in this
+// file; presumably they are injected as build-time macros -- confirm with the host code.
+// The *_step1 / *_offset1 arguments are applied directly as element indices
+// (per-row stride and starting offset) into their respective buffers.
+__kernel void arithm_compare(__global T * src1, int src1_step1, int src1_offset1,
+                              __global T * src2, int src2_step1, int src2_offset1,
+                              __global uchar * dst, int dst_step1, int dst_offset1,
+                              int cols1, int rows)
+{
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+
+    // Work-items that fall outside the cols1 x rows region do nothing.
+    if (col >= cols1 || row >= rows)
+        return;
+
+    const T lhs = src1[mad24(row, src1_step1, col + src1_offset1)];
+    const T rhs = src2[mad24(row, src2_step1, col + src2_offset1)];
+
+    dst[mad24(row, dst_step1, col + dst_offset1)] = convert_uchar(lhs Operation rhs ? 255 : 0);
+}
--- a/modules/ocl/src/opencl/arithm_compare_eq.cl
+++ b/modules/ocl/src/opencl/arithm_compare_eq.cl
--- a/modules/ocl/src/opencl/arithm_compare_ne.cl
+++ b/modules/ocl/src/opencl/arithm_compare_ne.cl
--- a/Show More
+++ b/Show More