Merge remote-tracking branch 'github/master' into pullreq/140319-PyrLKOpticalFlow

Conflicts: modules/video/src/opencl/pyrlk.cl
11 years ago · 4ca695cab0
parent 65d64af2a8 5a3200d513
commit 4ca695cab0
163 changed files with 3041 additions and 2252 deletions
--- a/3rdparty/jinja2/markupsafe/init.py
+++ b/3rdparty/jinja2/markupsafe/init.py
@ -9,7 +9,7 @@
    :license: BSD, see LICENSE for more details.
 """
 import re
-from _compat import text_type, string_types, int_types, \
+from ._compat import text_type, string_types, int_types, \
     unichr, PY2


@ -227,7 +227,7 @@ class _MarkupEscapeHelper(object):
 try:
    from _speedups import escape, escape_silent, soft_unicode
 except ImportError:
-    from _native import escape, escape_silent, soft_unicode
+    from ._native import escape, escape_silent, soft_unicode

 if not PY2:
    soft_str = soft_unicode
--- a/3rdparty/jinja2/markupsafe/_native.py
+++ b/3rdparty/jinja2/markupsafe/_native.py
@ -8,7 +8,7 @@
    :copyright: (c) 2010 by Armin Ronacher.
    :license: BSD, see LICENSE for more details.
 """
-from _compat import text_type
+from ._compat import text_type


 def escape(s):
--- a/3rdparty/jinja2/utils.py
+++ b/3rdparty/jinja2/utils.py
@ -517,4 +517,4 @@ class Joiner(object):


 # Imported here because that's where it was in the past
-from markupsafe import Markup, escape, soft_unicode
+from .markupsafe import Markup, escape, soft_unicode
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.3.0.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.3.0.so
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.4.0.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.4.0.so
--- a/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so
+++ b/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so
--- a/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so
+++ b/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so
--- a/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so
+++ b/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so
--- a/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so
+++ b/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so
--- a/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so
+++ b/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so
--- a/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so
+++ b/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so
--- a/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so
+++ b/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so
--- a/3rdparty/lib/armeabi/libnative_camera_r4.3.0.so
+++ b/3rdparty/lib/armeabi/libnative_camera_r4.3.0.so
--- a/3rdparty/lib/armeabi/libnative_camera_r4.4.0.so
+++ b/3rdparty/lib/armeabi/libnative_camera_r4.4.0.so
--- a/3rdparty/lib/mips/libnative_camera_r4.0.3.so
+++ b/3rdparty/lib/mips/libnative_camera_r4.0.3.so
--- a/3rdparty/lib/mips/libnative_camera_r4.1.1.so
+++ b/3rdparty/lib/mips/libnative_camera_r4.1.1.so
--- a/3rdparty/lib/mips/libnative_camera_r4.2.0.so
+++ b/3rdparty/lib/mips/libnative_camera_r4.2.0.so
--- a/3rdparty/lib/mips/libnative_camera_r4.3.0.so
+++ b/3rdparty/lib/mips/libnative_camera_r4.3.0.so
--- a/3rdparty/lib/mips/libnative_camera_r4.4.0.so
+++ b/3rdparty/lib/mips/libnative_camera_r4.4.0.so
--- a/3rdparty/lib/x86/libnative_camera_r2.3.3.so
+++ b/3rdparty/lib/x86/libnative_camera_r2.3.3.so
--- a/3rdparty/lib/x86/libnative_camera_r3.0.1.so
+++ b/3rdparty/lib/x86/libnative_camera_r3.0.1.so
--- a/3rdparty/lib/x86/libnative_camera_r4.0.3.so
+++ b/3rdparty/lib/x86/libnative_camera_r4.0.3.so
--- a/3rdparty/lib/x86/libnative_camera_r4.1.1.so
+++ b/3rdparty/lib/x86/libnative_camera_r4.1.1.so
--- a/3rdparty/lib/x86/libnative_camera_r4.2.0.so
+++ b/3rdparty/lib/x86/libnative_camera_r4.2.0.so
--- a/3rdparty/lib/x86/libnative_camera_r4.3.0.so
+++ b/3rdparty/lib/x86/libnative_camera_r4.3.0.so
--- a/3rdparty/lib/x86/libnative_camera_r4.4.0.so
+++ b/3rdparty/lib/x86/libnative_camera_r4.4.0.so
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -438,7 +438,6 @@ include(cmake/OpenCVFindLibsGUI.cmake)
 include(cmake/OpenCVFindLibsVideo.cmake)
 include(cmake/OpenCVFindLibsPerf.cmake)

-
 # ----------------------------------------------------------------------------
 #  Detect other 3rd-party libraries/tools
 # ----------------------------------------------------------------------------
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@ -217,3 +217,42 @@ else()
  unset(CUDA_ARCH_BIN CACHE)
  unset(CUDA_ARCH_PTX CACHE)
 endif()
+
+if(HAVE_CUDA)
+  set(CUDA_LIBS_PATH "")
+  foreach(p ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+    get_filename_component(_tmp ${p} PATH)
+    list(APPEND CUDA_LIBS_PATH ${_tmp})
+  endforeach()
+
+  if(HAVE_CUBLAS)
+    foreach(p ${CUDA_cublas_LIBRARY})
+      get_filename_component(_tmp ${p} PATH)
+      list(APPEND CUDA_LIBS_PATH ${_tmp})
+    endforeach()
+  endif()
+
+  if(HAVE_CUFFT)
+    foreach(p ${CUDA_cufft_LIBRARY})
+      get_filename_component(_tmp ${p} PATH)
+      list(APPEND CUDA_LIBS_PATH ${_tmp})
+    endforeach()
+  endif()
+
+  list(REMOVE_DUPLICATES CUDA_LIBS_PATH)
+  link_directories(${CUDA_LIBS_PATH})
+
+  set(CUDA_LIBRARIES_ABS ${CUDA_LIBRARIES})
+  ocv_convert_to_lib_name(CUDA_LIBRARIES ${CUDA_LIBRARIES})
+  set(CUDA_npp_LIBRARY_ABS ${CUDA_npp_LIBRARY})
+  ocv_convert_to_lib_name(CUDA_npp_LIBRARY ${CUDA_npp_LIBRARY})
+  if(HAVE_CUBLAS)
+    set(CUDA_cublas_LIBRARY_ABS ${CUDA_cublas_LIBRARY})
+    ocv_convert_to_lib_name(CUDA_cublas_LIBRARY ${CUDA_cublas_LIBRARY})
+  endif()
+
+  if(HAVE_CUFFT)
+    set(CUDA_cufft_LIBRARY_ABS ${CUDA_cufft_LIBRARY})
+    ocv_convert_to_lib_name(CUDA_cufft_LIBRARY ${CUDA_cufft_LIBRARY})
+  endif()
+endif()
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@ -27,7 +27,8 @@
 # The verbose template for OpenCV module:
 #
 #   ocv_add_module(modname <dependencies>)
-#   ocv_glob_module_sources() or glob them manually and ocv_set_module_sources(...)
+#   ocv_glob_module_sources([EXCLUDE_CUDA] <extra sources&headers>)
+#                          or glob them manually and ocv_set_module_sources(...)
 #   ocv_module_include_directories(<extra include directories>)
 #   ocv_create_module()
 #   <add extra link dependencies, compiler options, etc>
@ -478,9 +479,15 @@ endmacro()

 # finds and sets headers and sources for the standard OpenCV module
 # Usage:
-# ocv_glob_module_sources(<extra sources&headers in the same format as used in ocv_set_module_sources>)
+# ocv_glob_module_sources([EXCLUDE_CUDA] <extra sources&headers in the same format as used in ocv_set_module_sources>)
 macro(ocv_glob_module_sources)
-  file(GLOB_RECURSE lib_srcs     "src/*.cpp")
+  set(_argn ${ARGN})
+  list(FIND _argn "EXCLUDE_CUDA" exclude_cuda)
+  if(NOT exclude_cuda EQUAL -1)
+    list(REMOVE_AT _argn ${exclude_cuda})
+  endif()
+
+  file(GLOB_RECURSE lib_srcs "src/*.cpp")
  file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h")
  file(GLOB lib_hdrs     "include/opencv2/*.hpp" "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
  file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
@ -492,15 +499,21 @@ macro(ocv_glob_module_sources)
  ocv_source_group("Src" DIRBASE "${CMAKE_CURRENT_SOURCE_DIR}/src" FILES ${lib_srcs} ${lib_int_hdrs})
  ocv_source_group("Include" DIRBASE "${CMAKE_CURRENT_SOURCE_DIR}/include" FILES ${lib_hdrs} ${lib_hdrs_detail})

-  file(GLOB lib_cuda_srcs "src/cuda/*.cu")
-  set(cuda_objs "")
-  set(lib_cuda_hdrs "")
-  if(HAVE_CUDA AND lib_cuda_srcs)
-    ocv_include_directories(${CUDA_INCLUDE_DIRS})
-    file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
+  if (exclude_cuda EQUAL -1)
+    file(GLOB lib_cuda_srcs "src/cuda/*.cu")
+    set(cuda_objs "")
+    set(lib_cuda_hdrs "")
+    if(HAVE_CUDA)
+      ocv_include_directories(${CUDA_INCLUDE_DIRS})
+      file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")

-    ocv_cuda_compile(cuda_objs ${lib_cuda_srcs} ${lib_cuda_hdrs})
-    source_group("Src\\Cuda" FILES ${lib_cuda_srcs} ${lib_cuda_hdrs})
+      ocv_cuda_compile(cuda_objs ${lib_cuda_srcs} ${lib_cuda_hdrs})
+      source_group("Src\\Cuda"      FILES ${lib_cuda_srcs} ${lib_cuda_hdrs})
+    endif()
+  else()
+    set(cuda_objs "")
+    set(lib_cuda_srcs "")
+    set(lib_cuda_hdrs "")
  endif()

  file(GLOB cl_kernels "src/opencl/*.cl")
@ -516,8 +529,8 @@ macro(ocv_glob_module_sources)
    list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
  endif()

-  ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail}
-                                 SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_srcs} ${lib_cuda_hdrs})
+  ocv_set_module_sources(${_argn} HEADERS ${lib_hdrs} ${lib_hdrs_detail}
+                         SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_srcs} ${lib_cuda_hdrs})
 endmacro()

 # creates OpenCV module in current folder
@ -622,11 +635,20 @@ endmacro()
 # short command for adding simple OpenCV module
 # see ocv_add_module for argument details
 # Usage:
-# ocv_define_module(module_name  [INTERNAL] [REQUIRED] [<list of dependencies>] [OPTIONAL <list of optional dependencies>])
+# ocv_define_module(module_name  [INTERNAL] [EXCLUDE_CUDA] [REQUIRED] [<list of dependencies>] [OPTIONAL <list of optional dependencies>])
 macro(ocv_define_module module_name)
-  ocv_add_module(${module_name} ${ARGN})
+  set(_argn ${ARGN})
+  set(exclude_cuda "")
+  foreach(arg ${_argn})
+    if("${arg}" STREQUAL "EXCLUDE_CUDA")
+      set(exclude_cuda "${arg}")
+      list(REMOVE_ITEM _argn ${arg})
+    endif()
+  endforeach()
+
+  ocv_add_module(${module_name} ${_argn})
  ocv_module_include_directories()
-  ocv_glob_module_sources()
+  ocv_glob_module_sources(${exclude_cuda})
  ocv_create_module()
  ocv_add_precompiled_headers(${the_module})

--- a/cmake/templates/OpenCVConfig.cmake.in
+++ b/cmake/templates/OpenCVConfig.cmake.in
@ -19,8 +19,8 @@
 #    This file will define the following variables:
 #      - OpenCV_LIBS                     : The list of all imported targets for OpenCV modules.
 #      - OpenCV_INCLUDE_DIRS             : The OpenCV include directories.
-#      - OpenCV_COMPUTE_CAPABILITIES     : The version of compute capability
-#      - OpenCV_ANDROID_NATIVE_API_LEVEL : Minimum required level of Android API
+#      - OpenCV_COMPUTE_CAPABILITIES     : The version of compute capability.
+#      - OpenCV_ANDROID_NATIVE_API_LEVEL : Minimum required level of Android API.
 #      - OpenCV_VERSION                  : The version of this OpenCV build: "@OPENCV_VERSION_PLAIN@"
 #      - OpenCV_VERSION_MAJOR            : Major version part of OpenCV_VERSION: "@OPENCV_VERSION_MAJOR@"
 #      - OpenCV_VERSION_MINOR            : Minor version part of OpenCV_VERSION: "@OPENCV_VERSION_MINOR@"
@ -28,25 +28,29 @@
 #      - OpenCV_VERSION_STATUS           : Development status of this build: "@OPENCV_VERSION_STATUS@"
 #
 #    Advanced variables:
-#      - OpenCV_SHARED
-#      - OpenCV_CONFIG_PATH
-#      - OpenCV_INSTALL_PATH  (not set on Windows)
-#      - OpenCV_LIB_COMPONENTS
-#      - OpenCV_USE_MANGLED_PATHS
-#      - OpenCV_HAVE_ANDROID_CAMERA
+#      - OpenCV_SHARED                   : Use OpenCV as shared library
+#      - OpenCV_CONFIG_PATH              : Path to this OpenCVConfig.cmake
+#      - OpenCV_INSTALL_PATH             : OpenCV location (not set on Windows)
+#      - OpenCV_LIB_COMPONENTS           : Present OpenCV modules list
+#      - OpenCV_USE_MANGLED_PATHS        : Mangled OpenCV path flag
+#      - OpenCV_MODULES_SUFFIX           : The suffix for OpenCVModules-XXX.cmake file
+#      - OpenCV_HAVE_ANDROID_CAMERA      : Presence of Android native camera wrappers
 #
 #    Deprecated variables:
 #      - OpenCV_VERSION_TWEAK            : Always "0"
 #
 # ===================================================================================

-set(modules_file_suffix "")
-if(ANDROID)
-  string(REPLACE - _ modules_file_suffix "_${ANDROID_NDK_ABI_NAME}")
+if(NOT DEFINED OpenCV_MODULES_SUFFIX)
+  if(ANDROID)
+    string(REPLACE - _ OpenCV_MODULES_SUFFIX "_${ANDROID_NDK_ABI_NAME}")
+  else()
+    set(OpenCV_MODULES_SUFFIX "")
+  endif()
 endif()

 if(NOT TARGET opencv_core)
-  include(${CMAKE_CURRENT_LIST_DIR}/OpenCVModules${modules_file_suffix}.cmake)
+  include(${CMAKE_CURRENT_LIST_DIR}/OpenCVModules${OpenCV_MODULES_SUFFIX}.cmake)
 endif()

 # TODO All things below should be reviewed. What is about of moving this code into related modules (special vars/hooks/files)
@ -209,7 +213,7 @@ foreach(__opttype OPT DBG)
  SET(OpenCV_EXTRA_LIBS_${__opttype} "")

  # CUDA
-  if(OpenCV_CUDA_VERSION AND (CMAKE_CROSSCOMPILING OR (WIN32 AND NOT OpenCV_SHARED)))
+  if(OpenCV_CUDA_VERSION)
    if(NOT CUDA_FOUND)
      find_package(CUDA ${OpenCV_CUDA_VERSION} EXACT REQUIRED)
    else()
@ -218,32 +222,41 @@ foreach(__opttype OPT DBG)
      endif()
    endif()

-    list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_LIBRARIES})
+    set(OpenCV_CUDA_LIBS_ABSPATH ${CUDA_LIBRARIES})

    if(${CUDA_VERSION} VERSION_LESS "5.5")
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_npp_LIBRARY})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_npp_LIBRARY})
    else()
      find_cuda_helper_libs(nppc)
      find_cuda_helper_libs(nppi)
      find_cuda_helper_libs(npps)
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nppc_LIBRARY} ${CUDA_nppi_LIBRARY} ${CUDA_npps_LIBRARY})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_nppc_LIBRARY} ${CUDA_nppi_LIBRARY} ${CUDA_npps_LIBRARY})
    endif()

    if(OpenCV_USE_CUBLAS)
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUBLAS_LIBRARIES})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_CUBLAS_LIBRARIES})
    endif()

    if(OpenCV_USE_CUFFT)
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUFFT_LIBRARIES})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_CUFFT_LIBRARIES})
    endif()

    if(OpenCV_USE_NVCUVID)
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvid_LIBRARIES})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_nvcuvid_LIBRARIES})
    endif()

    if(WIN32)
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvenc_LIBRARIES})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_nvcuvenc_LIBRARIES})
    endif()
+
+    set(OpenCV_CUDA_LIBS_RELPATH "")
+    foreach(l ${OpenCV_CUDA_LIBS_ABSPATH})
+      get_filename_component(_tmp ${l} PATH)
+      list(APPEND OpenCV_CUDA_LIBS_RELPATH ${_tmp})
+    endforeach()
+
+    list(REMOVE_DUPLICATES OpenCV_CUDA_LIBS_RELPATH)
+    link_directories(${OpenCV_CUDA_LIBS_RELPATH})
  endif()
 endforeach()

--- a/doc/conf.py
+++ b/doc/conf.py
@ -304,11 +304,11 @@ extlinks = {
            'oldbasicstructures' : ('http://docs.opencv.org/modules/core/doc/old_basic_structures.html#%s', None),
            'readwriteimagevideo' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html#%s', None),
            'operationsonarrays' : ('http://docs.opencv.org/modules/core/doc/operations_on_arrays.html#%s', None),
-            'utilitysystemfunctions':('http://docs.opencv.org/modules/core/doc/utility_and_system_functions_and_macros.html#%s', None),
-            'imgprocfilter':('http://docs.opencv.org/modules/imgproc/doc/filtering.html#%s', None),
-            'svms':('http://docs.opencv.org/modules/ml/doc/support_vector_machines.html#%s', None),
-            'drawingfunc':('http://docs.opencv.org/modules/core/doc/drawing_functions.html#%s', None),
-            'xmlymlpers':('http://docs.opencv.org/modules/core/doc/xml_yaml_persistence.html#%s', None),
+            'utilitysystemfunctions' : ('http://docs.opencv.org/modules/core/doc/utility_and_system_functions_and_macros.html#%s', None),
+            'imgprocfilter' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html#%s', None),
+            'svms' : ('http://docs.opencv.org/modules/ml/doc/support_vector_machines.html#%s', None),
+            'drawingfunc' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#%s', None),
+            'xmlymlpers' : ('http://docs.opencv.org/modules/core/doc/xml_yaml_persistence.html#%s', None),
            'hgvideo' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html#%s', None),
            'gpuinit' : ('http://docs.opencv.org/modules/gpu/doc/initalization_and_information.html#%s', None),
            'gpudatastructure' : ('http://docs.opencv.org/modules/gpu/doc/data_structures.html#%s', None),
@ -316,56 +316,58 @@ extlinks = {
            'gpuperelement' : ('http://docs.opencv.org/modules/gpu/doc/per_element_operations.html#%s', None),
            'gpuimgproc' : ('http://docs.opencv.org/modules/gpu/doc/image_processing.html#%s', None),
            'gpumatrixreduct' : ('http://docs.opencv.org/modules/gpu/doc/matrix_reductions.html#%s', None),
-            'filtering':('http://docs.opencv.org/modules/imgproc/doc/filtering.html#%s', None),
+            'filtering' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html#%s', None),
            'flann' : ('http://docs.opencv.org/modules/flann/doc/flann_fast_approximate_nearest_neighbor_search.html#%s', None ),
            'calib3d' : ('http://docs.opencv.org/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.html#%s', None ),
            'feature2d' : ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html#%s', None ),
            'imgproc_geometric' : ('http://docs.opencv.org/modules/imgproc/doc/geometric_transformations.html#%s', None ),
+            'miscellaneous_transformations' : ('http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html#%s', None),
+            'user_interface' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html#%s', None),

            # 'opencv_group' : ('http://answers.opencv.org/%s', None),
            'opencv_qa' : ('http://answers.opencv.org/%s', None),
            'how_to_contribute' : ('http://code.opencv.org/projects/opencv/wiki/How_to_contribute/%s', None),

-            'cvt_color': ('http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=cvtcolor#cvtcolor%s', None),
-            'imread':    ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html?highlight=imread#imread%s', None),
-            'imwrite':   ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html?highlight=imwrite#imwrite%s', None),
-            'imshow':    ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=imshow#imshow%s', None),
-            'named_window': ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=namedwindow#namedwindow%s', None),
-            'wait_key': ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=waitkey#waitkey%s', None),
-            'add_weighted': ('http://docs.opencv.org/modules/core/doc/operations_on_arrays.html?highlight=addweighted#addweighted%s', None),
-            'saturate_cast': ('http://docs.opencv.org/modules/core/doc/utility_and_system_functions_and_macros.html?highlight=saturate_cast#saturate-cast%s', None),
-            'mat_zeros': ('http://docs.opencv.org/modules/core/doc/basic_structures.html?highlight=zeros#mat-zeros%s', None),
-            'convert_to': ('http://docs.opencv.org/modules/core/doc/basic_structures.html#mat-convertto%s', None),
-            'create_trackbar': ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=createtrackbar#createtrackbar%s', None),
-            'point': ('http://docs.opencv.org/modules/core/doc/basic_structures.html#point%s', None),
-            'scalar': ('http://docs.opencv.org/modules/core/doc/basic_structures.html#scalar%s', None),
-            'line': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#line%s', None),
-            'ellipse': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#ellipse%s', None),
-            'rectangle': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#rectangle%s', None),
-            'circle': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#circle%s', None),
-            'fill_poly': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#fillpoly%s', None),
-            'rng': ('http://docs.opencv.org/modules/core/doc/operations_on_arrays.html?highlight=rng#rng%s', None),
-            'put_text': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#puttext%s', None),
-            'gaussian_blur': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=gaussianblur#gaussianblur%s', None),
-            'blur': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=blur#blur%s', None),
-            'median_blur': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=medianblur#medianblur%s', None),
-            'bilateral_filter': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=bilateralfilter#bilateralfilter%s', None),
-            'erode': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=erode#erode%s', None),
-            'dilate': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=dilate#dilate%s', None),
-            'get_structuring_element': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=getstructuringelement#getstructuringelement%s', None),
-            'flood_fill': ( 'http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=floodfill#floodfill%s', None),
-            'morphology_ex': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=morphologyex#morphologyex%s', None),
-            'pyr_down': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=pyrdown#pyrdown%s', None),
-            'pyr_up': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=pyrup#pyrup%s', None),
-            'resize': ('http://docs.opencv.org/modules/imgproc/doc/geometric_transformations.html?highlight=resize#resize%s', None),
-            'threshold': ('http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=threshold#threshold%s', None),
-            'filter2d': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=filter2d#filter2d%s', None),
-            'copy_make_border': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=copymakeborder#copymakeborder%s', None),
-            'sobel': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=sobel#sobel%s', None),
-            'scharr': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=scharr#scharr%s', None),
-            'laplacian': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=laplacian#laplacian%s', None),
-            'canny': ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html?highlight=canny#canny%s', None),
-            'copy_to': ('http://docs.opencv.org/modules/core/doc/basic_structures.html?highlight=copyto#mat-copyto%s', None),
+            'cvt_color' : ('http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=cvtcolor#cvtcolor%s', None),
+            'imread' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html?highlight=imread#imread%s', None),
+            'imwrite' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html?highlight=imwrite#imwrite%s', None),
+            'imshow' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=imshow#imshow%s', None),
+            'named_window' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=namedwindow#namedwindow%s', None),
+            'wait_key' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=waitkey#waitkey%s', None),
+            'add_weighted' : ('http://docs.opencv.org/modules/core/doc/operations_on_arrays.html?highlight=addweighted#addweighted%s', None),
+            'saturate_cast' : ('http://docs.opencv.org/modules/core/doc/utility_and_system_functions_and_macros.html?highlight=saturate_cast#saturate-cast%s', None),
+            'mat_zeros' : ('http://docs.opencv.org/modules/core/doc/basic_structures.html?highlight=zeros#mat-zeros%s', None),
+            'convert_to' : ('http://docs.opencv.org/modules/core/doc/basic_structures.html#mat-convertto%s', None),
+            'create_trackbar' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=createtrackbar#createtrackbar%s', None),
+            'point' : ('http://docs.opencv.org/modules/core/doc/basic_structures.html#point%s', None),
+            'scalar' : ('http://docs.opencv.org/modules/core/doc/basic_structures.html#scalar%s', None),
+            'line' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#line%s', None),
+            'ellipse' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#ellipse%s', None),
+            'rectangle' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#rectangle%s', None),
+            'circle' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#circle%s', None),
+            'fill_poly' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#fillpoly%s', None),
+            'rng' : ('http://docs.opencv.org/modules/core/doc/operations_on_arrays.html?highlight=rng#rng%s', None),
+            'put_text' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#puttext%s', None),
+            'gaussian_blur' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=gaussianblur#gaussianblur%s', None),
+            'blur' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=blur#blur%s', None),
+            'median_blur' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=medianblur#medianblur%s', None),
+            'bilateral_filter' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=bilateralfilter#bilateralfilter%s', None),
+            'erode' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=erode#erode%s', None),
+            'dilate' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=dilate#dilate%s', None),
+            'get_structuring_element' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=getstructuringelement#getstructuringelement%s', None),
+            'flood_fill' : ( 'http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=floodfill#floodfill%s', None),
+            'morphology_ex' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=morphologyex#morphologyex%s', None),
+            'pyr_down' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=pyrdown#pyrdown%s', None),
+            'pyr_up' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=pyrup#pyrup%s', None),
+            'resize' : ('http://docs.opencv.org/modules/imgproc/doc/geometric_transformations.html?highlight=resize#resize%s', None),
+            'threshold' : ('http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=threshold#threshold%s', None),
+            'filter2d' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=filter2d#filter2d%s', None),
+            'copy_make_border' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=copymakeborder#copymakeborder%s', None),
+            'sobel' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=sobel#sobel%s', None),
+            'scharr' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=scharr#scharr%s', None),
+            'laplacian' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=laplacian#laplacian%s', None),
+            'canny' : ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html?highlight=canny#canny%s', None),
+            'copy_to' : ('http://docs.opencv.org/modules/core/doc/basic_structures.html?highlight=copyto#mat-copyto%s', None),
            'hough_lines' : ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html?highlight=houghlines#houghlines%s', None),
            'hough_lines_p' : ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html?highlight=houghlinesp#houghlinesp%s', None),
            'hough_circles' : ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html?highlight=houghcircles#houghcircles%s', None),
--- a/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.rst
+++ b/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.rst
@ -123,7 +123,7 @@ Let (x,y) be the top-left coordinate of the rectangle and (w,h) be its width and

 7.b. Rotated Rectangle
 -----------------------
-Here, bounding rectangle is drawn with minimum area, so it considers the rotation also. The function used is **cv2.minAreaRect()**. It returns a Box2D structure which contains following detals - ( top-left corner(x,y), (width, height), angle of rotation ). But to draw this rectangle, we need 4 corners of the rectangle. It is obtained by the function **cv2.boxPoints()**
+Here, bounding rectangle is drawn with minimum area, so it considers the rotation also. The function used is **cv2.minAreaRect()**. It returns a Box2D structure which contains the following details - ( center (x,y), (width, height), angle of rotation ). But to draw this rectangle, we need 4 corners of the rectangle. It is obtained by the function **cv2.boxPoints()**
 ::

    rect = cv2.minAreaRect(cnt)
--- a/doc/py_tutorials/py_setup/py_intro/py_intro.rst
+++ b/doc/py_tutorials/py_setup/py_intro/py_intro.rst
@ -7,45 +7,41 @@ Introduction to OpenCV-Python Tutorials
 OpenCV
 ===============

-OpenCV was started at Intel in 1999 by **Gary Bradsky** and the first release came out in 2000. **Vadim Pisarevsky** joined Gary Bradsky to manage Intel's Russian software OpenCV team. In 2005, OpenCV was used on Stanley, the vehicle who won 2005 DARPA Grand Challenge. Later its active development continued under the support of Willow Garage, with Gary Bradsky and Vadim Pisarevsky leading the project. Right now, OpenCV supports a lot of algorithms related to Computer Vision and Machine Learning and it is expanding day-by-day.
+OpenCV was started at Intel in 1999 by **Gary Bradsky**, and the first release came out in 2000. **Vadim Pisarevsky** joined Gary Bradsky to manage Intel's Russian software OpenCV team. In 2005, OpenCV was used on Stanley, the vehicle that won the 2005 DARPA Grand Challenge. Later, its active development continued under the support of Willow Garage with Gary Bradsky and Vadim Pisarevsky leading the project. OpenCV now supports a multitude of algorithms related to Computer Vision and Machine Learning and is expanding day by day.

-Currently OpenCV supports a wide variety of programming languages like C++, Python, Java etc and is available on different platforms including Windows, Linux, OS X, Android, iOS etc. Also, interfaces based on CUDA and OpenCL are also under active development for high-speed GPU operations.
+OpenCV supports a wide variety of programming languages such as C++, Python, Java, etc., and is available on different platforms including Windows, Linux, OS X, Android, and iOS. Interfaces for high-speed GPU operations based on CUDA and OpenCL are also under active development.

-OpenCV-Python is the Python API of OpenCV. It combines the best qualities of OpenCV C++ API and Python language.
+OpenCV-Python is the Python API for OpenCV, combining the best qualities of the OpenCV C++ API and the Python language.


 OpenCV-Python
 ===============

-Python is a general purpose programming language started by **Guido van Rossum**, which became very popular in short time mainly because of its simplicity and code readability. It enables the programmer to express his ideas in fewer lines of code without reducing any readability.
+OpenCV-Python is a library of Python bindings designed to solve computer vision problems.

-Compared to other languages like C/C++, Python is slower. But another important feature of Python is that it can be easily extended with C/C++. This feature helps us to write computationally intensive codes in C/C++ and create a Python wrapper for it so that we can use these wrappers as Python modules. This gives us two advantages: first, our code is as fast as original C/C++ code (since it is the actual C++ code working in background) and second, it is very easy to code in Python. This is how OpenCV-Python works, it is a Python wrapper around original C++ implementation.
+Python is a general purpose programming language started by **Guido van Rossum** that became very popular very quickly, mainly because of its simplicity and code readability. It enables the programmer to express ideas in fewer lines of code without reducing readability.

-And the support of Numpy makes the task more easier. **Numpy** is a highly optimized library for numerical operations. It gives a MATLAB-style syntax. All the OpenCV array structures are converted to-and-from Numpy arrays. So whatever operations you can do in Numpy, you can combine it with OpenCV, which increases number of weapons in your arsenal. Besides that, several other libraries like SciPy, Matplotlib which supports Numpy can be used with this.
+Compared to languages like C/C++, Python is slower. That said, Python can be easily extended with C/C++, which allows us to write computationally intensive code in C/C++ and create Python wrappers that can be used as Python modules. This gives us two advantages: first, the code is as fast as the original C/C++ code (since it is the actual C++ code working in the background) and second, it is easier to code in Python than C/C++. OpenCV-Python is a Python wrapper for the original OpenCV C++ implementation.

-So OpenCV-Python is an appropriate tool for fast prototyping of computer vision problems.
+OpenCV-Python makes use of **Numpy**, which is a highly optimized library for numerical operations with a MATLAB-style syntax. All the OpenCV array structures are converted to and from Numpy arrays. This also makes it easier to integrate with other libraries that use Numpy such as SciPy and Matplotlib.


 OpenCV-Python Tutorials
 =============================

-OpenCV introduces a new set of tutorials which will guide you through various functions available in OpenCV-Python. **This guide is mainly focused on OpenCV 3.x version** (although most of the tutorials will work with OpenCV 2.x also).
+OpenCV introduces a new set of tutorials which will guide you through various functions available in OpenCV-Python. **This guide is mainly focused on OpenCV 3.x version** (although most of the tutorials will also work with OpenCV 2.x).

-A prior knowledge on Python and Numpy is required before starting because they won't be covered in this guide. **Especially, a good knowledge on Numpy is must to write optimized codes in OpenCV-Python.**
+Prior knowledge of Python and Numpy is recommended as they won't be covered in this guide. **Proficiency with Numpy is a must in order to write optimized code using OpenCV-Python.**

-This tutorial has been started by *Abid Rahman K.* as part of Google Summer of Code 2013 program, under the guidance of *Alexander Mordvintsev*.
+This tutorial was originally started by *Abid Rahman K.* as part of the Google Summer of Code 2013 program under the guidance of *Alexander Mordvintsev*.


 OpenCV Needs You !!!
 ==========================

-Since OpenCV is an open source initiative, all are welcome to make contributions to this library. And it is same for this tutorial also.
+Since OpenCV is an open source initiative, all are welcome to make contributions to the library, documentation, and tutorials. If you find any mistake in this tutorial (from a small spelling mistake to an egregious error in code or concept), feel free to correct it by cloning OpenCV in `GitHub <https://github.com/Itseez/opencv>`_ and submitting a pull request. OpenCV developers will check your pull request, give you important feedback and (once it passes the approval of the reviewer) it will be merged into OpenCV. You will then become an open source contributor :-)

-So, if you find any mistake in this tutorial (whether it be a small spelling mistake or a big error in code or concepts, whatever), feel free to correct it.
-
-And that will be a good task for freshers who begin to contribute to open source projects. Just fork the OpenCV in github, make necessary corrections and send a pull request to OpenCV. OpenCV developers will check your pull request, give you important feedback and once it passes the approval of the reviewer, it will be merged to OpenCV. Then you become a open source contributor. Similar is the case with other tutorials, documentation etc.
-
-As new modules are added to OpenCV-Python, this tutorial will have to be expanded. So those who knows about particular algorithm can write up a tutorial which includes a basic theory of the algorithm and a code showing basic usage of the algorithm and submit it to OpenCV.
+As new modules are added to OpenCV-Python, this tutorial will have to be expanded. If you are familiar with a particular algorithm and can write up a tutorial including basic theory of the algorithm and code showing example usage, please do so.

 Remember, we **together** can make this project a great success !!!

--- a/doc/py_tutorials/py_video/py_meanshift/py_meanshift.rst
+++ b/doc/py_tutorials/py_video/py_meanshift/py_meanshift.rst
@ -52,7 +52,7 @@ To use meanshift in OpenCV, first we need to setup the target, find its histogra

    # set up the ROI for tracking
    roi = frame[r:r+h, c:c+w]
-    hsv_roi =  cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
+    hsv_roi =  cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv_roi, np.array((0., 60.,32.)), np.array((180.,255.,255.)))
    roi_hist = cv2.calcHist([hsv_roi],[0],mask,[180],[0,180])
    cv2.normalize(roi_hist,roi_hist,0,255,cv2.NORM_MINMAX)
@ -127,7 +127,7 @@ It is almost same as meanshift, but it returns a rotated rectangle (that is our

    # set up the ROI for tracking
    roi = frame[r:r+h, c:c+w]
-    hsv_roi =  cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
+    hsv_roi =  cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv_roi, np.array((0., 60.,32.)), np.array((180.,255.,255.)))
    roi_hist = cv2.calcHist([hsv_roi],[0],mask,[180],[0,180])
    cv2.normalize(roi_hist,roi_hist,0,255,cv2.NORM_MINMAX)
--- a/doc/tutorials/core/adding_images/adding_images.rst
+++ b/doc/tutorials/core/adding_images/adding_images.rst
@ -6,12 +6,12 @@ Adding (blending) two images using OpenCV
 Goal
 =====

-In this tutorial you will learn how to:
+In this tutorial you will learn:

 .. container:: enumeratevisibleitemswithsquare

-   * What is *linear blending* and why it is useful.
-   * Add two images using :add_weighted:`addWeighted <>`
+   * what is *linear blending* and why it is useful;
+   * how to add two images using :add_weighted:`addWeighted <>`

 Theory
 =======
--- a/doc/tutorials/core/how_to_scan_images/how_to_scan_images.rst
+++ b/doc/tutorials/core/how_to_scan_images/how_to_scan_images.rst
@ -18,7 +18,7 @@ We'll seek answers for the following questions:
 Our test case
 =============

-Let us consider a simple color reduction method. Using the unsigned char C and C++ type for matrix item storing a channel of pixel may have up to 256 different values. For a three channel image this can allow the formation of way too many colors (16 million to be exact). Working with so many color shades may give a heavy blow to our algorithm performance. However, sometimes it is enough to work with a lot less of them to get the same final result.
+Let us consider a simple color reduction method. By using the unsigned char C and C++ type for matrix item storing, a channel of pixel may have up to 256 different values. For a three channel image this can allow the formation of way too many colors (16 million to be exact). Working with so many color shades may give a heavy blow to our algorithm performance. However, sometimes it is enough to work with a lot less of them to get the same final result.

 In this cases it's common that we make a *color space reduction*. This means that we divide the color space current value with a new input value to end up with fewer colors. For instance every value between zero and nine takes the new value zero, every value between ten and nineteen the value ten and so on.

--- a/doc/tutorials/core/mat-mask-operations/mat-mask-operations.rst
+++ b/doc/tutorials/core/mat-mask-operations/mat-mask-operations.rst
@ -32,14 +32,14 @@ Here's a function that will do this:

 .. code-block:: cpp

-   void Sharpen(const Mat& myImage,Mat& Result)
+   void Sharpen(const Mat& myImage, Mat& Result)
   {
       CV_Assert(myImage.depth() == CV_8U);  // accept only uchar images

-       Result.create(myImage.size(),myImage.type());
+       Result.create(myImage.size(), myImage.type());
       const int nChannels = myImage.channels();

-       for(int j = 1 ; j < myImage.rows-1; ++j)
+       for(int j = 1; j < myImage.rows - 1; ++j)
       {
           const uchar* previous = myImage.ptr<uchar>(j - 1);
           const uchar* current  = myImage.ptr<uchar>(j    );
@ -47,17 +47,17 @@ Here's a function that will do this:

           uchar* output = Result.ptr<uchar>(j);

-           for(int i= nChannels;i < nChannels*(myImage.cols-1); ++i)
+           for(int i = nChannels; i < nChannels * (myImage.cols - 1); ++i)
           {
-               *output++ = saturate_cast<uchar>(5*current[i]
-                            -current[i-nChannels] - current[i+nChannels] - previous[i] - next[i]);
+               *output++ = saturate_cast<uchar>(5 * current[i]
+                            -current[i - nChannels] - current[i + nChannels] - previous[i] - next[i]);
           }
       }

       Result.row(0).setTo(Scalar(0));
-       Result.row(Result.rows-1).setTo(Scalar(0));
+       Result.row(Result.rows - 1).setTo(Scalar(0));
       Result.col(0).setTo(Scalar(0));
-       Result.col(Result.cols-1).setTo(Scalar(0));
+       Result.col(Result.cols - 1).setTo(Scalar(0));
   }

 At first we make sure that the input images data is in unsigned char format. For this we use the :utilitysystemfunctions:`CV_Assert <cv-assert>` function that throws an error when the expression inside it is false.
@ -70,14 +70,14 @@ We create an output image with the same size and the same type as our input. As

 .. code-block:: cpp

-   Result.create(myImage.size(),myImage.type());
+   Result.create(myImage.size(), myImage.type());
   const int nChannels = myImage.channels();

 We'll use the plain C [] operator to access pixels. Because we need to access multiple rows at the same time we'll acquire the pointers for each of them (a previous, a current and a next line). We need another pointer to where we're going to save the calculation. Then simply access the right items with the [] operator. For moving the output pointer ahead we simply increase this (with one byte) after each operation:

 .. code-block:: cpp

-   for(int j = 1 ; j < myImage.rows-1; ++j)
+   for(int j = 1; j < myImage.rows - 1; ++j)
   {
       const uchar* previous = myImage.ptr<uchar>(j - 1);
       const uchar* current  = myImage.ptr<uchar>(j    );
@ -85,21 +85,21 @@ We'll use the plain C [] operator to access pixels. Because we need to access mu

       uchar* output = Result.ptr<uchar>(j);

-       for(int i= nChannels;i < nChannels*(myImage.cols-1); ++i)
+       for(int i = nChannels; i < nChannels * (myImage.cols - 1); ++i)
       {
-           *output++ = saturate_cast<uchar>(5*current[i]
-                        -current[i-nChannels] - current[i+nChannels] - previous[i] - next[i]);
+           *output++ = saturate_cast<uchar>(5 * current[i]
+                        -current[i - nChannels] - current[i + nChannels] - previous[i] - next[i]);
       }
   }

-On the borders of the image the upper notation results inexistent pixel locations (like minus one - minus one). In these points our formula is undefined. A simple solution is to not apply the mask in these points and, for example, set the pixels on the borders to zeros:
+On the borders of the image the upper notation results in nonexistent pixel locations (like minus one - minus one). In these points our formula is undefined. A simple solution is to not apply the kernel in these points and, for example, set the pixels on the borders to zeros:

 .. code-block:: cpp

-   Result.row(0).setTo(Scalar(0));             // The top row
-   Result.row(Result.rows-1).setTo(Scalar(0)); // The bottom row
-   Result.col(0).setTo(Scalar(0));             // The left column
-   Result.col(Result.cols-1).setTo(Scalar(0)); // The right column
+   Result.row(0).setTo(Scalar(0));               // The top row
+   Result.row(Result.rows - 1).setTo(Scalar(0)); // The bottom row
+   Result.col(0).setTo(Scalar(0));               // The left column
+   Result.col(Result.cols - 1).setTo(Scalar(0)); // The right column

 The filter2D function
 =====================
@ -116,7 +116,7 @@ Then call the :filtering:`filter2D <filter2d>` function specifying the input, th

 .. code-block:: cpp

-   filter2D(I, K, I.depth(), kern );
+   filter2D(I, K, I.depth(), kern);

 The function even has a fifth optional argument to specify the center of the kernel, and a sixth one for determining what to do in the regions where the operation is undefined (borders). Using this function has the advantage that it's shorter, less verbose and because there are some optimization techniques implemented it is usually faster than the *hand-coded method*. For example in my test while the second one took only 13 milliseconds the first took around 31 milliseconds. Quite some difference.

--- a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst
+++ b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst
@ -45,7 +45,7 @@ All the above objects, in the end, point to the same single data matrix. Their h
   :linenos:

   Mat D (A, Rect(10, 10, 100, 100) ); // using a rectangle
-   Mat E = A(Range:all(), Range(1,3)); // using row and column boundaries
+   Mat E = A(Range::all(), Range(1,3)); // using row and column boundaries

 Now you may ask if the matrix itself may belong to multiple *Mat* objects who takes responsibility for cleaning it up when it's no longer needed. The short answer is: the last object that used it. This is handled by using a reference counting mechanism. Whenever somebody copies a header of a *Mat* object, a counter is increased for the matrix. Whenever a header is cleaned this counter is decreased. When the counter reaches zero the matrix too is freed. Sometimes you will want to copy the matrix itself too, so OpenCV provides the :basicstructures:`clone() <mat-clone>` and :basicstructures:`copyTo() <mat-copyto>` functions.

@ -86,7 +86,7 @@ Each of the building components has their own valid domains. This leads to the d
 Creating a *Mat* object explicitly
 ==================================

-In the :ref:`Load_Save_Image` tutorial you have already learned how to write a matrix to an image file by using the :readWriteImageVideo:` imwrite() <imwrite>` function. However, for debugging purposes it's much more convenient to see the actual values. You can do this using the << operator of *Mat*. Be aware that this only works for two dimensional matrices.
+In the :ref:`Load_Save_Image` tutorial you have already learned how to write a matrix to an image file by using the :readwriteimagevideo:`imwrite() <imwrite>` function. However, for debugging purposes it's much more convenient to see the actual values. You can do this using the << operator of *Mat*. Be aware that this only works for two dimensional matrices.

 Although *Mat* works really well as an image container, it is also a general matrix class. Therefore, it is possible to create and manipulate multidimensional matrices. You can create a Mat object in multiple ways:

--- a/doc/tutorials/imgproc/histograms/histogram_comparison/histogram_comparison.rst
+++ b/doc/tutorials/imgproc/histograms/histogram_comparison/histogram_comparison.rst
@ -84,88 +84,10 @@ Code

   * **Code at glance:**

-.. code-block:: cpp
-
-   #include "opencv2/highgui.hpp"
-   #include "opencv2/imgproc.hpp"
-   #include <iostream>
-   #include <stdio.h>
-
-   using namespace std;
-   using namespace cv;
-
-   /** @function main */
-   int main( int argc, char** argv )
-   {
-     Mat src_base, hsv_base;
-     Mat src_test1, hsv_test1;
-     Mat src_test2, hsv_test2;
-     Mat hsv_half_down;
-
-     /// Load three images with different environment settings
-     if( argc < 4 )
-       { printf("** Error. Usage: ./compareHist_Demo <image_settings0> <image_setting1> <image_settings2>\n");
-         return -1;
-       }
-
-     src_base = imread( argv[1], 1 );
-     src_test1 = imread( argv[2], 1 );
-     src_test2 = imread( argv[3], 1 );
-
-     /// Convert to HSV
-     cvtColor( src_base, hsv_base, CV_BGR2HSV );
-     cvtColor( src_test1, hsv_test1, CV_BGR2HSV );
-     cvtColor( src_test2, hsv_test2, CV_BGR2HSV );
-
-     hsv_half_down = hsv_base( Range( hsv_base.rows/2, hsv_base.rows - 1 ), Range( 0, hsv_base.cols - 1 ) );
-
-     /// Using 30 bins for hue and 32 for saturation
-     int h_bins = 50; int s_bins = 60;
-     int histSize[] = { h_bins, s_bins };
-
-     // hue varies from 0 to 256, saturation from 0 to 180
-     float h_ranges[] = { 0, 256 };
-     float s_ranges[] = { 0, 180 };
-
-     const float* ranges[] = { h_ranges, s_ranges };
-
-     // Use the o-th and 1-st channels
-     int channels[] = { 0, 1 };
-
-     /// Histograms
-     MatND hist_base;
-     MatND hist_half_down;
-     MatND hist_test1;
-     MatND hist_test2;
-
-     /// Calculate the histograms for the HSV images
-     calcHist( &hsv_base, 1, channels, Mat(), hist_base, 2, histSize, ranges, true, false );
-     normalize( hist_base, hist_base, 0, 1, NORM_MINMAX, -1, Mat() );
+.. literalinclude:: ../../../../../samples/cpp/tutorial_code/Histograms_Matching/compareHist_Demo.cpp
+   :language: cpp
+   :tab-width: 4

-     calcHist( &hsv_half_down, 1, channels, Mat(), hist_half_down, 2, histSize, ranges, true, false );
-     normalize( hist_half_down, hist_half_down, 0, 1, NORM_MINMAX, -1, Mat() );
-
-     calcHist( &hsv_test1, 1, channels, Mat(), hist_test1, 2, histSize, ranges, true, false );
-     normalize( hist_test1, hist_test1, 0, 1, NORM_MINMAX, -1, Mat() );
-
-     calcHist( &hsv_test2, 1, channels, Mat(), hist_test2, 2, histSize, ranges, true, false );
-     normalize( hist_test2, hist_test2, 0, 1, NORM_MINMAX, -1, Mat() );
-
-     /// Apply the histogram comparison methods
-     for( int i = 0; i < 4; i++ )
-        { int compare_method = i;
-          double base_base = compareHist( hist_base, hist_base, compare_method );
-          double base_half = compareHist( hist_base, hist_half_down, compare_method );
-          double base_test1 = compareHist( hist_base, hist_test1, compare_method );
-          double base_test2 = compareHist( hist_base, hist_test2, compare_method );
-
-          printf( " Method [%d] Perfect, Base-Half, Base-Test(1), Base-Test(2) : %f, %f, %f, %f \n", i, base_base, base_half , base_test1, base_test2 );
-        }
-
-     printf( "Done \n" );
-
-     return 0;
-    }


 Explanation
@ -211,11 +133,11 @@ Explanation

   .. code-block:: cpp

-     int h_bins = 50; int s_bins = 32;
+     int h_bins = 50; int s_bins = 60;
     int histSize[] = { h_bins, s_bins };

-     float h_ranges[] = { 0, 256 };
-     float s_ranges[] = { 0, 180 };
+     float h_ranges[] = { 0, 180 };
+     float s_ranges[] = { 0, 256 };

     const float* ranges[] = { h_ranges, s_ranges };

--- a/doc/tutorials/introduction/load_save_image/load_save_image.rst
+++ b/doc/tutorials/introduction/load_save_image/load_save_image.rst
@ -5,7 +5,7 @@ Load, Modify, and Save an Image

 .. note::

-   We assume that by now you know how to load an image using :imread:`imread <>` and to display it in a window (using :imshow:`imshow <>`). Read the :ref:`Display_Image` tutorial otherwise.
+   We assume that by now you know how to load an image using :readwriteimagevideo:`imread <imread>` and to display it in a window (using :user_interface:`imshow <imshow>`). Read the :ref:`Display_Image` tutorial otherwise.

 Goals
 ======
@ -14,9 +14,9 @@ In this tutorial you will learn how to:

 .. container:: enumeratevisibleitemswithsquare

-   * Load an image using :imread:`imread <>`
-   * Transform an image from BGR to Grayscale format by using :cvt_color:`cvtColor <>`
-   * Save your transformed image in a file on disk (using :imwrite:`imwrite <>`)
+   * Load an image using :readwriteimagevideo:`imread <imread>`
+   * Transform an image from BGR to Grayscale format by using :miscellaneous_transformations:`cvtColor <cvtcolor>`
+   * Save your transformed image in a file on disk (using :readwriteimagevideo:`imwrite <imwrite>`)

 Code
 ======
@ -62,10 +62,7 @@ Here it is:
 Explanation
 ============

-#. We begin by:
-
-   * Creating a Mat object to store the image information
-   * Load an image using :imread:`imread <>`, located in the path given by *imageName*. Fort this example, assume you are loading a RGB image.
+#. We begin by loading an image using :readwriteimagevideo:`imread <imread>`, located in the path given by *imageName*. For this example, assume you are loading a RGB image.

 #. Now we are going to convert our image from BGR to Grayscale format. OpenCV has a really nice function to do this kind of transformations:

@ -73,15 +70,15 @@ Explanation

      cvtColor( image, gray_image, CV_BGR2GRAY );

-   As you can see, :cvt_color:`cvtColor <>` takes as arguments:
+   As you can see, :miscellaneous_transformations:`cvtColor <cvtcolor>` takes as arguments:

   .. container:: enumeratevisibleitemswithsquare

      * a source image (*image*)
      * a destination image (*gray_image*), in which we will save the converted image.
-      * an additional parameter that indicates what kind of transformation will be performed. In this case we use **CV_BGR2GRAY** (because of :imread:`imread <>` has BGR default channel order in case of color images).
+      * an additional parameter that indicates what kind of transformation will be performed. In this case we use **CV_BGR2GRAY** (because :readwriteimagevideo:`imread <imread>` has BGR default channel order in case of color images).

-#. So now we have our new *gray_image* and want to save it on disk (otherwise it will get lost after the program ends). To save it, we will use a function analagous to :imread:`imread <>`: :imwrite:`imwrite <>`
+#. So now we have our new *gray_image* and want to save it on disk (otherwise it will get lost after the program ends). To save it, we will use a function analogous to :readwriteimagevideo:`imread <imread>`: :readwriteimagevideo:`imwrite <imwrite>`

   .. code-block:: cpp

--- a/modules/androidcamera/camera_wrapper/CMakeLists.txt
+++ b/modules/androidcamera/camera_wrapper/CMakeLists.txt
@ -58,7 +58,7 @@ SET_TARGET_PROPERTIES(${the_target} PROPERTIES
                      RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
                      )

-if (NOT (CMAKE_BUILD_TYPE MATCHES "debug"))
+if (NOT (CMAKE_BUILD_TYPE MATCHES "Debug"))
    ADD_CUSTOM_COMMAND( TARGET ${the_target} POST_BUILD COMMAND ${CMAKE_STRIP} --strip-unneeded "${LIBRARY_OUTPUT_PATH}/lib${the_target}.so" )
 endif()

--- a/modules/androidcamera/camera_wrapper/camera_wrapper.cpp
+++ b/modules/androidcamera/camera_wrapper/camera_wrapper.cpp
@ -61,6 +61,12 @@

 using namespace android;

+// non-public camera related classes are not binary compatible
+// objects of these classes have different sizeof on different platforms
+// additional memory tail to all system objects to overcome sizeof issue
+#define MAGIC_TAIL 4096
+
+
 void debugShowFPS();

 #if defined(ANDROID_r4_1_1) || defined(ANDROID_r4_2_0) || defined(ANDROID_r4_3_0)
@ -90,6 +96,7 @@ public:
 };
 #endif

+
 std::string getProcessName()
 {
    std::string result;
@ -142,12 +149,22 @@ class CameraHandler: public CameraListener
 protected:
    int cameraId;
    sp<Camera> camera;
-    CameraParameters params;
+#if defined(ANDROID_r3_0_1) || defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3)
+    sp<SurfaceTexture> surface;
+#endif
+#if defined(ANDROID_r4_1_1) || defined(ANDROID_r4_2_0) || defined(ANDROID_r4_3_0) || defined(ANDROID_r4_4_0)
+    sp<BufferQueue> queue;
+    sp<ConsumerListenerStub> listener;
+#endif
+    CameraParameters* params;
    CameraCallback cameraCallback;
    void* userData;

    int emptyCameraCallbackReported;

+    int width;
+    int height;
+
    static const char* flashModesNames[ANDROID_CAMERA_FLASH_MODES_NUM];
    static const char* focusModesNames[ANDROID_CAMERA_FOCUS_MODES_NUM];
    static const char* whiteBalanceModesNames[ANDROID_CAMERA_WHITE_BALANCE_MODES_NUM];
@ -258,7 +275,7 @@ protected:

    int is_supported(const char* supp_modes_key, const char* mode)
    {
-        const char* supported_modes = params.get(supp_modes_key);
+        const char* supported_modes = params->get(supp_modes_key);
        return (supported_modes && mode && (strstr(supported_modes, mode) > 0));
    }

@ -268,7 +285,7 @@ protected:
        if (focus_distance_type >= 0 && focus_distance_type < 3)
    {
            float focus_distances[3];
-            const char* output = params.get(CameraParameters::KEY_FOCUS_DISTANCES);
+            const char* output = params->get(CameraParameters::KEY_FOCUS_DISTANCES);
            int val_num = CameraHandler::split_float(output, focus_distances, ',', 3);
            if(val_num == 3)
        {
@ -300,10 +317,15 @@ public:
        emptyCameraCallbackReported(0)
    {
        LOGD("Instantiated new CameraHandler (%p, %p)", callback, _userData);
+        void* params_buffer = operator new(sizeof(CameraParameters) + MAGIC_TAIL);
+        params = new(params_buffer) CameraParameters();
    }

    virtual ~CameraHandler()
    {
+        if (params)
+            params->~CameraParameters();
+            operator delete(params);
        LOGD("CameraHandler destructor is called");
    }

@ -371,10 +393,18 @@ const char* CameraHandler::focusModesNames[ANDROID_CAMERA_FOCUS_MODES_NUM] =
    CameraParameters::FOCUS_MODE_AUTO,
 #if !defined(ANDROID_r2_2_0)
    CameraParameters::FOCUS_MODE_CONTINUOUS_VIDEO,
+#else
+    CameraParameters::FOCUS_MODE_AUTO,
 #endif
    CameraParameters::FOCUS_MODE_EDOF,
    CameraParameters::FOCUS_MODE_FIXED,
-    CameraParameters::FOCUS_MODE_INFINITY
+    CameraParameters::FOCUS_MODE_INFINITY,
+    CameraParameters::FOCUS_MODE_MACRO,
+#if !defined(ANDROID_r2_2_0) && !defined(ANDROID_r2_3_3) && !defined(ANDROID_r3_0_1)
+    CameraParameters::FOCUS_MODE_CONTINUOUS_PICTURE
+#else
+    CameraParameters::FOCUS_MODE_AUTO
+#endif
 };

 const char* CameraHandler::whiteBalanceModesNames[ANDROID_CAMERA_WHITE_BALANCE_MODES_NUM] =
@ -534,39 +564,39 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
    {
        LOGI("initCameraConnect: Setting paramers from previous camera handler");
        camera->setParameters(prevCameraParameters->flatten());
-        handler->params.unflatten(prevCameraParameters->flatten());
+        handler->params->unflatten(prevCameraParameters->flatten());
    }
    else
    {
        android::String8 params_str = camera->getParameters();
        LOGI("initCameraConnect: [%s]", params_str.string());

-        handler->params.unflatten(params_str);
-
-        LOGD("Supported Cameras: %s", handler->params.get("camera-indexes"));
-        LOGD("Supported Picture Sizes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_PICTURE_SIZES));
-        LOGD("Supported Picture Formats: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_PICTURE_FORMATS));
-        LOGD("Supported Preview Sizes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_PREVIEW_SIZES));
-        LOGD("Supported Preview Formats: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_PREVIEW_FORMATS));
-        LOGD("Supported Preview Frame Rates: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_PREVIEW_FRAME_RATES));
-        LOGD("Supported Thumbnail Sizes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_JPEG_THUMBNAIL_SIZES));
-        LOGD("Supported Whitebalance Modes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_WHITE_BALANCE));
-        LOGD("Supported Effects: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_EFFECTS));
-        LOGD("Supported Scene Modes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_SCENE_MODES));
-        LOGD("Supported Focus Modes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_FOCUS_MODES));
-        LOGD("Supported Antibanding Options: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_ANTIBANDING));
-        LOGD("Supported Flash Modes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_FLASH_MODES));
+        handler->params->unflatten(params_str);
+
+        LOGD("Supported Cameras: %s", handler->params->get("camera-indexes"));
+        LOGD("Supported Picture Sizes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_PICTURE_SIZES));
+        LOGD("Supported Picture Formats: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_PICTURE_FORMATS));
+        LOGD("Supported Preview Sizes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_PREVIEW_SIZES));
+        LOGD("Supported Preview Formats: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_PREVIEW_FORMATS));
+        LOGD("Supported Preview Frame Rates: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_PREVIEW_FRAME_RATES));
+        LOGD("Supported Thumbnail Sizes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_JPEG_THUMBNAIL_SIZES));
+        LOGD("Supported Whitebalance Modes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_WHITE_BALANCE));
+        LOGD("Supported Effects: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_EFFECTS));
+        LOGD("Supported Scene Modes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_SCENE_MODES));
+        LOGD("Supported Focus Modes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_FOCUS_MODES));
+        LOGD("Supported Antibanding Options: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_ANTIBANDING));
+        LOGD("Supported Flash Modes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_FLASH_MODES));

 #if !defined(ANDROID_r2_2_0)
        // Set focus mode to continuous-video if supported
-        const char* available_focus_modes = handler->params.get(CameraParameters::KEY_SUPPORTED_FOCUS_MODES);
+        const char* available_focus_modes = handler->params->get(CameraParameters::KEY_SUPPORTED_FOCUS_MODES);
        if (available_focus_modes != 0)
        {
            if (strstr(available_focus_modes, "continuous-video") != NULL)
            {
-                handler->params.set(CameraParameters::KEY_FOCUS_MODE, CameraParameters::FOCUS_MODE_CONTINUOUS_VIDEO);
+                handler->params->set(CameraParameters::KEY_FOCUS_MODE, CameraParameters::FOCUS_MODE_CONTINUOUS_VIDEO);

-                status_t resParams = handler->camera->setParameters(handler->params.flatten());
+                status_t resParams = handler->camera->setParameters(handler->params->flatten());

                if (resParams != 0)
                {
@ -581,7 +611,7 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
 #endif

        //check if yuv420sp format available. Set this format as preview format.
-        const char* available_formats = handler->params.get(CameraParameters::KEY_SUPPORTED_PREVIEW_FORMATS);
+        const char* available_formats = handler->params->get(CameraParameters::KEY_SUPPORTED_PREVIEW_FORMATS);
        if (available_formats != 0)
        {
            const char* format_to_set = 0;
@ -607,9 +637,9 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,

            if (0 != format_to_set)
            {
-                handler->params.setPreviewFormat(format_to_set);
+                handler->params->setPreviewFormat(format_to_set);

-                status_t resParams = handler->camera->setParameters(handler->params.flatten());
+                status_t resParams = handler->camera->setParameters(handler->params->flatten());

                if (resParams != 0)
                    LOGE("initCameraConnect: failed to set preview format to %s", format_to_set);
@ -617,6 +647,13 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
                    LOGD("initCameraConnect: preview format is set to %s", format_to_set);
            }
        }
+
+        handler->params->setPreviewSize(640, 480);
+        status_t resParams = handler->camera->setParameters(handler->params->flatten());
+        if (resParams != 0)
+            LOGE("initCameraConnect: failed to set preview resolution to 640x480");
+        else
+            LOGD("initCameraConnect: preview format is set to 640x480");
    }

    status_t bufferStatus;
@ -627,22 +664,27 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
 #elif defined(ANDROID_r2_3_3)
    /* Do nothing in case of 2.3 for now */
 #elif defined(ANDROID_r3_0_1) || defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3)
-    sp<SurfaceTexture> surfaceTexture = new SurfaceTexture(MAGIC_OPENCV_TEXTURE_ID);
-    bufferStatus = camera->setPreviewTexture(surfaceTexture);
+    void* surface_texture_obj = operator new(sizeof(SurfaceTexture) + MAGIC_TAIL);
+    handler->surface = new(surface_texture_obj) SurfaceTexture(MAGIC_OPENCV_TEXTURE_ID);
+    bufferStatus = camera->setPreviewTexture(handler->surface);
    if (bufferStatus != 0)
        LOGE("initCameraConnect: failed setPreviewTexture call (status %d); camera might not work correctly", bufferStatus);
 #elif defined(ANDROID_r4_1_1) || defined(ANDROID_r4_2_0) || defined(ANDROID_r4_3_0)
-    sp<BufferQueue> bufferQueue = new BufferQueue();
-    sp<BufferQueue::ConsumerListener> queueListener = new ConsumerListenerStub();
-    bufferQueue->consumerConnect(queueListener);
-    bufferStatus = camera->setPreviewTexture(bufferQueue);
+    void* buffer_queue_obj = operator new(sizeof(BufferQueue) + MAGIC_TAIL);
+    handler->queue = new(buffer_queue_obj) BufferQueue();
+    void* consumer_listener_obj = operator new(sizeof(ConsumerListenerStub) + MAGIC_TAIL);
+    handler->listener = new(consumer_listener_obj) ConsumerListenerStub();
+    handler->queue->consumerConnect(handler->listener);
+    bufferStatus = camera->setPreviewTexture(handler->queue);
    if (bufferStatus != 0)
        LOGE("initCameraConnect: failed setPreviewTexture call; camera might not work correctly");
 # elif defined(ANDROID_r4_4_0)
-    sp<BufferQueue> bufferQueue = new BufferQueue();
-    sp<IConsumerListener> queueListener = new ConsumerListenerStub();
-    bufferQueue->consumerConnect(queueListener, true);
-    bufferStatus = handler->camera->setPreviewTarget(bufferQueue);
+    void* buffer_queue_obj = operator new(sizeof(BufferQueue) + MAGIC_TAIL);
+    handler->queue = new(buffer_queue_obj) BufferQueue();
+    void* consumer_listener_obj = operator new(sizeof(ConsumerListenerStub) + MAGIC_TAIL);
+    handler->listener = new(consumer_listener_obj) ConsumerListenerStub();
+    handler->queue->consumerConnect(handler->listener, true);
+    bufferStatus = handler->camera->setPreviewTarget(handler->queue);
    if (bufferStatus != 0)
        LOGE("applyProperties: failed setPreviewTexture call; camera might not work correctly");
 # endif
@ -723,18 +765,18 @@ double CameraHandler::getProperty(int propIdx)
    case ANDROID_CAMERA_PROPERTY_FRAMEWIDTH:
    {
        int w,h;
-        params.getPreviewSize(&w, &h);
+        params->getPreviewSize(&w, &h);
        return w;
    }
    case ANDROID_CAMERA_PROPERTY_FRAMEHEIGHT:
    {
        int w,h;
-        params.getPreviewSize(&w, &h);
+        params->getPreviewSize(&w, &h);
        return h;
    }
    case ANDROID_CAMERA_PROPERTY_SUPPORTED_PREVIEW_SIZES_STRING:
    {
-        cameraPropertySupportedPreviewSizesString = params.get(CameraParameters::KEY_SUPPORTED_PREVIEW_SIZES);
+        cameraPropertySupportedPreviewSizesString = params->get(CameraParameters::KEY_SUPPORTED_PREVIEW_SIZES);
        union {const char* str;double res;} u;
        memset(&u.res, 0, sizeof(u.res));
        u.str = cameraPropertySupportedPreviewSizesString.c_str();
@ -742,7 +784,7 @@ double CameraHandler::getProperty(int propIdx)
    }
    case ANDROID_CAMERA_PROPERTY_PREVIEW_FORMAT_STRING:
    {
-        const char* fmt = params.get(CameraParameters::KEY_PREVIEW_FORMAT);
+        const char* fmt = params->get(CameraParameters::KEY_PREVIEW_FORMAT);
        if (fmt == CameraParameters::PIXEL_FORMAT_YUV422SP)
            fmt = "yuv422sp";
        else if (fmt == CameraParameters::PIXEL_FORMAT_YUV420SP)
@ -762,44 +804,44 @@ double CameraHandler::getProperty(int propIdx)
    }
    case ANDROID_CAMERA_PROPERTY_EXPOSURE:
    {
-        int exposure = params.getInt(CameraParameters::KEY_EXPOSURE_COMPENSATION);
+        int exposure = params->getInt(CameraParameters::KEY_EXPOSURE_COMPENSATION);
        return exposure;
    }
    case ANDROID_CAMERA_PROPERTY_FPS:
    {
-        return params.getPreviewFrameRate();
+        return params->getPreviewFrameRate();
    }
    case ANDROID_CAMERA_PROPERTY_FLASH_MODE:
    {
        int flash_mode = getModeNum(CameraHandler::flashModesNames,
                                    ANDROID_CAMERA_FLASH_MODES_NUM,
-                                    params.get(CameraParameters::KEY_FLASH_MODE));
+                                    params->get(CameraParameters::KEY_FLASH_MODE));
        return flash_mode;
    }
    case ANDROID_CAMERA_PROPERTY_FOCUS_MODE:
    {
        int focus_mode = getModeNum(CameraHandler::focusModesNames,
                                    ANDROID_CAMERA_FOCUS_MODES_NUM,
-                                    params.get(CameraParameters::KEY_FOCUS_MODE));
+                                    params->get(CameraParameters::KEY_FOCUS_MODE));
        return focus_mode;
    }
    case ANDROID_CAMERA_PROPERTY_WHITE_BALANCE:
    {
        int white_balance = getModeNum(CameraHandler::whiteBalanceModesNames,
                                       ANDROID_CAMERA_WHITE_BALANCE_MODES_NUM,
-                                       params.get(CameraParameters::KEY_WHITE_BALANCE));
+                                       params->get(CameraParameters::KEY_WHITE_BALANCE));
        return white_balance;
    }
    case ANDROID_CAMERA_PROPERTY_ANTIBANDING:
    {
        int antibanding = getModeNum(CameraHandler::antibandingModesNames,
                                     ANDROID_CAMERA_ANTIBANDING_MODES_NUM,
-                                     params.get(CameraParameters::KEY_ANTIBANDING));
+                                     params->get(CameraParameters::KEY_ANTIBANDING));
        return antibanding;
    }
    case ANDROID_CAMERA_PROPERTY_FOCAL_LENGTH:
    {
-        float focal_length = params.getFloat(CameraParameters::KEY_FOCAL_LENGTH);
+        float focal_length = params->getFloat(CameraParameters::KEY_FOCAL_LENGTH);
        return focal_length;
    }
    case ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_NEAR:
@ -814,6 +856,24 @@ double CameraHandler::getProperty(int propIdx)
    {
        return getFocusDistance(ANDROID_CAMERA_FOCUS_DISTANCE_FAR_INDEX);
    }
+#if !defined(ANDROID_r2_2_0) && !defined(ANDROID_r2_3_3) && !defined(ANDROID_r3_0_1)
+    case ANDROID_CAMERA_PROPERTY_WHITEBALANCE_LOCK:
+    {
+        const char* status = params->get(CameraParameters::KEY_AUTO_WHITEBALANCE_LOCK);
+        if (status == CameraParameters::TRUE)
+            return 1.;
+        else
+            return 0.;
+    }
+    case ANDROID_CAMERA_PROPERTY_EXPOSE_LOCK:
+    {
+        const char* status = params->get(CameraParameters::KEY_AUTO_EXPOSURE_LOCK);
+        if (status == CameraParameters::TRUE)
+            return 1.;
+        else
+            return 0.;
+    }
+#endif
    default:
        LOGW("CameraHandler::getProperty - Unsupported property.");
    };
@ -824,99 +884,151 @@ void CameraHandler::setProperty(int propIdx, double value)
 {
    LOGD("CameraHandler::setProperty(%d, %f)", propIdx, value);

+    android::String8 params_str;
+    params_str = camera->getParameters();
+    LOGI("Params before set: [%s]", params_str.string());
+
    switch (propIdx)
    {
    case ANDROID_CAMERA_PROPERTY_FRAMEWIDTH:
    {
        int w,h;
-        params.getPreviewSize(&w, &h);
-        w = (int)value;
-        params.setPreviewSize(w, h);
+        params->getPreviewSize(&w, &h);
+        width = (int)value;
    }
    break;
    case ANDROID_CAMERA_PROPERTY_FRAMEHEIGHT:
    {
        int w,h;
-        params.getPreviewSize(&w, &h);
-        h = (int)value;
-        params.setPreviewSize(w, h);
+        params->getPreviewSize(&w, &h);
+        height = (int)value;
    }
    break;
    case ANDROID_CAMERA_PROPERTY_EXPOSURE:
    {
-        int max_exposure = params.getInt("max-exposure-compensation");
-        int min_exposure = params.getInt("min-exposure-compensation");
-        if(max_exposure && min_exposure){
+        int max_exposure = params->getInt("max-exposure-compensation");
+        int min_exposure = params->getInt("min-exposure-compensation");
+        if(max_exposure && min_exposure)
+        {
            int exposure = (int)value;
-            if(exposure >= min_exposure && exposure <= max_exposure){
-                params.set("exposure-compensation", exposure);
-            } else {
+            if(exposure >= min_exposure && exposure <= max_exposure)
+                params->set("exposure-compensation", exposure);
+            else
                LOGE("Exposure compensation not in valid range (%i,%i).", min_exposure, max_exposure);
-            }
-        } else {
+        } else
            LOGE("Exposure compensation adjust is not supported.");
-        }
+
+        camera->setParameters(params->flatten());
    }
    break;
    case ANDROID_CAMERA_PROPERTY_FLASH_MODE:
    {
        int new_val = (int)value;
-        if(new_val >= 0 && new_val < ANDROID_CAMERA_FLASH_MODES_NUM){
+        if(new_val >= 0 && new_val < ANDROID_CAMERA_FLASH_MODES_NUM)
+        {
            const char* mode_name = flashModesNames[new_val];
            if(is_supported(CameraParameters::KEY_SUPPORTED_FLASH_MODES, mode_name))
-                params.set(CameraParameters::KEY_FLASH_MODE, mode_name);
+                params->set(CameraParameters::KEY_FLASH_MODE, mode_name);
            else
                LOGE("Flash mode %s is not supported.", mode_name);
-        } else {
-            LOGE("Flash mode value not in valid range.");
        }
+        else
+            LOGE("Flash mode value not in valid range.");
+
+        camera->setParameters(params->flatten());
    }
    break;
    case ANDROID_CAMERA_PROPERTY_FOCUS_MODE:
    {
        int new_val = (int)value;
-        if(new_val >= 0 && new_val < ANDROID_CAMERA_FOCUS_MODES_NUM){
+        if(new_val >= 0 && new_val < ANDROID_CAMERA_FOCUS_MODES_NUM)
+        {
            const char* mode_name = focusModesNames[new_val];
            if(is_supported(CameraParameters::KEY_SUPPORTED_FOCUS_MODES, mode_name))
-                params.set(CameraParameters::KEY_FOCUS_MODE, mode_name);
+                params->set(CameraParameters::KEY_FOCUS_MODE, mode_name);
            else
                LOGE("Focus mode %s is not supported.", mode_name);
-        } else {
-            LOGE("Focus mode value not in valid range.");
        }
+        else
+            LOGE("Focus mode value not in valid range.");
+
+        camera->setParameters(params->flatten());
    }
    break;
    case ANDROID_CAMERA_PROPERTY_WHITE_BALANCE:
    {
        int new_val = (int)value;
-        if(new_val >= 0 && new_val < ANDROID_CAMERA_WHITE_BALANCE_MODES_NUM){
+        if(new_val >= 0 && new_val < ANDROID_CAMERA_WHITE_BALANCE_MODES_NUM)
+        {
            const char* mode_name = whiteBalanceModesNames[new_val];
            if(is_supported(CameraParameters::KEY_SUPPORTED_WHITE_BALANCE, mode_name))
-                params.set(CameraParameters::KEY_WHITE_BALANCE, mode_name);
+                params->set(CameraParameters::KEY_WHITE_BALANCE, mode_name);
            else
                LOGE("White balance mode %s is not supported.", mode_name);
-        } else {
-            LOGE("White balance mode value not in valid range.");
        }
+        else
+            LOGE("White balance mode value not in valid range.");
+
+        camera->setParameters(params->flatten());
    }
    break;
    case ANDROID_CAMERA_PROPERTY_ANTIBANDING:
    {
        int new_val = (int)value;
-        if(new_val >= 0 && new_val < ANDROID_CAMERA_ANTIBANDING_MODES_NUM){
+        if(new_val >= 0 && new_val < ANDROID_CAMERA_ANTIBANDING_MODES_NUM)
+        {
            const char* mode_name = antibandingModesNames[new_val];
            if(is_supported(CameraParameters::KEY_SUPPORTED_ANTIBANDING, mode_name))
-                params.set(CameraParameters::KEY_ANTIBANDING, mode_name);
+                params->set(CameraParameters::KEY_ANTIBANDING, mode_name);
            else
                LOGE("Antibanding mode %s is not supported.", mode_name);
-        } else {
+        }
+        else
            LOGE("Antibanding mode value not in valid range.");
+
+        camera->setParameters(params->flatten());
+    }
+    break;
+#if !defined(ANDROID_r2_2_0) && !defined(ANDROID_r2_3_3) && !defined(ANDROID_r3_0_1)
+    case ANDROID_CAMERA_PROPERTY_EXPOSE_LOCK:
+    {
+        if (is_supported(CameraParameters::KEY_AUTO_EXPOSURE_LOCK_SUPPORTED, "true"))
+        {
+            if (value != 0)
+                params->set(CameraParameters::KEY_AUTO_EXPOSURE_LOCK, CameraParameters::TRUE);
+            else
+                params->set(CameraParameters::KEY_AUTO_EXPOSURE_LOCK, CameraParameters::FALSE);
+            LOGE("Expose lock is set");
        }
+        else
+            LOGE("Expose lock is not supported");
+
+        camera->setParameters(params->flatten());
    }
    break;
+    case ANDROID_CAMERA_PROPERTY_WHITEBALANCE_LOCK:
+    {
+        if (is_supported(CameraParameters::KEY_AUTO_WHITEBALANCE_LOCK_SUPPORTED, "true"))
+        {
+            if (value != 0)
+                params->set(CameraParameters::KEY_AUTO_WHITEBALANCE_LOCK, CameraParameters::TRUE);
+            else
+                params->set(CameraParameters::KEY_AUTO_WHITEBALANCE_LOCK, CameraParameters::FALSE);
+            LOGE("White balance lock is set");
+        }
+        else
+            LOGE("White balance lock is not supported");
+
+        camera->setParameters(params->flatten());
+    }
+    break;
+#endif
    default:
        LOGW("CameraHandler::setProperty - Unsupported property.");
    };
+
+    params_str = camera->getParameters();
+    LOGI("Params after set: [%s]", params_str.string());
 }

 void CameraHandler::applyProperties(CameraHandler** ppcameraHandler)
@ -935,7 +1047,10 @@ void CameraHandler::applyProperties(CameraHandler** ppcameraHandler)
        return;
    }

-    CameraParameters curCameraParameters((*ppcameraHandler)->params.flatten());
+    // delayed resolution setup to avoid errors while other parameters are set on the fly
+    // without a camera restart
+    if (((*ppcameraHandler)->width != 0) && ((*ppcameraHandler)->height != 0))
+        (*ppcameraHandler)->params->setPreviewSize((*ppcameraHandler)->width, (*ppcameraHandler)->height);

 #if defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3) || defined(ANDROID_r4_1_1) || defined(ANDROID_r4_2_0) \
 || defined(ANDROID_r4_3_0) || defined(ANDROID_r4_4_0)
@ -951,27 +1066,27 @@ void CameraHandler::applyProperties(CameraHandler** ppcameraHandler)
        return;
    }

-    handler->camera->setParameters(curCameraParameters.flatten());
-    handler->params.unflatten(curCameraParameters.flatten());
+    handler->camera->setParameters((*ppcameraHandler)->params->flatten());

    status_t bufferStatus;
 # if defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3)
-    sp<SurfaceTexture> surfaceTexture = new SurfaceTexture(MAGIC_OPENCV_TEXTURE_ID);
-    bufferStatus = handler->camera->setPreviewTexture(surfaceTexture);
+    void* surface_texture_obj = operator new(sizeof(SurfaceTexture) + MAGIC_TAIL);
+    handler->surface = new(surface_texture_obj) SurfaceTexture(MAGIC_OPENCV_TEXTURE_ID);
+    bufferStatus = handler->camera->setPreviewTexture(handler->surface);
    if (bufferStatus != 0)
        LOGE("applyProperties: failed setPreviewTexture call (status %d); camera might not work correctly", bufferStatus);
 # elif defined(ANDROID_r4_1_1) || defined(ANDROID_r4_2_0) || defined(ANDROID_r4_3_0)
-    sp<BufferQueue> bufferQueue = new BufferQueue();
-    sp<BufferQueue::ConsumerListener> queueListener = new ConsumerListenerStub();
-    bufferQueue->consumerConnect(queueListener);
-    bufferStatus = handler->camera->setPreviewTexture(bufferQueue);
+    void* buffer_queue_obj = operator new(sizeof(BufferQueue) + MAGIC_TAIL);
+    handler->queue = new(buffer_queue_obj) BufferQueue();
+    handler->queue->consumerConnect(handler->listener);
+    bufferStatus = handler->camera->setPreviewTexture(handler->queue);
    if (bufferStatus != 0)
        LOGE("applyProperties: failed setPreviewTexture call; camera might not work correctly");
 # elif defined(ANDROID_r4_4_0)
-    sp<BufferQueue> bufferQueue = new BufferQueue();
-    sp<IConsumerListener> queueListener = new ConsumerListenerStub();
-    bufferQueue->consumerConnect(queueListener, true);
-    bufferStatus = handler->camera->setPreviewTarget(bufferQueue);
+    void* buffer_queue_obj = operator new(sizeof(BufferQueue) + MAGIC_TAIL);
+    handler->queue = new(buffer_queue_obj) BufferQueue();
+    handler->queue->consumerConnect(handler->listener, true);
+    bufferStatus = handler->camera->setPreviewTarget(handler->queue);
    if (bufferStatus != 0)
        LOGE("applyProperties: failed setPreviewTexture call; camera might not work correctly");
 # endif
@ -1002,7 +1117,7 @@ void CameraHandler::applyProperties(CameraHandler** ppcameraHandler)
    LOGD("CameraHandler::applyProperties(): after previousCameraHandler->closeCameraConnect");

    LOGD("CameraHandler::applyProperties(): before initCameraConnect");
-    CameraHandler* handler=initCameraConnect(cameraCallback, cameraId, userData, &curCameraParameters);
+    CameraHandler* handler=initCameraConnect(cameraCallback, cameraId, userData, (*ppcameraHandler)->params);
    LOGD("CameraHandler::applyProperties(): after initCameraConnect, handler=0x%x", (int)handler);
    if (handler == NULL) {
        LOGE("ERROR in applyProperties --- cannot reinit camera");
--- a/modules/androidcamera/include/camera_properties.h
+++ b/modules/androidcamera/include/camera_properties.h
@ -15,7 +15,9 @@ enum {
    ANDROID_CAMERA_PROPERTY_FOCAL_LENGTH = 105,
    ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_NEAR = 106,
    ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_OPTIMAL = 107,
-    ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_FAR = 108
+    ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_FAR = 108,
+    ANDROID_CAMERA_PROPERTY_EXPOSE_LOCK = 109,
+    ANDROID_CAMERA_PROPERTY_WHITEBALANCE_LOCK = 110
 };


@ -30,12 +32,12 @@ enum {

 enum {
    ANDROID_CAMERA_FOCUS_MODE_AUTO = 0,
-    ANDROID_CAMERA_FOCUS_MODE_CONTINUOUS_PICTURE,
    ANDROID_CAMERA_FOCUS_MODE_CONTINUOUS_VIDEO,
    ANDROID_CAMERA_FOCUS_MODE_EDOF,
    ANDROID_CAMERA_FOCUS_MODE_FIXED,
    ANDROID_CAMERA_FOCUS_MODE_INFINITY,
    ANDROID_CAMERA_FOCUS_MODE_MACRO,
+    ANDROID_CAMERA_FOCUS_MODE_CONTINUOUS_PICTURE,
    ANDROID_CAMERA_FOCUS_MODES_NUM
 };

--- a/modules/calib3d/src/opencl/stereobm.cl
+++ b/modules/calib3d/src/opencl/stereobm.cl
@ -147,6 +147,8 @@ __kernel void stereoBM(__global const uchar * leftptr, __global const uchar * ri
    __local int best_disp[2];
    __local int best_cost[2];
    best_cost[nthread] = MAX_VAL;
+    best_disp[nthread] = MAX_VAL;
+    barrier(CLK_LOCAL_MEM_FENCE);

    short costbuf[wsz];
    int head = 0;
@ -159,7 +161,7 @@ __kernel void stereoBM(__global const uchar * leftptr, __global const uchar * ri
    int costIdx = calcLocalIdx(lx, ly, d, sizeY);
    cost = costFunc + costIdx;

-    short tempcost = 0;
+    int tempcost = 0;
    if(x < cols-wsz2-mindisp && y < rows-wsz2)
    {
        int shift = 1*nthread + cols*(1-nthread);
@ -191,7 +193,7 @@ __kernel void stereoBM(__global const uchar * leftptr, __global const uchar * ri
    barrier(CLK_LOCAL_MEM_FENCE);

    if(best_cost[1] == tempcost)
-        best_disp[1] = ndisp - d - 1;
+        atomic_min(best_disp + 1, ndisp - d - 1);
    barrier(CLK_LOCAL_MEM_FENCE);

    int dispIdx = mad24(gy, disp_step, disp_offset + gx*(int)sizeof(short));
@ -209,6 +211,7 @@ __kernel void stereoBM(__global const uchar * leftptr, __global const uchar * ri
        y = (ly < sizeY) ? gy + shiftY + ly : rows;

        best_cost[nthread] = MAX_VAL;
+        best_disp[nthread] = MAX_VAL;
        barrier(CLK_LOCAL_MEM_FENCE);

        costIdx = calcLocalIdx(lx, ly, d, sizeY);
@ -227,12 +230,11 @@ __kernel void stereoBM(__global const uchar * leftptr, __global const uchar * ri
        barrier(CLK_LOCAL_MEM_FENCE);

        if(best_cost[nthread] == tempcost)
-            best_disp[nthread] = ndisp - d - 1;
+            atomic_min(best_disp + nthread, ndisp - d - 1);
        barrier(CLK_LOCAL_MEM_FENCE);

        int dispIdx = mad24(gy+ly, disp_step, disp_offset + (gx+lx)*(int)sizeof(short));
        disp = (__global short *)(dispptr + dispIdx);
-
        calcDisp(cost, disp, uniquenessRatio, mindisp, ndisp, 2*sizeY,
            best_disp + nthread, best_cost + nthread, d, x, y, cols, rows, wsz2);
        barrier(CLK_LOCAL_MEM_FENCE);
--- a/modules/core/doc/old_basic_structures.rst
+++ b/modules/core/doc/old_basic_structures.rst
@ -1387,7 +1387,7 @@ description rewritten using

    IplImage* color_img = cvCreateImage(cvSize(320,240), IPL_DEPTH_8U, 3);
    IplImage gray_img_hdr, *gray_img;
-    gray_img = (IplImage*)cvReshapeND(color_img, &gray_img_hdr, 1, 0, 0);
+    gray_img = (IplImage*)cvReshapeMatND(color_img, sizeof(gray_img_hdr), &gray_img_hdr, 1, 0, 0);

    ...

@ -1395,6 +1395,18 @@ description rewritten using
    int size[] = { 2, 2, 2 };
    CvMatND* mat = cvCreateMatND(3, size, CV_32F);
    CvMat row_header, *row;
+    row = (CvMat*)cvReshapeMatND(mat, sizeof(row_header), &row_header, 0, 1, 0);
+
+..
+
+In C, the header file for this function includes a convenient macro ``cvReshapeND`` that does away with the ``sizeof_header`` parameter. So, the lines containing the call to ``cvReshapeMatND`` in the examples may be replaced as follows:
+
+::
+
+    gray_img = (IplImage*)cvReshapeND(color_img, &gray_img_hdr, 1, 0, 0);
+
+    ...
+
    row = (CvMat*)cvReshapeND(mat, &row_header, 0, 1, 0);

 ..
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@ -118,6 +118,8 @@ public:

    virtual int kind() const;
    virtual int dims(int i=-1) const;
+    virtual int cols(int i=-1) const;
+    virtual int rows(int i=-1) const;
    virtual Size size(int i=-1) const;
    virtual int sizend(int* sz, int i=-1) const;
    virtual bool sameSize(const _InputArray& arr) const;
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@ -592,7 +592,7 @@ protected:
 CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
 CV_EXPORTS const char* typeToStr(int t);
 CV_EXPORTS const char* memopTypeToStr(int t);
-CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1);
+CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL);
 CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info);
 CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
                                         InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@ -1299,7 +1299,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
    int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
    bool haveMask = !_mask.empty();

-    if( ((haveMask || haveScalar) && cn > 4) )
+    if ( (haveMask || haveScalar) && cn > 4 )
        return false;

    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype));
@ -1320,14 +1320,11 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
            "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d",
            (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
            oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
-            ocl::typeToStr(CV_MAKETYPE(depth1, 1)),
-            ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
-            ocl::typeToStr(CV_MAKETYPE(depth2, 1)),
-            ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
-            ocl::typeToStr(CV_MAKETYPE(ddepth, 1)),
-            ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
+            ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
+            ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
+            ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
            ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)),
-            ocl::typeToStr(CV_MAKETYPE(wdepth, 1)), wdepth,
+            ocl::typeToStr(wdepth), wdepth,
            ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
            ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
            ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
@ -1347,7 +1344,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
    }

    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
-    if( k.empty() )
+    if (k.empty())
        return false;

    UMat src1 = _src1.getUMat(), src2;
@ -1388,12 +1385,12 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,

        if( !haveMask )
        {
-            if(n == 0)
+            if (n == 0)
                k.args(src1arg, src2arg, dstarg);
-            else if(n == 1)
+            else if (n == 1)
                k.args(src1arg, src2arg, dstarg,
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
-            else if(n == 3)
+            else if (n == 3)
                k.args(src1arg, src2arg, dstarg,
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz),
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
@ -2621,53 +2618,37 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
 {
    const ocl::Device& dev = ocl::Device::getDefault();
    bool doubleSupport = dev.doubleFPConfig() > 0;
-    int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
-    int type2 = _src2.type();
-
-    if (!haveScalar)
-    {
-        if ( (!doubleSupport && (depth1 == CV_64F || _src2.depth() == CV_64F)) ||
-            !_src1.sameSize(_src2) || type1 != type2)
-            return false;
-    }
-    else
-    {
-        if (cn > 1 || depth1 <= CV_32S) // FIXIT: if (cn > 4): Need to clear CPU-based compare behavior
-            return false;
-    }
+    int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1),
+            type2 = _src2.type(), depth2 = CV_MAT_DEPTH(type2);

    if (!doubleSupport && depth1 == CV_64F)
        return false;

+    if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2))
+            return false;
+
    int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
    // Workaround for bug with "?:" operator in AMD OpenCL compiler
-    bool workaroundForAMD = /*dev.isAMD() &&*/
-            (
-                (depth1 != CV_8U && depth1 != CV_8S)
-            );
-    if (workaroundForAMD)
+    if (depth1 >= CV_16U)
        kercn = 1;

    int scalarcn = kercn == 3 ? 4 : kercn;
-
    const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
    char cvt[40];

-    String buildOptions = format(
-            "-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
-            " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
-            " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s%s",
-            (haveScalar ? "UNARY_OP" : "BINARY_OP"),
-            ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
-            ocl::typeToStr(CV_8UC(kercn)), kercn,
-            ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
-            operationMap[op],
-            ocl::typeToStr(depth1), ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
-            ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)),
-            doubleSupport ? " -D DOUBLE_SUPPORT" : ""
-            );
-
-    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, buildOptions);
+    String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
+                         " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
+                         " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s%s",
+                         haveScalar ? "UNARY_OP" : "BINARY_OP",
+                         ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
+                         ocl::typeToStr(CV_8UC(kercn)), kercn,
+                         ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
+                         operationMap[op], ocl::typeToStr(depth1),
+                         ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
+                         ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+
+    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
    if (k.empty())
        return false;

@ -2678,24 +2659,43 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in

    if (haveScalar)
    {
-        size_t esz = CV_ELEM_SIZE1(type1)*scalarcn;
-        double buf[4]={0,0,0,0};
-        Mat src2sc = _src2.getMat();
+        size_t esz = CV_ELEM_SIZE1(type1) * scalarcn;
+        double buf[4] = { 0, 0, 0, 0 };
+        Mat src2 = _src2.getMat();

-        if (!src2sc.empty())
-            convertAndUnrollScalar(src2sc, type1, (uchar*)buf, 1);
+        if( depth1 > CV_32S )
+            convertAndUnrollScalar( src2, depth1, (uchar *)buf, kercn );
+        else
+        {
+            double fval = 0;
+            getConvertFunc(depth2, CV_64F)(src2.data, 0, 0, 0, (uchar *)&fval, 0, Size(1, 1), 0);
+            if( fval < getMinVal(depth1) )
+                return dst.setTo(Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0)), true;
+
+            if( fval > getMaxVal(depth1) )
+                return dst.setTo(Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0)), true;
+
+            int ival = cvRound(fval);
+            if( fval != ival )
+            {
+                if( op == CMP_LT || op == CMP_GE )
+                    ival = cvCeil(fval);
+                else if( op == CMP_LE || op == CMP_GT )
+                    ival = cvFloor(fval);
+                else
+                    return dst.setTo(Scalar::all(op == CMP_NE ? 255 : 0)), true;
+            }
+            convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, (uchar *)buf, kercn);
+        }

        ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);

        k.args(ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn),
-               ocl::KernelArg::WriteOnly(dst, cn, kercn),
-               scalararg);
+               ocl::KernelArg::WriteOnly(dst, cn, kercn), scalararg);
    }
    else
    {
-        CV_DbgAssert(type1 == type2);
        UMat src2 = _src2.getUMat();
-        CV_DbgAssert(size == src2.size());

        k.args(ocl::KernelArg::ReadOnlyNoSize(src1),
               ocl::KernelArg::ReadOnlyNoSize(src2),
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@ -415,42 +415,54 @@ namespace cv {

 static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
 {
-    std::vector<UMat> src;
+    std::vector<UMat> src, ksrc;
    _mv.getUMatVector(src);
    CV_Assert(!src.empty());

    int type = src[0].type(), depth = CV_MAT_DEPTH(type);
    Size size = src[0].size();

-    size_t srcsize = src.size();
-    for (size_t i = 0; i < srcsize; ++i)
+    for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
    {
-        int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype);
-        if (src[i].dims > 2 || icn != 1)
+        int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype),
+                esz1 = CV_ELEM_SIZE1(idepth);
+        if (src[i].dims > 2)
            return false;
+
        CV_Assert(size == src[i].size() && depth == idepth);
+
+        for (int cn = 0; cn < icn; ++cn)
+        {
+            UMat tsrc = src[i];
+            tsrc.offset += cn * esz1;
+            ksrc.push_back(tsrc);
+        }
    }
+    int dcn = (int)ksrc.size();

-    String srcargs, srcdecl, processelem;
-    for (size_t i = 0; i < srcsize; ++i)
+    String srcargs, srcdecl, processelem, cndecl;
+    for (int i = 0; i < dcn; ++i)
    {
        srcargs += format("DECLARE_SRC_PARAM(%d)", i);
        srcdecl += format("DECLARE_DATA(%d)", i);
        processelem += format("PROCESS_ELEM(%d)", i);
+        cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
    }

    ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
-                  format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s -D DECLARE_DATA_N=%s -D PROCESS_ELEMS_N=%s",
-                         (int)srcsize, ocl::memopTypeToStr(depth), srcargs.c_str(), srcdecl.c_str(), processelem.c_str()));
+                  format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
+                         " -D DECLARE_DATA_N=%s -D PROCESS_ELEMS_N=%s%s",
+                         dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
+                         srcdecl.c_str(), processelem.c_str(), cndecl.c_str()));
    if (k.empty())
        return false;

-    _dst.create(size, CV_MAKE_TYPE(depth, (int)srcsize));
+    _dst.create(size, CV_MAKE_TYPE(depth, dcn));
    UMat dst = _dst.getUMat();

    int argidx = 0;
-    for (size_t i = 0; i < srcsize; ++i)
-        argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(src[i]));
+    for (int i = 0; i < dcn; ++i)
+        argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
    k.set(argidx, ocl::KernelArg::WriteOnly(dst));

    size_t globalsize[2] = { dst.cols, dst.rows };
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@ -482,9 +482,9 @@ enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS
 static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
 {
    CV_Assert(flipCode >= - 1 && flipCode <= 1);
-    int type = _src.type(), cn = CV_MAT_CN(type), flipType;
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), flipType;

-    if (cn > 4 || cn == 3)
+    if (cn > 4)
        return false;

    const char * kernelName;
@ -506,7 +506,8 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
    }

    ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
-        format( "-D type=%s", ocl::memopTypeToStr(type)));
+        format( "-D T=%s -D T1=%s -D cn=%d", ocl::memopTypeToStr(type),
+                ocl::memopTypeToStr(depth), cn));
    if (k.empty())
        return false;

--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@ -2041,7 +2041,7 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
    const char * const op = issqrt ? "OP_SQRT" : is_ipower ? "OP_POWN" : "OP_POW";

    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D dstT=%s -D %s -D UNARY_OP%s", ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
+                  format("-D dstT=%s -D %s -D UNARY_OP%s", ocl::typeToStr(depth),
                         op, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
    if (k.empty())
        return false;
@ -2081,7 +2081,7 @@ void pow( InputArray _src, double power, OutputArray _dst )
    {
        if( ipower < 0 )
        {
-            divide( 1., _src, _dst );
+            divide( Scalar::all(1), _src, _dst );
            if( ipower == -1 )
                return;
            ipower = -ipower;
@ -2115,10 +2115,7 @@ void pow( InputArray _src, double power, OutputArray _dst )

    Mat src, dst;
    if (same)
-    {
-        dst = _dst.getMat();
-        src = dst;
-    }
+        src = dst = _dst.getMat();
    else
    {
        src = _src.getMat();
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@ -1416,6 +1416,16 @@ int _InputArray::kind() const
    return flags & KIND_MASK;
 }

// Added accessor: returns the height component of size(i), using the same index i
// that size() accepts.
+int _InputArray::rows(int i) const
+{
+    return size(i).height;
+}
+
// Added accessor: returns the width component of size(i), using the same index i
// that size() accepts.
+int _InputArray::cols(int i) const
+{
+    return size(i).width;
+}
+
 Size _InputArray::size(int i) const
 {
    int k = kind();
@ -2078,45 +2088,45 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
    create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
 }

// Creates a 2D array with the given row/column counts and element type mtype in the object
// wrapped by this _OutputArray.
//
// When no element index is given (i < 0), transposition is not allowed and fixedDepthMask is 0,
// the single-object kinds (Mat, UMat, GpuMat, ogl::Buffer, CudaMem) are handled directly below.
// Every other combination is forwarded to the generic create(dims, sizes, ...) overload.
//
// Every '-'/'+' pair in this hunk only renames the parameters rows/cols to _rows/_cols; the
// logic is unchanged.
// NOTE(review): presumably the rename avoids a clash with the rows()/cols() accessors added to
// _InputArray in the same commit -- confirm.
-void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransposed, int fixedDepthMask) const
+void _OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTransposed, int fixedDepthMask) const
 {
    int k = kind();
    // Mat destination: a fixed-size/fixed-type wrapper must already match the request,
    // then Mat::create is called on the wrapped object.
    if( k == MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((Mat*)obj)->size.operator()() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((Mat*)obj)->size.operator()() == Size(_cols, _rows));
        CV_Assert(!fixedType() || ((Mat*)obj)->type() == mtype);
-        ((Mat*)obj)->create(rows, cols, mtype);
+        ((Mat*)obj)->create(_rows, _cols, mtype);
        return;
    }
    // UMat destination: same checks, delegated to UMat::create.
    if( k == UMAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((UMat*)obj)->size.operator()() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((UMat*)obj)->size.operator()() == Size(_cols, _rows));
        CV_Assert(!fixedType() || ((UMat*)obj)->type() == mtype);
-        ((UMat*)obj)->create(rows, cols, mtype);
+        ((UMat*)obj)->create(_rows, _cols, mtype);
        return;
    }
    // cuda::GpuMat destination: same checks, delegated to GpuMat::create.
    if( k == GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(_cols, _rows));
        CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
-        ((cuda::GpuMat*)obj)->create(rows, cols, mtype);
+        ((cuda::GpuMat*)obj)->create(_rows, _cols, mtype);
        return;
    }
    // ogl::Buffer destination: same checks, delegated to ogl::Buffer::create.
    if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(_cols, _rows));
        CV_Assert(!fixedType() || ((ogl::Buffer*)obj)->type() == mtype);
-        ((ogl::Buffer*)obj)->create(rows, cols, mtype);
+        ((ogl::Buffer*)obj)->create(_rows, _cols, mtype);
        return;
    }
    // cuda::CudaMem destination: same checks, delegated to CudaMem::create.
    if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(_cols, _rows));
        CV_Assert(!fixedType() || ((cuda::CudaMem*)obj)->type() == mtype);
-        ((cuda::CudaMem*)obj)->create(rows, cols, mtype);
+        ((cuda::CudaMem*)obj)->create(_rows, _cols, mtype);
        return;
    }
    // General case: pack the two extents into a sizes array and use the n-dimensional overload.
-    int sizes[] = {rows, cols};
+    int sizes[] = {_rows, _cols};
    create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
 }

@ -2679,17 +2689,17 @@ namespace cv {

 static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
 {
-    int type = _m.type(), cn = CV_MAT_CN(type);
-    if (cn == 3)
-        return false;
+    int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+            sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn);

    ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
-                  format("-D T=%s", ocl::memopTypeToStr(type)));
+                  format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type),
+                         ocl::memopTypeToStr(depth), cn, ocl::memopTypeToStr(sctype)));
    if (k.empty())
        return false;

    UMat m = _m.getUMat();
-    k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, type, s)));
+    k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s)));

    size_t globalsize[2] = { m.cols, m.rows };
    return k.run(2, globalsize, NULL, false);
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -1410,7 +1410,7 @@ bool useOpenCL()
 {
    CoreTLSData* data = coreTlsData.get();
    if( data->useOpenCL < 0 )
-        data->useOpenCL = (int)haveOpenCL();
+        data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() != NULL;
    return data->useOpenCL > 0;
 }

@ -1419,7 +1419,7 @@ void setUseOpenCL(bool flag)
    if( haveOpenCL() )
    {
        CoreTLSData* data = coreTlsData.get();
-        data->useOpenCL = flag ? 1 : 0;
+        data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
    }
 }

@ -2179,7 +2179,6 @@ static cl_device_id selectOpenCLDevice()
            goto not_found;
        }
    }
-
    if (deviceTypes.size() == 0)
    {
        if (!isID)
@ -2193,13 +2192,16 @@ static cl_device_id selectOpenCLDevice()
    for (size_t t = 0; t < deviceTypes.size(); t++)
    {
        int deviceType = 0;
-        if (deviceTypes[t] == "GPU")
+        std::string tempStrDeviceType = deviceTypes[t];
+        std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower );
+
+        if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
            deviceType = Device::TYPE_GPU;
-        else if (deviceTypes[t] == "CPU")
+        else if (tempStrDeviceType == "cpu")
            deviceType = Device::TYPE_CPU;
-        else if (deviceTypes[t] == "ACCELERATOR")
+        else if (tempStrDeviceType == "accelerator")
            deviceType = Device::TYPE_ACCELERATOR;
-        else if (deviceTypes[t] == "ALL")
+        else if (tempStrDeviceType == "all")
            deviceType = Device::TYPE_ALL;
        else
        {
@ -2229,7 +2231,14 @@ static cl_device_id selectOpenCLDevice()
        {
            std::string name;
            CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
-            if (isID || name.find(deviceName) != std::string::npos)
+            cl_bool useGPU = true;
+            if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
+            {
+                cl_bool isIGPU = CL_FALSE;
+                clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
+                useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
+            }
+            if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
            {
                // TODO check for OpenCL 1.1
                return devices[i];
@ -2245,6 +2254,7 @@ not_found:
        std::cerr << deviceTypes[t] << " ";

    std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
+    CV_Error(CL_INVALID_DEVICE, "Requested OpenCL device is not found");
    return NULL;
 }

@ -4306,7 +4316,7 @@ static std::string kerToStr(const Mat & k)
    return stream.str();
 }

-String kernelToStr(InputArray _kernel, int ddepth)
+String kernelToStr(InputArray _kernel, int ddepth, const char * name)
 {
    Mat kernel = _kernel.getMat().reshape(1, 1);

@ -4317,13 +4327,13 @@ String kernelToStr(InputArray _kernel, int ddepth)
    if (ddepth != depth)
        kernel.convertTo(kernel, ddepth);

-    typedef std::string (*func_t)(const Mat &);
-    static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>,kerToStr<short>,
+    typedef std::string (* func_t)(const Mat &);
+    static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
                                    kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
    const func_t func = funcs[depth];
    CV_Assert(func != 0);

-    return cv::format(" -D COEFF=%s", func(kernel).c_str());
+    return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
 }

 #define PROCESS_SRC(src) \
@ -4347,7 +4357,7 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
                              InputArray src4, InputArray src5, InputArray src6,
                              InputArray src7, InputArray src8, InputArray src9)
 {
-    int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(depth);
    Size ssize = src1.size();
    const ocl::Device & d = ocl::Device::getDefault();

@ -4371,7 +4381,8 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
    PROCESS_SRC(src9);

    size_t size = offsets.size();
-    std::vector<int> dividers(size, width);
+    int wsz = width * esz;
+    std::vector<int> dividers(size, wsz);

    for (size_t i = 0; i < size; ++i)
        while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % dividers[i] != 0)
@ -4379,7 +4390,7 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,

    // default strategy
    for (size_t i = 0; i < size; ++i)
-        if (dividers[i] != width)
+        if (dividers[i] != wsz)
        {
            width = 1;
            break;
--- a/modules/core/src/opencl/flip.cl
+++ b/modules/core/src/opencl/flip.cl
@ -39,10 +39,18 @@
 //
 //M*/

// Pixel access helpers for the flip kernels. They replace the old single 'sizeoftype' macro.
// T, T1 and cn are expected to be supplied as build-time defines.
-#define sizeoftype ((int)sizeof(type))
// cn != 3: a pixel is read/written with one access through a pointer to T,
// and its byte size is sizeof(T).
+#if cn != 3
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr)  *(__global T *)(addr) = val
+#define TSIZE (int)sizeof(T)
// cn == 3: a pixel is read/written with vload3/vstore3 through a pointer to the element
// type T1, and its byte size is computed as three T1 elements rather than sizeof(T).
+#else
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE ((int)sizeof(T1)*3)
+#endif

-__kernel void arithm_flip_rows(__global const uchar* srcptr, int srcstep, int srcoffset,
-                               __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int src_offset,
+                               __global uchar * dstptr, int dst_step, int dst_offset,
                               int rows, int cols, int thread_rows, int thread_cols)
 {
    int x = get_global_id(0);
@ -50,19 +58,16 @@ __kernel void arithm_flip_rows(__global const uchar* srcptr, int srcstep, int sr

    if (x < cols && y < thread_rows)
    {
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x, sizeoftype, srcoffset)));
+        T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+        T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset)));

-        __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
-        __global type* dst1 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x, sizeoftype, dstoffset)));
-
-        dst0[0] = src1[0];
-        dst1[0] = src0[0];
+        storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+        storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset)));
    }
 }

-__kernel void arithm_flip_rows_cols(__global const uchar* srcptr, int srcstep, int srcoffset,
-                                    __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step, int src_offset,
+                                    __global uchar * dstptr, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int thread_cols)
 {
    int x = get_global_id(0);
@ -71,19 +76,16 @@ __kernel void arithm_flip_rows_cols(__global const uchar* srcptr, int srcstep, i
    if (x < cols && y < thread_rows)
    {
        int x1 = cols - x - 1;
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x1, sizeoftype, srcoffset)));
-
-        __global type* dst0 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x1, sizeoftype, dstoffset)));
-        __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
+        T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+        T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset)));

-        dst0[0] = src0[0];
-        dst1[0] = src1[0];
+        storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset)));
+        storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
    }
 }

-__kernel void arithm_flip_cols(__global const uchar* srcptr, int srcstep, int srcoffset,
-                               __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int src_offset,
+                               __global uchar * dstptr, int dst_step, int dst_offset,
                               int rows, int cols, int thread_rows, int thread_cols)
 {
    int x = get_global_id(0);
@ -92,13 +94,10 @@ __kernel void arithm_flip_cols(__global const uchar* srcptr, int srcstep, int sr
    if (x < thread_cols && y < rows)
    {
        int x1 = cols - x - 1;
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x1, sizeoftype, srcoffset)));
-
-        __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x1, sizeoftype, dstoffset)));
-        __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
+        T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+        T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset)));

-        dst1[0] = src1[0];
-        dst0[0] = src0[0];
+        storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset)));
+        storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
    }
 }
--- a/modules/core/src/opencl/set_identity.cl
+++ b/modules/core/src/opencl/set_identity.cl
@ -43,17 +43,28 @@
 //
 //M*/

// Pixel access helpers. T, T1, cn and ST are expected to be supplied as build-time defines.
// cn != 3: pixels are accessed through a pointer to T and the kernel argument scalar_ is
// used as-is.
+#if cn != 3
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr)  *(__global T *)(addr) = val
+#define TSIZE (int)sizeof(T)
+#define scalar scalar_
// cn == 3: pixels are accessed with vload3/vstore3 over T1 elements, the pixel size is three
// T1 elements, and the value to store is built from the x, y and z components of scalar_.
+#else
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE ((int)sizeof(T1)*3)
+#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
+#endif
+
// Writes 'scalar' to every element on the main diagonal (x == y) and zero everywhere else.
// One work-item handles one element: x is global id 0, y is global id 1.
// srcptr/src_step/src_offset describe the destination buffer in bytes; rows/cols bound the
// work-items that actually write.
// The scalar argument type changes from T to ST in this hunk (parameter renamed to scalar_);
// the 'scalar' macro above maps it back to a value of type T.
 __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
-                          T scalar)
+                          ST scalar_)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Work-items outside the matrix do nothing.
    if (x < cols && y < rows)
    {
-        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
-        __global T * src = (__global T *)(srcptr + src_index);
        // Byte offset of element (y, x): y * src_step + x * TSIZE + src_offset.
+        int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));

-        src[0] = x == y ? scalar : (T)(0);
+        storepix(x == y ? scalar : (T)(0), srcptr + src_index);
    }
 }
--- a/modules/core/src/opencl/split_merge.cl
+++ b/modules/core/src/opencl/split_merge.cl
@ -45,7 +45,7 @@

 #define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset,
 #define DECLARE_DATA(index) __global const T * src##index = \
-    (__global T *)(src##index##ptr + mad24(src##index##_step, y, mad24(x, (int)sizeof(T), src##index##_offset)));
+    (__global T *)(src##index##ptr + mad24(src##index##_step, y, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset)));
 #define PROCESS_ELEM(index) dst[index] = src##index[0];

 __kernel void merge(DECLARE_SRC_PARAMS_N
--- a/modules/core/src/persistence.cpp
+++ b/modules/core/src/persistence.cpp
@ -4824,7 +4824,7 @@ cvRegisterType( const CvTypeInfo* _info )
            "Type name should contain only letters, digits, - and _" );
    }

-    info = (CvTypeInfo*)malloc( sizeof(*info) + len + 1 );
+    info = (CvTypeInfo*)cvAlloc( sizeof(*info) + len + 1 );

    *info = *_info;
    info->type_name = (char*)(info + 1);
@ -4862,7 +4862,7 @@ cvUnregisterType( const char* type_name )
        if( !CvType::first || !CvType::last )
            CvType::first = CvType::last = 0;

-        free( info );
+        cvFree( &info );
    }
 }

@ -5486,11 +5486,27 @@ internal::WriteStructContext::WriteStructContext(FileStorage& _fs,
 {
    cvStartWriteStruct(**fs, !name.empty() ? name.c_str() : 0, flags,
                       !typeName.empty() ? typeName.c_str() : 0);
+    fs->elname = String();
+    if ((flags & FileNode::TYPE_MASK) == FileNode::SEQ)
+    {
+        fs->state = FileStorage::VALUE_EXPECTED;
+        fs->structs.push_back('[');
+    }
+    else
+    {
+        fs->state = FileStorage::NAME_EXPECTED + FileStorage::INSIDE_MAP;
+        fs->structs.push_back('{');
+    }
 }

 // Closes the structure opened for this context and, with the added ('+') lines, also
 // restores the FileStorage writer state to that of the enclosing structure.
 internal::WriteStructContext::~WriteStructContext()
 {
    cvEndWriteStruct(**fs);
    // Drop the bracket recorded for the structure that has just been closed.
+    fs->structs.pop_back();
    // If no structure remains open, or the enclosing one is a map ('{'), a name is expected
    // next; otherwise a value is expected next.
+    fs->state = fs->structs.empty() || fs->structs.back() == '{' ?
+        FileStorage::NAME_EXPECTED + FileStorage::INSIDE_MAP :
+        FileStorage::VALUE_EXPECTED;
    // Reset the pending element name to an empty string.
+    fs->elname = String();
 }


--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@ -414,24 +414,23 @@ const String& getBuildInformation()

 // printf-style formatting that returns the result as a String.
 // The old ('-') code formatted into a fixed char[1024] and, on overflow, formatted a second
 // time directly into a String's storage. The new ('+') code formats into an
 // AutoBuffer<char, 1024> and retries with a larger buffer until the output fits.
 String format( const char* fmt, ... )
 {
-    char buf[1024];
+    AutoBuffer<char, 1024> buf;

-    va_list va;
-    va_start(va, fmt);
-    int len = vsnprintf(buf, sizeof(buf), fmt, va);
-    va_end(va);
-
-    if (len >= (int)sizeof(buf))
+    for ( ; ; )
    {
-        String s(len, '\0');
        // The argument list is started and ended afresh on every attempt.
+        va_list va;
        va_start(va, fmt);
-        len = vsnprintf((char*)s.c_str(), len + 1, fmt, va);
-        (void)len;
+        int bsize = static_cast<int>(buf.size()),
+                len = vsnprintf((char *)buf, bsize, fmt, va);
        va_end(va);
-        return s;
-    }

-    return String(buf, len);
        // A negative result, or one that does not fit in the buffer, triggers a retry:
        // grow to the larger of double the current size and len + 1.
+        if (len < 0 || len >= bsize)
+        {
+            buf.resize(std::max(bsize << 1, len + 1));
+            continue;
+        }
        // The output fitted: copy the first len characters into the result.
+        return String((char *)buf, len);
+    }
 }

 String tempfile( const char* suffix )
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@ -88,8 +88,10 @@ void UMatData::unlock()

 // Selects the allocator for UMat data.
 // After this change the OpenCL allocator is returned only when the build defines HAVE_OPENCL
 // and both ocl::haveOpenCL() and ocl::useOpenCL() are true. In every other case the regular
 // Mat allocator is returned.
 MatAllocator* UMat::getStdAllocator()
 {
-    if( ocl::haveOpenCL() )
+#ifdef HAVE_OPENCL
+    if( ocl::haveOpenCL() && ocl::useOpenCL() )
        return ocl::getOpenCLAllocator();
+#endif
    return Mat::getStdAllocator();
 }

@ -665,7 +667,7 @@ void UMat::copyTo(OutputArray _dst, InputArray _mask) const
        copyTo(_dst);
        return;
    }
-
+#ifdef HAVE_OPENCL
    int cn = channels(), mtype = _mask.type(), mdepth = CV_MAT_DEPTH(mtype), mcn = CV_MAT_CN(mtype);
    CV_Assert( mdepth == CV_8U && (mcn == 1 || mcn == cn) );

@ -692,7 +694,7 @@ void UMat::copyTo(OutputArray _dst, InputArray _mask) const
                return;
        }
    }
-
+#endif
    Mat src = getMat(ACCESS_READ);
    src.copyTo(_dst, _mask);
 }
@ -713,7 +715,7 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
        copyTo(_dst);
        return;
    }
-
+#ifdef HAVE_OPENCL
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
    if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() &&
@ -748,7 +750,7 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
                return;
        }
    }
-
+#endif
    Mat m = getMat(ACCESS_READ);
    m.convertTo(_dst, _type, alpha, beta);
 }
@ -756,7 +758,9 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
 UMat& UMat::setTo(InputArray _value, InputArray _mask)
 {
    bool haveMask = !_mask.empty();
+#ifdef HAVE_OPENCL
    int tp = type(), cn = CV_MAT_CN(tp);
+
    if( dims <= 2 && cn <= 4 && CV_MAT_DEPTH(tp) < CV_64F && ocl::useOpenCL() )
    {
        Mat value = _value.getMat();
@ -795,6 +799,7 @@ UMat& UMat::setTo(InputArray _value, InputArray _mask)
                return *this;
        }
    }
+#endif
    Mat m = getMat(haveMask ? ACCESS_RW : ACCESS_WRITE);
    m.setTo(_value, _mask);
    return *this;
--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@ -57,9 +57,9 @@ PARAM_TEST_CASE(Lut, MatDepth, MatDepth, Channels, bool, bool)
    int cn;
    bool use_roi, same_cn;

-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_INPUT_PARAMETER(lut)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_INPUT_PARAMETER(lut);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);

    virtual void SetUp()
    {
@ -87,14 +87,14 @@ PARAM_TEST_CASE(Lut, MatDepth, MatDepth, Channels, bool, bool)
        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(dst, dst_roi, roiSize, dstBorder, dst_type, 5, 16);

-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_INPUT_PARAMETER(lut)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_INPUT_PARAMETER(lut);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
    }

    void Near(double threshold = 0.)
    {
-        OCL_EXPECT_MATS_NEAR(dst, threshold)
+        OCL_EXPECT_MATS_NEAR(dst, threshold);
    }
 };

@ -121,11 +121,11 @@ PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool)
    cv::Scalar val;
    cv::Scalar val_in_range;

-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_INPUT_PARAMETER(mask)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst1)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst2)
+    TEST_DECLARE_INPUT_PARAMETER(src1);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_INPUT_PARAMETER(mask);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst1);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst2);

    virtual void SetUp()
    {
@ -167,21 +167,21 @@ PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool)
                                      rng.uniform(minV, maxV), rng.uniform(minV, maxV));
        }

-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_INPUT_PARAMETER(mask)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_INPUT_PARAMETER(mask);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2);
    }

    void Near(double threshold = 0.)
    {
-        OCL_EXPECT_MATS_NEAR(dst1, threshold)
+        OCL_EXPECT_MATS_NEAR(dst1, threshold);
    }

    void Near1(double threshold = 0.)
    {
-        OCL_EXPECT_MATS_NEAR(dst2, threshold)
+        OCL_EXPECT_MATS_NEAR(dst2, threshold);
    }
 };

@ -556,6 +556,12 @@ OCL_TEST_P(Transpose, Mat)
    {
        generateTestData();

+        Size roiSize = src1_roi.size();
+        Border dst1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst1, dst1_roi, Size(roiSize.height, roiSize.width), dst1Border, src1.type(), 5, 16);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(dst1);
+
        OCL_OFF(cv::transpose(src1_roi, dst1_roi));
        OCL_ON(cv::transpose(usrc1_roi, udst1_roi));

@ -580,7 +586,7 @@ OCL_TEST_P(Transpose, SquareInplace)
        OCL_OFF(cv::transpose(src1_roi, src1_roi));
        OCL_ON(cv::transpose(usrc1_roi, usrc1_roi));

-        OCL_EXPECT_MATS_NEAR(src1, 0)
+        OCL_EXPECT_MATS_NEAR(src1, 0);
    }
 }

@ -761,7 +767,7 @@ OCL_TEST_P(Bitwise_not, Mat)
 typedef ArithmTestBase Compare;

 static const int cmp_codes[] = { CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE };
-static const char* cmp_strs[] = { "CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE" };
+static const char * cmp_strs[] = { "CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE" };
 static const int cmp_num = sizeof(cmp_codes) / sizeof(int);

 OCL_TEST_P(Compare, Mat)
@ -826,12 +832,14 @@ OCL_TEST_P(Pow, Mat)
    for (int j = 0; j < test_loop_times; j++)
        for (int k = 0, size = sizeof(pows) / sizeof(double); k < size; ++k)
        {
+            SCOPED_TRACE(pows[k]);
+
            generateTestData();

            OCL_OFF(cv::pow(src1_roi, pows[k], dst1_roi));
            OCL_ON(cv::pow(usrc1_roi, pows[k], udst1_roi));

-            Near(1); // FIXIT: Relative error check!
+            OCL_EXPECT_MATS_NEAR_RELATIVE(dst1, 1e-5);
        }
 }

@ -893,8 +901,8 @@ struct RepeatTestCase :
        Border dst1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(dst1, dst1_roi, dstRoiSize, dst1Border, type, 5, 16);

-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1);
    }
 };

@ -1450,10 +1458,10 @@ PARAM_TEST_CASE(InRange, MatDepth, Channels, bool /*Scalar or not*/, bool /*Roi*
    bool scalars, use_roi;
    cv::Scalar val1, val2;

-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_INPUT_PARAMETER(src3)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src1);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_INPUT_PARAMETER(src3);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);

    virtual void SetUp()
    {
@ -1485,15 +1493,15 @@ PARAM_TEST_CASE(InRange, MatDepth, Channels, bool /*Scalar or not*/, bool /*Roi*
        val2 = cv::Scalar(rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0),
                          rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0));

-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_INPUT_PARAMETER(src3)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_INPUT_PARAMETER(src3);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
    }

    void Near()
    {
-        OCL_EXPECT_MATS_NEAR(dst, 0)
+        OCL_EXPECT_MATS_NEAR(dst, 0);
    }
 };

@ -1565,7 +1573,7 @@ PARAM_TEST_CASE(PatchNaNs, Channels, bool)
    bool use_roi;
    double value;

-    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_INPUT_PARAMETER(src);

    virtual void SetUp()
    {
@ -1592,12 +1600,12 @@ PARAM_TEST_CASE(PatchNaNs, Channels, bool)

        value = randomDouble(-100, 100);

-        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
    }

    void Near()
    {
-        OCL_EXPECT_MATS_NEAR(src, 0)
+        OCL_EXPECT_MATS_NEAR(src, 0);
    }
 };

@ -1640,8 +1648,8 @@ PARAM_TEST_CASE(Reduce, std::pair<MatDepth, MatDepth>, Channels, int, bool)
    int sdepth, ddepth, cn, dim, dtype;
    bool use_roi;

-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);

    virtual void SetUp()
    {
@ -1666,8 +1674,8 @@ PARAM_TEST_CASE(Reduce, std::pair<MatDepth, MatDepth>, Channels, int, bool)
        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(dst, dst_roi, dstRoiSize, dstBorder, dtype, 5, 16);

-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
    }
 };

@ -1683,7 +1691,7 @@ OCL_TEST_P(ReduceSum, Mat)
        OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_SUM, dtype));

        double eps = ddepth <= CV_32S ? 1 : 1e-4;
-        OCL_EXPECT_MATS_NEAR(dst, eps)
+        OCL_EXPECT_MATS_NEAR(dst, eps);
    }
 }

@ -1698,7 +1706,7 @@ OCL_TEST_P(ReduceMax, Mat)
        OCL_OFF(cv::reduce(src_roi, dst_roi, dim, CV_REDUCE_MAX, dtype));
        OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_MAX, dtype));

-        OCL_EXPECT_MATS_NEAR(dst, 0)
+        OCL_EXPECT_MATS_NEAR(dst, 0);
    }
 }

@ -1713,7 +1721,7 @@ OCL_TEST_P(ReduceMin, Mat)
        OCL_OFF(cv::reduce(src_roi, dst_roi, dim, CV_REDUCE_MIN, dtype));
        OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_MIN, dtype));

-        OCL_EXPECT_MATS_NEAR(dst, 0)
+        OCL_EXPECT_MATS_NEAR(dst, 0);
    }
 }

@ -1729,7 +1737,7 @@ OCL_TEST_P(ReduceAvg, Mat)
        OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_AVG, dtype));

        double eps = ddepth <= CV_32S ? 1 : 5e-6;
-        OCL_EXPECT_MATS_NEAR(dst, eps)
+        OCL_EXPECT_MATS_NEAR(dst, eps);
    }
 }

--- a/modules/core/test/ocl/test_channels.cpp
+++ b/modules/core/test/ocl/test_channels.cpp
@ -54,16 +54,16 @@ namespace ocl {

 //////////////////////////////////////// Merge ///////////////////////////////////////////////

-PARAM_TEST_CASE(Merge, MatDepth, Channels, bool)
+PARAM_TEST_CASE(Merge, MatDepth, int, bool)
 {
-    int depth, cn;
+    int depth, nsrc;
    bool use_roi;

-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_INPUT_PARAMETER(src3)
-    TEST_DECLARE_INPUT_PARAMETER(src4)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src1);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_INPUT_PARAMETER(src3);
+    TEST_DECLARE_INPUT_PARAMETER(src4);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);

    std::vector<Mat> src_roi;
    std::vector<UMat> usrc_roi;
@ -71,10 +71,15 @@ PARAM_TEST_CASE(Merge, MatDepth, Channels, bool)
    virtual void SetUp()
    {
        depth = GET_PARAM(0);
-        cn = GET_PARAM(1);
+        nsrc = GET_PARAM(1);
        use_roi = GET_PARAM(2);

-        CV_Assert(cn >= 1 && cn <= 4);
+        CV_Assert(nsrc >= 1 && nsrc <= 4);
+    }
+
+    int type() // builds a Mat type from the fixed test depth and a channel count obtained from randomInt(1, 3)
+    {
+        return CV_MAKE_TYPE(depth, randomInt(1, 3)); // evaluated on every call, so each source matrix may end up with its own channel count
    }

    void generateTestData()
@ -83,34 +88,39 @@ PARAM_TEST_CASE(Merge, MatDepth, Channels, bool)

        {
            Border src1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src1, src1_roi, roiSize, src1Border, depth, 2, 11);
+            randomSubMat(src1, src1_roi, roiSize, src1Border, type(), 2, 11);

            Border src2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src2, src2_roi, roiSize, src2Border, depth, -1540, 1740);
+            randomSubMat(src2, src2_roi, roiSize, src2Border, type(), -1540, 1740);

            Border src3Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src3, src3_roi, roiSize, src3Border, depth, -1540, 1740);
+            randomSubMat(src3, src3_roi, roiSize, src3Border, type(), -1540, 1740);

            Border src4Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src4, src4_roi, roiSize, src4Border, depth, -1540, 1740);
+            randomSubMat(src4, src4_roi, roiSize, src4Border, type(), -1540, 1740);
        }

-        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_MAKE_TYPE(depth, cn), 5, 16);
-
-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_INPUT_PARAMETER(src3)
-        UMAT_UPLOAD_INPUT_PARAMETER(src4)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_INPUT_PARAMETER(src3);
+        UMAT_UPLOAD_INPUT_PARAMETER(src4);

        src_roi.push_back(src1_roi), usrc_roi.push_back(usrc1_roi);
-        if (cn >= 2)
+        if (nsrc >= 2)
            src_roi.push_back(src2_roi), usrc_roi.push_back(usrc2_roi);
-        if (cn >= 3)
+        if (nsrc >= 3)
            src_roi.push_back(src3_roi), usrc_roi.push_back(usrc3_roi);
-        if (cn >= 4)
+        if (nsrc >= 4)
            src_roi.push_back(src4_roi), usrc_roi.push_back(usrc4_roi);
+
+        int dcn = 0;
+        for (int i = 0; i < nsrc; ++i)
+            dcn += src_roi[i].channels();
+
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_MAKE_TYPE(depth, dcn), 5, 16);
+
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
    }

    void Near(double threshold = 0.)
@ -139,11 +149,11 @@ PARAM_TEST_CASE(Split, MatType, Channels, bool)
    int depth, cn;
    bool use_roi;

-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst1)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst2)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst3)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst4)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst1);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst2);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst3);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst4);

    std::vector<Mat> dst_roi, dst;
    std::vector<UMat> udst_roi, udst;
@ -177,11 +187,11 @@ PARAM_TEST_CASE(Split, MatType, Channels, bool)
            randomSubMat(dst4, dst4_roi, roiSize, dst4Border, depth, -1540, 1740);
        }

-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst3)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst4)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst3);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst4);

        dst_roi.push_back(dst1_roi), udst_roi.push_back(udst1_roi),
                dst.push_back(dst1), udst.push_back(udst1);
@ -221,14 +231,14 @@ PARAM_TEST_CASE(MixChannels, MatType, bool)
    int depth;
    bool use_roi;

-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_INPUT_PARAMETER(src3)
-    TEST_DECLARE_INPUT_PARAMETER(src4)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst1)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst2)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst3)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst4)
+    TEST_DECLARE_INPUT_PARAMETER(src1);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_INPUT_PARAMETER(src3);
+    TEST_DECLARE_INPUT_PARAMETER(src4);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst1);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst2);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst3);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst4);

    std::vector<Mat> src_roi, dst_roi, dst;
    std::vector<UMat> usrc_roi, udst_roi, udst;
@ -287,15 +297,15 @@ PARAM_TEST_CASE(MixChannels, MatType, bool)
            randomSubMat(dst4, dst4_roi, roiSize, dst4Border, type(), -1540, 1740);
        }

-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_INPUT_PARAMETER(src3)
-        UMAT_UPLOAD_INPUT_PARAMETER(src4)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_INPUT_PARAMETER(src3);
+        UMAT_UPLOAD_INPUT_PARAMETER(src4);

-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst3)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst4)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst3);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst4);

        int nsrc = randomInt(1, 5), ndst = randomInt(1, 5);

@ -360,8 +370,8 @@ PARAM_TEST_CASE(InsertChannel, MatDepth, Channels, bool)
    int depth, cn, coi;
    bool use_roi;

-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);

    virtual void SetUp()
    {
@ -381,8 +391,8 @@ PARAM_TEST_CASE(InsertChannel, MatDepth, Channels, bool)
        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_MAKE_TYPE(depth, cn), 5, 16);

-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
    }
 };

@ -406,8 +416,8 @@ PARAM_TEST_CASE(ExtractChannel, MatDepth, Channels, bool)
    int depth, cn, coi;
    bool use_roi;

-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);

    virtual void SetUp()
    {
@ -427,8 +437,8 @@ PARAM_TEST_CASE(ExtractChannel, MatDepth, Channels, bool)
        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(dst, dst_roi, roiSize, dstBorder, depth, 5, 16);

-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
    }
 };

@ -447,7 +457,7 @@ OCL_TEST_P(ExtractChannel, Accuracy)

 //////////////////////////////////////// Instantiation ///////////////////////////////////////////////

-OCL_INSTANTIATE_TEST_CASE_P(Channels, Merge, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Channels, Merge, Combine(OCL_ALL_DEPTHS, Values(1, 2, 3, 4), Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Channels, Split, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Channels, MixChannels, Combine(OCL_ALL_DEPTHS, Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Channels, InsertChannel, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
--- a/modules/core/test/ocl/test_dft.cpp
+++ b/modules/core/test/ocl/test_dft.cpp
@ -60,8 +60,8 @@ PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
    int	dft_flags, depth;
    bool inplace;

-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);

    virtual void SetUp()
    {
@ -106,9 +106,9 @@ PARAM_TEST_CASE(MulSpectrums, bool, bool)
 {
    bool ccorr, useRoi;

-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src1);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);

    virtual void SetUp()
    {
@ -129,9 +129,9 @@ PARAM_TEST_CASE(MulSpectrums, bool, bool)
        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
        randomSubMat(dst, dst_roi, srcRoiSize, dstBorder, CV_32FC2, 5, 16);

-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
    }
 };

--- a/modules/core/test/ocl/test_gemm.cpp
+++ b/modules/core/test/ocl/test_gemm.cpp
@ -67,10 +67,10 @@ PARAM_TEST_CASE(Gemm,

    double alpha, beta;

-    TEST_DECLARE_INPUT_PARAMETER(A)
-    TEST_DECLARE_INPUT_PARAMETER(B)
-    TEST_DECLARE_INPUT_PARAMETER(C)
-    TEST_DECLARE_OUTPUT_PARAMETER(D)
+    TEST_DECLARE_INPUT_PARAMETER(A);
+    TEST_DECLARE_INPUT_PARAMETER(B);
+    TEST_DECLARE_INPUT_PARAMETER(C);
+    TEST_DECLARE_OUTPUT_PARAMETER(D);

    virtual void SetUp()
    {
@ -119,10 +119,10 @@ PARAM_TEST_CASE(Gemm,
        alpha = randomDouble(-4, 4);
        beta = randomDouble(-4, 4);

-        UMAT_UPLOAD_INPUT_PARAMETER(A)
-        UMAT_UPLOAD_INPUT_PARAMETER(B)
-        UMAT_UPLOAD_INPUT_PARAMETER(C)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(D)
+        UMAT_UPLOAD_INPUT_PARAMETER(A);
+        UMAT_UPLOAD_INPUT_PARAMETER(B);
+        UMAT_UPLOAD_INPUT_PARAMETER(C);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(D);
    }
 };

--- a/modules/core/test/ocl/test_matrix_operation.cpp
+++ b/modules/core/test/ocl/test_matrix_operation.cpp
@ -59,8 +59,8 @@ PARAM_TEST_CASE(ConvertTo, MatDepth, MatDepth, Channels, bool)
    int src_depth, cn, dstType;
    bool use_roi;

-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);

    virtual void SetUp()
    {
@ -80,8 +80,8 @@ PARAM_TEST_CASE(ConvertTo, MatDepth, MatDepth, Channels, bool)
        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(dst, dst_roi, roiSize, dstBorder, dstType, 5, 16);

-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
    }
 };

@ -108,9 +108,9 @@ PARAM_TEST_CASE(CopyTo, MatDepth, Channels, bool, bool)
    int depth, cn;
    bool use_roi, use_mask;

-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_INPUT_PARAMETER(mask)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_INPUT_PARAMETER(mask);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);

    virtual void SetUp()
    {
@ -139,10 +139,10 @@ PARAM_TEST_CASE(CopyTo, MatDepth, Channels, bool, bool)
        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16);

-        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
        if (use_mask)
-            UMAT_UPLOAD_INPUT_PARAMETER(mask)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+            UMAT_UPLOAD_INPUT_PARAMETER(mask);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
    }
 };

@ -169,7 +169,7 @@ OCL_TEST_P(CopyTo, Accuracy)
 }

 OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine(
-                            OCL_ALL_DEPTHS, OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+                                OCL_ALL_DEPTHS, OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));

 OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine(
                                OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool(), Bool()));
--- a/modules/core/test/test_io.cpp
+++ b/modules/core/test/test_io.cpp
@ -380,6 +380,40 @@ TEST(Core_InputOutput, write_read_consistency) { Core_IOTest test; test.safe_run

 extern void testFormatter();

+
+struct UserDefinedType // small user-defined record used to exercise FileStorage I/O of a vector of custom elements
+{
+    int a; // integer member, stored under key "a" by write()
+    float b; // float member, stored under key "b" by write()
+};
+
+static inline bool operator==(const UserDefinedType &x, // member-wise equality; the test relies on it to compare vectors of UserDefinedType
+                              const UserDefinedType &y) {
+    return (x.a == y.a) && (x.b == y.b); // exact comparison of both members, including the float
+}
+
+static inline void write(FileStorage &fs, // streams one UserDefinedType into fs
+                         const String&, // unnamed second parameter: not used by this overload
+                         const UserDefinedType &value)
+{
+    fs << "{:" << "a" << value.a << "b" << value.b << "}"; // key/value pairs "a" and "b" wrapped between "{:" and "}" (presumably an inline mapping — confirm against FileStorage docs)
+}
+
+static inline void read(const FileNode& node, // fills value from node, or from default_value when node is empty
+                        UserDefinedType& value,
+                        const UserDefinedType& default_value
+                          = UserDefinedType()) { // fallback defaults to a value-initialized UserDefinedType
+    if(node.empty())
+    {
+        value = default_value; // nothing stored in the node: use the fallback
+    }
+    else
+    {
+        node["a"] >> value.a; // same keys that write() emits
+        node["b"] >> value.b;
+    }
+}
+
 class CV_MiscIOTest : public cvtest::BaseTest
 {
 public:
@ -393,11 +427,14 @@ protected:
            string fname = cv::tempfile(".xml");
            vector<int> mi, mi2, mi3, mi4;
            vector<Mat> mv, mv2, mv3, mv4;
+            vector<UserDefinedType> vudt, vudt2, vudt3, vudt4;
            Mat m(10, 9, CV_32F);
            Mat empty;
+            UserDefinedType udt = { 8, 3.3f };
            randu(m, 0, 1);
            mi3.push_back(5);
            mv3.push_back(m);
+            vudt3.push_back(udt);
            Point_<float> p1(1.1f, 2.2f), op1;
            Point3i p2(3, 4, 5), op2;
            Size s1(6, 7), os1;
@ -412,6 +449,8 @@ protected:
            fs << "mv" << mv;
            fs << "mi3" << mi3;
            fs << "mv3" << mv3;
+            fs << "vudt" << vudt;
+            fs << "vudt3" << vudt3;
            fs << "empty" << empty;
            fs << "p1" << p1;
            fs << "p2" << p2;
@ -428,6 +467,8 @@ protected:
            fs["mv"] >> mv2;
            fs["mi3"] >> mi4;
            fs["mv3"] >> mv4;
+            fs["vudt"] >> vudt2;
+            fs["vudt3"] >> vudt4;
            fs["empty"] >> empty;
            fs["p1"] >> op1;
            fs["p2"] >> op2;
@ -442,6 +483,8 @@ protected:
            CV_Assert( norm(mi3, mi4, CV_C) == 0 );
            CV_Assert( mv4.size() == 1 );
            double n = norm(mv3[0], mv4[0], CV_C);
+            CV_Assert( vudt2.empty() );
+            CV_Assert( vudt3 == vudt4 );
            CV_Assert( n == 0 );
            CV_Assert( op1 == p1 );
            CV_Assert( op2 == p2 );
--- a/modules/core/test/test_umat.cpp
+++ b/modules/core/test/test_umat.cpp
@ -795,4 +795,176 @@ TEST(UMat, ReadBufferRect)
    EXPECT_MAT_NEAR(t, t2, 0);
 }

+// Use iGPU or OPENCV_OPENCL_DEVICE=:CPU: to catch problem
+TEST(UMat, DISABLED_synchronization_map_unmap) // disabled stress test: maps and unmaps one UMat from a parallel_for_ loop body
+{
+    class TestParallelLoopBody : public cv::ParallelLoopBody // loop body that repeatedly maps the UMat for reading and releases the mapping
+    {
+        UMat u_; // copy of the UMat handed to the constructor
+    public:
+        TestParallelLoopBody(const UMat& u) : u_(u) { }
+        void operator() (const cv::Range& range) const
+        {
+            printf("range: %d, %d -- begin\n", range.start, range.end);
+            for (int i = 0; i < 10; i++) // ten map/unmap round trips per invocation
+            {
+                printf("%d: %d map...\n", range.start, i);
+                Mat m = u_.getMat(cv::ACCESS_READ); // obtain a Mat view with read access
+
+                printf("%d: %d unmap...\n", range.start, i);
+                m.release(); // releasing the Mat view is the "unmap" step logged above
+            }
+            printf("range: %d, %d -- end\n", range.start, range.end);
+        }
+    };
+    try
+    {
+        UMat u(1000, 1000, CV_32FC1);
+        parallel_for_(cv::Range(0, 2), TestParallelLoopBody(u)); // execute the body over Range(0, 2)
+    }
+    catch (const cv::Exception& e)
+    {
+        FAIL() << "Exception: " << e.what();
+        // FAIL() records a fatal failure and returns from the test body, so the ADD_FAILURE() that used to follow was unreachable
+    }
+    catch (...)
+    {
+        FAIL() << "Exception!";
+    }
+}
+
 } } // namespace cvtest::ocl
+
+TEST(UMat, DISABLED_bug_with_unmap) // disabled repro: releases a UMat and the Mat it came from right after starting add() on it
+{
+    for (int i = 0; i < 20; i++) // repeat the scenario 20 times
+    {
+        try
+        {
+            Mat m = Mat(1000, 1000, CV_8UC1);
+            UMat u = m.getUMat(ACCESS_READ); // UMat obtained from the Mat with read access
+            UMat dst;
+            add(u, Scalar::all(0), dst); // start async operation
+            u.release(); // release the UMat first, then the Mat it was obtained from
+            m.release();
+        }
+        catch (const cv::Exception& e)
+        {
+            printf("i = %d... %s\n", i, e.what());
+            ADD_FAILURE(); // record the failure; the loop goes on to the next iteration
+        }
+        catch (...)
+        {
+            printf("i = %d...\n", i);
+            ADD_FAILURE();
+        }
+    }
+}
+
+TEST(UMat, DISABLED_bug_with_unmap_in_class) // disabled repro: a UMat member outlives the local Mat it was obtained from
+{
+    class Logic // helper that keeps a UMat taken from a short-lived local Mat
+    {
+    public:
+        Logic() {}
+        void processData(InputArray input)
+        {
+            Mat m = input.getMat();
+            {
+                Mat dst;
+                m.convertTo(dst, CV_32FC1);
+                // some additional CPU-based per-pixel processing into dst
+                intermediateResult = dst.getUMat(ACCESS_READ); // kept in a member, so it is still alive after dst goes out of scope
+                std::cout << "data processed..." << std::endl;
+            } // problem is here: dst::~Mat()
+            std::cout << "leave ProcessData()" << std::endl;
+        }
+        UMat getResult() const { return intermediateResult; }
+    protected:
+        UMat intermediateResult; // UMat obtained from the local dst inside processData()
+    };
+    try
+    {
+        Mat m = Mat(1000, 1000, CV_8UC1);
+        Logic l;
+        l.processData(m);
+        UMat result = l.getResult(); // fetch the stored UMat after processData() has returned
+    }
+    catch (const cv::Exception& e)
+    {
+        printf("exception... %s\n", e.what());
+        ADD_FAILURE();
+    }
+    catch (...)
+    {
+        printf("exception... \n");
+        ADD_FAILURE();
+    }
+}
+
+TEST(UMat, Test_same_behaviour_read_and_read) // a read mapping combined with a read-only use of the same UMat must not throw
+{
+    bool exceptionDetected = false;
+    try
+    {
+        UMat u(Size(10, 10), CV_8UC1);
+        Mat m = u.getMat(ACCESS_READ); // m stays alive (u mapped for reading) while add() runs
+        UMat dst;
+        add(u, Scalar::all(1), dst); // u is only an input here; the result goes to a separate dst
+    }
+    catch (...)
+    {
+        exceptionDetected = true;
+    }
+    ASSERT_FALSE(exceptionDetected); // no data race, 2+ reads are valid
+}
+
+// VP: this test (and probably others from same_behaviour series) is not valid in my opinion.
+TEST(UMat, DISABLED_Test_same_behaviour_read_and_write) // disabled: expects an exception when a read-mapped UMat is also used as an output
+{
+    bool exceptionDetected = false;
+    try
+    {
+        UMat u(Size(10, 10), CV_8UC1);
+        Mat m = u.getMat(ACCESS_READ); // m stays alive (u mapped for reading) while add() runs
+        add(u, Scalar::all(1), u); // u is both the input and the destination
+    }
+    catch (...)
+    {
+        exceptionDetected = true;
+    }
+    ASSERT_TRUE(exceptionDetected); // data race
+}
+
+TEST(UMat, DISABLED_Test_same_behaviour_write_and_read) // disabled: expects an exception when a write-mapped UMat is read by add()
+{
+    bool exceptionDetected = false;
+    try
+    {
+        UMat u(Size(10, 10), CV_8UC1);
+        Mat m = u.getMat(ACCESS_WRITE); // m stays alive (u mapped for writing) while add() runs
+        UMat dst;
+        add(u, Scalar::all(1), dst); // u is only an input here; the result goes to a separate dst
+    }
+    catch (...)
+    {
+        exceptionDetected = true;
+    }
+    ASSERT_TRUE(exceptionDetected); // data race
+}
+
+TEST(UMat, DISABLED_Test_same_behaviour_write_and_write) // disabled: expects an exception when a write-mapped UMat is also used as an output
+{
+    bool exceptionDetected = false;
+    try
+    {
+        UMat u(Size(10, 10), CV_8UC1);
+        Mat m = u.getMat(ACCESS_WRITE); // m stays alive (u mapped for writing) while add() runs
+        add(u, Scalar::all(1), u); // u is both the input and the destination
+    }
+    catch (...)
+    {
+        exceptionDetected = true;
+    }
+    ASSERT_TRUE(exceptionDetected); // data race
+}
--- a/modules/cudabgsegm/src/cuda/mog2.cu
+++ b/modules/cudabgsegm/src/cuda/mog2.cu
@ -163,7 +163,7 @@ namespace cv { namespace cuda { namespace device
            {
                //need only weight if fit is found
                float weight = alpha1 * gmm_weight(mode * frame.rows + y, x) + prune;
-
+                int swap_count = 0;
                //fit not found yet
                if (!fitsPDF)
                {
@ -214,6 +214,7 @@ namespace cv { namespace cuda { namespace device
                            if (weight < gmm_weight((i - 1) * frame.rows + y, x))
                                break;

+                            swap_count++;
                            //swap one up
                            swap(gmm_weight, x, y, i - 1, frame.rows);
                            swap(gmm_variance, x, y, i - 1, frame.rows);
@ -231,7 +232,7 @@ namespace cv { namespace cuda { namespace device
                    nmodes--;
                }

-                gmm_weight(mode * frame.rows + y, x) = weight; //update weight by the calculated value
+                gmm_weight((mode - swap_count) * frame.rows + y, x) = weight; //update weight by the calculated value
                totalWeight += weight;
            }

--- a/modules/features2d/doc/common_interfaces_of_descriptor_extractors.rst
+++ b/modules/features2d/doc/common_interfaces_of_descriptor_extractors.rst
@ -69,7 +69,7 @@ Computes the descriptors for a set of keypoints detected in an image (first vari

    :param keypoints: Input collection of keypoints. Keypoints for which a descriptor cannot be computed are removed. Sometimes new keypoints can be added, for example: ``SIFT`` duplicates keypoint with several dominant orientations (for each orientation).

-    :param descriptors: Computed descriptors. In the second variant of the method ``descriptors[i]`` are descriptors computed for a ``keypoints[i]`. Row ``j`` is the ``keypoints`` (or ``keypoints[i]``) is the descriptor for keypoint ``j``-th keypoint.
+    :param descriptors: Computed descriptors. In the second variant of the method ``descriptors[i]`` are descriptors computed for a ``keypoints[i]``. Row ``j`` in ``descriptors`` (or ``descriptors[i]``) is the descriptor for the ``j``-th keypoint in ``keypoints`` (or ``keypoints[i]``).


 DescriptorExtractor::create
--- a/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
+++ b/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
@ -249,7 +249,7 @@ Brute-force matcher constructor.

    :param normType: One of ``NORM_L1``, ``NORM_L2``, ``NORM_HAMMING``, ``NORM_HAMMING2``. ``L1`` and ``L2`` norms are preferable choices for SIFT and SURF descriptors, ``NORM_HAMMING`` should be used with ORB, BRISK and BRIEF, ``NORM_HAMMING2`` should be used with ORB when ``WTA_K==3`` or ``4`` (see ORB::ORB constructor description).

-    :param crossCheck: If it is false, this is will be default BFMatcher behaviour when it finds the k nearest neighbors for each query descriptor. If ``crossCheck==true``, then the ``knnMatch()`` method with ``k=1`` will only return pairs ``(i,j)`` such that for ``i-th`` query descriptor the ``j-th`` descriptor in the matcher's collection is the nearest and vice versa, i.e. the ``BFMathcher`` will only return consistent pairs. Such technique usually produces best results with minimal number of outliers when there are enough matches. This is alternative to the ratio test, used by D. Lowe in SIFT paper.
+    :param crossCheck: If it is false, this will be the default BFMatcher behaviour when it finds the k nearest neighbors for each query descriptor. If ``crossCheck==true``, then the ``knnMatch()`` method with ``k=1`` will only return pairs ``(i,j)`` such that for ``i-th`` query descriptor the ``j-th`` descriptor in the matcher's collection is the nearest and vice versa, i.e. the ``BFMatcher`` will only return consistent pairs. Such a technique usually produces the best results with a minimal number of outliers when there are enough matches. This is an alternative to the ratio test, used by D. Lowe in the SIFT paper.


 FlannBasedMatcher
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@ -616,14 +616,14 @@ protected:
 };


-class CV_EXPORTS DenseFeatureDetector : public FeatureDetector
+class CV_EXPORTS_W DenseFeatureDetector : public FeatureDetector
 {
 public:
-    explicit DenseFeatureDetector( float initFeatureScale=1.f, int featureScaleLevels=1,
-                                   float featureScaleMul=0.1f,
-                                   int initXyStep=6, int initImgBound=0,
-                                   bool varyXyStepWithScale=true,
-                                   bool varyImgBoundWithScale=false );
+    CV_WRAP explicit DenseFeatureDetector( float initFeatureScale=1.f, int featureScaleLevels=1,
+                                           float featureScaleMul=0.1f,
+                                           int initXyStep=6, int initImgBound=0,
+                                           bool varyXyStepWithScale=true,
+                                           bool varyImgBoundWithScale=false );
    AlgorithmInfo* info() const;

 protected:
--- a/modules/features2d/perf/opencl/perf_brute_force_matcher.cpp
+++ b/modules/features2d/perf/opencl/perf_brute_force_matcher.cpp
@ -123,7 +123,7 @@ OCL_PERF_TEST_P(BruteForceMatcherFixture, RadiusMatch, ::testing::Combine(OCL_PE
    SANITY_CHECK_MATCHES(matches1, 1e-3);
 }

-}//ocl
-}//cvtest
+} // ocl
+} // cvtest

-#endif //HAVE_OPENCL
+#endif // HAVE_OPENCL
--- a/modules/features2d/perf/opencl/perf_fast.cpp
+++ b/modules/features2d/perf/opencl/perf_fast.cpp
@ -0,0 +1,50 @@
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+enum { TYPE_5_8 =FastFeatureDetector::TYPE_5_8, TYPE_7_12 = FastFeatureDetector::TYPE_7_12, TYPE_9_16 = FastFeatureDetector::TYPE_9_16 };
+CV_ENUM(FastType, TYPE_5_8, TYPE_7_12)
+
+typedef std::tr1::tuple<string, FastType> File_Type_t;
+typedef TestBaseWithParam<File_Type_t> FASTFixture;
+
+#define FAST_IMAGES \
+    "cv/detectors_descriptors_evaluation/images_datasets/leuven/img1.png",\
+    "stitching/a3.png"
+
+OCL_PERF_TEST_P(FASTFixture, FastDetect, testing::Combine( // perf test parameterized over (image path, FAST type)
+                            testing::Values(FAST_IMAGES),
+                            FastType::all() // NOTE(review): FastType is declared above with TYPE_5_8 and TYPE_7_12 only — confirm that leaving out TYPE_9_16 is intended
+                          ))
+{
+    string filename = getDataPath(get<0>(GetParam())); // first tuple element: image path
+    int type = get<1>(GetParam()); // second tuple element: FAST type value
+    Mat mframe = imread(filename, IMREAD_GRAYSCALE);
+
+    if (mframe.empty())
+        FAIL() << "Unable to load source image " << filename;
+
+    UMat frame;
+    mframe.copyTo(frame); // the detector is fed a UMat copy of the loaded image
+    declare.in(frame);
+
+    Ptr<FeatureDetector> fd = Algorithm::create<FeatureDetector>("Feature2D.FAST"); // detector is created by algorithm name
+    ASSERT_FALSE( fd.empty() );
+    fd->set("threshold", 20);
+    fd->set("nonmaxSuppression", true);
+    fd->set("type", type);
+    vector<KeyPoint> points;
+
+    OCL_TEST_CYCLE() fd->detect(frame, points); // statement executed under the test cycle macro
+
+    SANITY_CHECK_KEYPOINTS(points);
+}
+
+} // ocl
+} // cvtest
+
+#endif // HAVE_OPENCL
--- a/modules/features2d/perf/opencl/perf_orb.cpp
+++ b/modules/features2d/perf/opencl/perf_orb.cpp
@ -0,0 +1,86 @@
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+typedef ::perf::TestBaseWithParam<std::string> ORBFixture;
+
+#define ORB_IMAGES OCL_PERF_ENUM("cv/detectors_descriptors_evaluation/images_datasets/leuven/img1.png", "stitching/a3.png")
+
+OCL_PERF_TEST_P(ORBFixture, ORB_Detect, ORB_IMAGES) // perf test: ORB call that outputs keypoints only, on each image in ORB_IMAGES
+{
+    string filename = getDataPath(GetParam());
+    Mat mframe = imread(filename, IMREAD_GRAYSCALE);
+
+    if (mframe.empty())
+        FAIL() << "Unable to load source image " << filename;
+
+    UMat frame, mask; // mask is never filled in this test
+    mframe.copyTo(frame);
+
+    declare.in(frame);
+    ORB detector(1500, 1.3f, 1);
+    vector<KeyPoint> points;
+
+    OCL_TEST_CYCLE() detector(frame, mask, points); // statement executed under the test cycle macro
+
+    std::sort(points.begin(), points.end(), comparators::KeypointGreater()); // sorted before the sanity check, presumably to make the keypoint order deterministic
+    SANITY_CHECK_KEYPOINTS(points, 1e-5);
+}
+
+OCL_PERF_TEST_P(ORBFixture, ORB_Extract, ORB_IMAGES) // perf test: ORB descriptor computation for keypoints detected beforehand
+{
+    string filename = getDataPath(GetParam());
+    Mat mframe = imread(filename, IMREAD_GRAYSCALE);
+
+    if (mframe.empty())
+        FAIL() << "Unable to load source image " << filename;
+
+    UMat mask, frame; // mask is never filled in this test
+    mframe.copyTo(frame);
+
+    declare.in(frame);
+
+    ORB detector(1500, 1.3f, 1);
+    vector<KeyPoint> points;
+    detector(frame, mask, points); // keypoints are produced once, outside the test cycle
+    std::sort(points.begin(), points.end(), comparators::KeypointGreater());
+
+    UMat descriptors;
+
+    OCL_TEST_CYCLE() detector(frame, mask, points, descriptors, true); // last argument true: presumably "use the provided keypoints" — confirm against ORB::operator()
+
+    SANITY_CHECK(descriptors);
+}
+
+OCL_PERF_TEST_P(ORBFixture, ORB_Full, ORB_IMAGES) // perf test: a single ORB call that outputs both keypoints and descriptors
+{
+    string filename = getDataPath(GetParam());
+    Mat mframe = imread(filename, IMREAD_GRAYSCALE);
+
+    if (mframe.empty())
+        FAIL() << "Unable to load source image " << filename;
+
+    UMat mask, frame; // mask is never filled in this test
+    mframe.copyTo(frame);
+
+    declare.in(frame);
+    ORB detector(1500, 1.3f, 1);
+
+    vector<KeyPoint> points;
+    UMat descriptors;
+
+    OCL_TEST_CYCLE() detector(frame, mask, points, descriptors, false); // last argument false: presumably keypoints are detected rather than supplied — confirm against ORB::operator()
+
+    ::perf::sort(points, descriptors); // keypoints and descriptors are sorted together before the sanity checks
+    SANITY_CHECK_KEYPOINTS(points, 1e-5);
+    SANITY_CHECK(descriptors);
+}
+
+} // ocl
+} // cvtest
+
+#endif // HAVE_OPENCL
--- a/modules/flann/include/opencv2/flann/kmeans_index.h
+++ b/modules/flann/include/opencv2/flann/kmeans_index.h
@ -758,10 +758,13 @@ private:

                    for (int k=0; k<indices_length; ++k) {
                        if (belongs_to[k]==j) {
-                            belongs_to[k] = i;
-                            count[j]--;
-                            count[i]++;
-                            break;
+                            // for cluster j, we move the furthest element from the center to the empty cluster i
+                            if ( distance_(dataset_[indices[k]], dcenters[j], veclen_) == radiuses[j] ) {
+                                belongs_to[k] = i;
+                                count[j]--;
+                                count[i]++;
+                                break;
+                            }
                        }
                    }
                    converged = false;
--- a/modules/highgui/doc/reading_and_writing_images_and_video.rst
+++ b/modules/highgui/doc/reading_and_writing_images_and_video.rst
@ -483,7 +483,7 @@ VideoWriter constructors

    :param filename: Name of the output video file.

-    :param fourcc: 4-character code of codec used to compress the frames. For example, ``CV_FOURCC('P','I','M,'1')``  is a MPEG-1 codec, ``CV_FOURCC('M','J','P','G')``  is a motion-jpeg codec etc. List of codes can be obtained at `Video Codecs by FOURCC <http://www.fourcc.org/codecs.php>`_ page.
+    :param fourcc: 4-character code of codec used to compress the frames. For example, ``CV_FOURCC('P','I','M','1')``  is a MPEG-1 codec, ``CV_FOURCC('M','J','P','G')``  is a motion-jpeg codec etc. List of codes can be obtained at `Video Codecs by FOURCC <http://www.fourcc.org/codecs.php>`_ page.

    :param fps: Framerate of the created video stream.

--- a/modules/highgui/include/opencv2/highgui.hpp
+++ b/modules/highgui/include/opencv2/highgui.hpp
@ -215,12 +215,14 @@ enum { IMREAD_UNCHANGED  = -1, // 8bit, color or not
       IMREAD_ANYCOLOR   = 4   // ?, any color
     };

-enum { IMWRITE_JPEG_QUALITY    = 1,
-       IMWRITE_PNG_COMPRESSION = 16,
-       IMWRITE_PNG_STRATEGY    = 17,
-       IMWRITE_PNG_BILEVEL     = 18,
-       IMWRITE_PXM_BINARY      = 32,
-       IMWRITE_WEBP_QUALITY    = 64
+enum { IMWRITE_JPEG_QUALITY     = 1,
+       IMWRITE_JPEG_PROGRESSIVE = 2,
+       IMWRITE_JPEG_OPTIMIZE    = 3,
+       IMWRITE_PNG_COMPRESSION  = 16,
+       IMWRITE_PNG_STRATEGY     = 17,
+       IMWRITE_PNG_BILEVEL      = 18,
+       IMWRITE_PXM_BINARY       = 32,
+       IMWRITE_WEBP_QUALITY     = 64
     };

 enum { IMWRITE_PNG_STRATEGY_DEFAULT      = 0,
--- a/modules/highgui/include/opencv2/highgui/highgui_c.h
+++ b/modules/highgui/include/opencv2/highgui/highgui_c.h
@ -220,6 +220,8 @@ CVAPI(CvMat*) cvLoadImageM( const char* filename, int iscolor CV_DEFAULT(CV_LOAD
 enum
 {
    CV_IMWRITE_JPEG_QUALITY =1,
+    CV_IMWRITE_JPEG_PROGRESSIVE =2,
+    CV_IMWRITE_JPEG_OPTIMIZE =3,
    CV_IMWRITE_PNG_COMPRESSION =16,
    CV_IMWRITE_PNG_STRATEGY =17,
    CV_IMWRITE_PNG_BILEVEL =18,
@ -463,6 +465,8 @@ enum
    CV_CAP_PROP_ANDROID_FOCUS_DISTANCE_NEAR = 8006,
    CV_CAP_PROP_ANDROID_FOCUS_DISTANCE_OPTIMAL = 8007,
    CV_CAP_PROP_ANDROID_FOCUS_DISTANCE_FAR = 8008,
+    CV_CAP_PROP_ANDROID_EXPOSE_LOCK = 8009,
+    CV_CAP_PROP_ANDROID_WHITEBALANCE_LOCK = 8010,

    // Properties of cameras available through AVFOUNDATION interface
    CV_CAP_PROP_IOS_DEVICE_FOCUS = 9001,
@ -543,6 +547,7 @@ enum
 enum
 {
    CV_CAP_ANDROID_FOCUS_MODE_AUTO = 0,
+    CV_CAP_ANDROID_FOCUS_MODE_CONTINUOUS_PICTURE,
    CV_CAP_ANDROID_FOCUS_MODE_CONTINUOUS_VIDEO,
    CV_CAP_ANDROID_FOCUS_MODE_EDOF,
    CV_CAP_ANDROID_FOCUS_MODE_FIXED,
--- a/modules/highgui/src/cap_android.cpp
+++ b/modules/highgui/src/cap_android.cpp
@ -289,6 +289,10 @@ double CvCapture_Android::getProperty( int propIdx )
        return (double)m_activity->getProperty(ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_OPTIMAL);
    case CV_CAP_PROP_ANDROID_FOCUS_DISTANCE_FAR:
        return (double)m_activity->getProperty(ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_FAR);
+    case CV_CAP_PROP_ANDROID_EXPOSE_LOCK:
+        return (double)m_activity->getProperty(ANDROID_CAMERA_PROPERTY_EXPOSE_LOCK);
+    case CV_CAP_PROP_ANDROID_WHITEBALANCE_LOCK:
+        return (double)m_activity->getProperty(ANDROID_CAMERA_PROPERTY_WHITEBALANCE_LOCK);
    default:
        CV_Error( CV_StsOutOfRange, "Failed attempt to GET unsupported camera property." );
        break;
@ -327,14 +331,23 @@ bool CvCapture_Android::setProperty( int propIdx, double propValue )
        case CV_CAP_PROP_ANDROID_ANTIBANDING:
            m_activity->setProperty(ANDROID_CAMERA_PROPERTY_ANTIBANDING, propValue);
            break;
+        case CV_CAP_PROP_ANDROID_EXPOSE_LOCK:
+            m_activity->setProperty(ANDROID_CAMERA_PROPERTY_EXPOSE_LOCK, propValue);
+            break;
+        case CV_CAP_PROP_ANDROID_WHITEBALANCE_LOCK:
+            m_activity->setProperty(ANDROID_CAMERA_PROPERTY_WHITEBALANCE_LOCK, propValue);
+            break;
        default:
            CV_Error( CV_StsOutOfRange, "Failed attempt to SET unsupported camera property." );
            return false;
        }

-        if (propIdx != CV_CAP_PROP_AUTOGRAB) {// property for highgui class CvCapture_Android only
+        // Only changes in frame size require camera restart
+        if ((propIdx == CV_CAP_PROP_FRAME_WIDTH) || (propIdx == CV_CAP_PROP_FRAME_HEIGHT))
+        {   // property for highgui class CvCapture_Android only
            m_CameraParamsChanged = true;
        }
+
        res = true;
    }

--- a/modules/highgui/src/cap_avfoundation.mm
+++ b/modules/highgui/src/cap_avfoundation.mm
@ -1309,6 +1309,8 @@ bool CvVideoWriter_AVFoundation::writeFrame(const IplImage* iplimage) {
    }

    //cleanup
+    CFRelease(cfData);
+    CVPixelBufferRelease(pixelBuffer);
    CGImageRelease(cgImage);
    CGDataProviderRelease(provider);
    CGColorSpaceRelease(colorSpace);
--- a/modules/highgui/src/grfmt_jpeg.cpp
+++ b/modules/highgui/src/grfmt_jpeg.cpp
@ -598,6 +598,8 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
        cinfo.in_color_space = channels > 1 ? JCS_RGB : JCS_GRAYSCALE;

        int quality = 95;
+        int progressive = 0;
+        int optimize = 0;

        for( size_t i = 0; i < params.size(); i += 2 )
        {
@ -606,11 +608,25 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
                quality = params[i+1];
                quality = MIN(MAX(quality, 0), 100);
            }
+
+            if( params[i] == CV_IMWRITE_JPEG_PROGRESSIVE )
+            {
+                progressive = params[i+1];
+            }
+
+            if( params[i] == CV_IMWRITE_JPEG_OPTIMIZE )
+            {
+                optimize = params[i+1];
+            }
        }

        jpeg_set_defaults( &cinfo );
        jpeg_set_quality( &cinfo, quality,
                          TRUE /* limit to baseline-JPEG values */ );
+        if( progressive )
+            jpeg_simple_progression( &cinfo );
+        if( optimize )
+            cinfo.optimize_coding = TRUE;
        jpeg_start_compress( &cinfo, TRUE );

        if( channels > 1 )
--- a/modules/highgui/test/test_grfmt.cpp
+++ b/modules/highgui/test/test_grfmt.cpp
@ -386,6 +386,54 @@ TEST(Highgui_Jpeg, encode_empty)

    ASSERT_THROW(cv::imencode(".jpg", img, jpegImg), cv::Exception);
 }
+
+// Writes the same image twice - once with IMWRITE_JPEG_PROGRESSIVE enabled and once
+// with default parameters - and expects both files to decode to identical pixels.
+TEST(Highgui_Jpeg, encode_decode_progressive_jpeg)
+{
+    cvtest::TS& ts = *cvtest::TS::ptr();
+    string input = string(ts.get_data_path()) + "../cv/shared/lena.png";
+    cv::Mat img = cv::imread(input);
+    ASSERT_FALSE(img.empty());
+
+    // imwrite parameters are passed as a flat list of (id, value) pairs.
+    std::vector<int> params;
+    params.push_back(IMWRITE_JPEG_PROGRESSIVE);
+    params.push_back(1);
+
+    string output_progressive = cv::tempfile(".jpg");
+    EXPECT_NO_THROW(cv::imwrite(output_progressive, img, params));
+    cv::Mat img_jpg_progressive = cv::imread(output_progressive);
+
+    string output_normal = cv::tempfile(".jpg");
+    EXPECT_NO_THROW(cv::imwrite(output_normal, img));
+    cv::Mat img_jpg_normal = cv::imread(output_normal);
+
+    // A maximum absolute difference (NORM_INF) of zero means the decoded images match exactly.
+    EXPECT_EQ(0, cv::norm(img_jpg_progressive, img_jpg_normal, NORM_INF));
+
+    // Remove both temporary files; previously only the progressive one was
+    // deleted, leaving the baseline file behind after every run.
+    remove(output_progressive.c_str());
+    remove(output_normal.c_str());
+}
+
+// Writes the same image twice - once with IMWRITE_JPEG_OPTIMIZE enabled and once
+// with default parameters - and expects both files to decode to identical pixels.
+TEST(Highgui_Jpeg, encode_decode_optimize_jpeg)
+{
+    cvtest::TS& ts = *cvtest::TS::ptr();
+    string input = string(ts.get_data_path()) + "../cv/shared/lena.png";
+    cv::Mat img = cv::imread(input);
+    ASSERT_FALSE(img.empty());
+
+    // imwrite parameters are passed as a flat list of (id, value) pairs.
+    std::vector<int> params;
+    params.push_back(IMWRITE_JPEG_OPTIMIZE);
+    params.push_back(1);
+
+    string output_optimized = cv::tempfile(".jpg");
+    EXPECT_NO_THROW(cv::imwrite(output_optimized, img, params));
+    cv::Mat img_jpg_optimized = cv::imread(output_optimized);
+
+    string output_normal = cv::tempfile(".jpg");
+    EXPECT_NO_THROW(cv::imwrite(output_normal, img));
+    cv::Mat img_jpg_normal = cv::imread(output_normal);
+
+    // A maximum absolute difference (NORM_INF) of zero means the decoded images match exactly.
+    EXPECT_EQ(0, cv::norm(img_jpg_optimized, img_jpg_normal, NORM_INF));
+
+    // Remove both temporary files; previously only the optimized one was
+    // deleted, leaving the baseline file behind after every run.
+    remove(output_optimized.c_str());
+    remove(output_normal.c_str());
+}
 #endif


--- a/modules/imgproc/perf/opencl/perf_filters.cpp
+++ b/modules/imgproc/perf/opencl/perf_filters.cpp
@ -211,7 +211,7 @@ OCL_PERF_TEST_P(SobelFixture, Sobel,

    OCL_TEST_CYCLE() cv::Sobel(src, dst, -1, dx, dy);

-    SANITY_CHECK(dst);
+    SANITY_CHECK(dst, 1e-6);
 }

 ///////////// Scharr ////////////////////////
--- a/modules/imgproc/perf/opencl/perf_imgproc.cpp
+++ b/modules/imgproc/perf/opencl/perf_imgproc.cpp
@ -95,6 +95,34 @@ OCL_PERF_TEST_P(CalcHistFixture, CalcHist, OCL_TEST_SIZES)
    SANITY_CHECK(hist);
 }

+///////////// calcBackProject ////////////////////////
+
+typedef TestBaseWithParam<Size> CalcBackProjFixture;
+
+OCL_PERF_TEST_P(CalcBackProjFixture, CalcBackProj, OCL_TEST_SIZES) // Perf test: times cv::calcBackProject on a CV_8UC1 image, using a histogram computed from that same image beforehand.
+{
+    const Size srcSize = GetParam();
+
+    const std::vector<int> channels(1, 0); // one entry: channel index 0
+    std::vector<float> ranges(2);
+    std::vector<int> histSize(1, 256); // one entry: 256 bins
+    ranges[0] = 0;
+    ranges[1] = 256;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), hist(256, 1, CV_32FC1), dst(srcSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG).out(hist);
+
+    cv::calcHist(std::vector<UMat>(1, src), channels, noArray(), hist, histSize, ranges, false); // untimed setup: this call is outside OCL_TEST_CYCLE
+
+    declare.in(src, WARMUP_RNG).out(dst); // second declare: dst is the output of the measured call
+    OCL_TEST_CYCLE() cv::calcBackProject(std::vector<UMat>(1,src), channels, hist, dst, ranges, 1); // only this call is inside the measured cycle
+
+    SANITY_CHECK_NOTHING(); // dst is not validated by this test
+}
+
+
 /////////// CopyMakeBorder //////////////////////

 CV_ENUM(Border, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_WRAP, BORDER_REFLECT_101)
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@ -100,19 +100,29 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float
        low_thresh = std::min(32767.0f, low_thresh);
        high_thresh = std::min(32767.0f, high_thresh);

-        if (low_thresh > 0) low_thresh *= low_thresh;
-        if (high_thresh > 0) high_thresh *= high_thresh;
+        if (low_thresh > 0)
+            low_thresh *= low_thresh;
+        if (high_thresh > 0)
+            high_thresh *= high_thresh;
    }
    int low = cvFloor(low_thresh), high = cvFloor(high_thresh);
    Size esize(size.width + 2, size.height + 2);

    UMat mag;
-    size_t globalsize[2] = { size.width * cn, size.height }, localsize[2] = { 16, 16 };
+    size_t globalsize[2] = { size.width, size.height }, localsize[2] = { 16, 16 };

    if (aperture_size == 3 && !_src.isSubmatrix())
    {
        // Sobel calculation
-        ocl::Kernel calcSobelRowPassKernel("calcSobelRowPass", ocl::imgproc::canny_oclsrc);
+        char cvt[2][40];
+        ocl::Kernel calcSobelRowPassKernel("calcSobelRowPass", ocl::imgproc::canny_oclsrc,
+                                           format("-D OP_SOBEL -D cn=%d -D shortT=%s -D ucharT=%s"
+                                                  " -D convertToIntT=%s -D intT=%s -D convertToShortT=%s", cn,
+                                                  ocl::typeToStr(CV_16SC(cn)),
+                                                  ocl::typeToStr(CV_8UC(cn)),
+                                                  ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
+                                                  ocl::typeToStr(CV_32SC(cn)),
+                                                  ocl::convertTypeStr(CV_32S, CV_16S, cn, cvt[1])));
        if (calcSobelRowPassKernel.empty())
            return false;

@ -126,58 +136,62 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float

        // magnitude calculation
        ocl::Kernel magnitudeKernel("calcMagnitude_buf", ocl::imgproc::canny_oclsrc,
-                                    L2gradient ? " -D L2GRAD" : "");
+                                    format("-D cn=%d%s -D OP_MAG_BUF -D shortT=%s -D convertToIntT=%s -D intT=%s",
+                                           cn, L2gradient ? " -D L2GRAD" : "",
+                                           ocl::typeToStr(CV_16SC(cn)),
+                                           ocl::convertTypeStr(CV_16S, CV_32S, cn, cvt[0]),
+                                           ocl::typeToStr(CV_32SC(cn))));
        if (magnitudeKernel.empty())
            return false;

-        mag = UMat(esize, CV_32SC(cn), Scalar::all(0));
+        mag = UMat(esize, CV_32SC1, Scalar::all(0));
        dx.create(size, CV_16SC(cn));
        dy.create(size, CV_16SC(cn));

        magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dxBuf), ocl::KernelArg::ReadOnlyNoSize(dyBuf),
                             ocl::KernelArg::WriteOnlyNoSize(dx), ocl::KernelArg::WriteOnlyNoSize(dy),
-                             ocl::KernelArg::WriteOnlyNoSize(mag, cn), size.height, size.width);
+                             ocl::KernelArg::WriteOnlyNoSize(mag), size.height, size.width);

        if (!magnitudeKernel.run(2, globalsize, localsize, false))
            return false;
    }
    else
    {
-        dx.create(size, CV_16SC(cn));
-        dy.create(size, CV_16SC(cn));
-
-        Sobel(_src, dx, CV_16SC1, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
-        Sobel(_src, dy, CV_16SC1, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
+        Sobel(_src, dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
+        Sobel(_src, dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);

        // magnitude calculation
        ocl::Kernel magnitudeKernel("calcMagnitude", ocl::imgproc::canny_oclsrc,
-                                    L2gradient ? " -D L2GRAD" : "");
+                                    format("-D OP_MAG -D cn=%d%s -D intT=int -D shortT=short -D convertToIntT=convert_int_sat",
+                                           cn, L2gradient ? " -D L2GRAD" : ""));
        if (magnitudeKernel.empty())
            return false;

-        mag = UMat(esize, CV_32SC(cn), Scalar::all(0));
+        mag = UMat(esize, CV_32SC1, Scalar::all(0));
        magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
-                             ocl::KernelArg::WriteOnlyNoSize(mag, cn), size.height, size.width);
+                             ocl::KernelArg::WriteOnlyNoSize(mag), size.height, size.width);

        if (!magnitudeKernel.run(2, globalsize, NULL, false))
            return false;
    }

    // map calculation
-    ocl::Kernel calcMapKernel("calcMap", ocl::imgproc::canny_oclsrc);
+    ocl::Kernel calcMapKernel("calcMap", ocl::imgproc::canny_oclsrc,
+                              format("-D OP_MAP -D cn=%d", cn));
    if (calcMapKernel.empty())
        return false;

-    UMat map(esize, CV_32SC(cn));
+    UMat map(esize, CV_32SC1);
    calcMapKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
-                       ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::WriteOnlyNoSize(map, cn),
+                       ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::WriteOnlyNoSize(map),
                       size.height, size.width, low, high);

    if (!calcMapKernel.run(2, globalsize, localsize, false))
        return false;

    // local hysteresis thresholding
-    ocl::Kernel edgesHysteresisLocalKernel("edgesHysteresisLocal", ocl::imgproc::canny_oclsrc);
+    ocl::Kernel edgesHysteresisLocalKernel("edgesHysteresisLocal", ocl::imgproc::canny_oclsrc,
+                                           "-D OP_HYST_LOCAL");
    if (edgesHysteresisLocalKernel.empty())
        return false;

@ -193,7 +207,8 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float

    for ( ; ; )
    {
-        ocl::Kernel edgesHysteresisGlobalKernel("edgesHysteresisGlobal", ocl::imgproc::canny_oclsrc);
+        ocl::Kernel edgesHysteresisGlobalKernel("edgesHysteresisGlobal", ocl::imgproc::canny_oclsrc,
+                                                "-D OP_HYST_GLOBAL");
        if (edgesHysteresisGlobalKernel.empty())
            return false;

@ -221,14 +236,15 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float
    }

    // get edges
-    ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc);
+    ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc, "-D OP_EDGES");
    if (getEdgesKernel.empty())
        return false;

-    _dst.create(size, CV_8UC(cn));
+    _dst.create(size, CV_8UC1);
    UMat dst = _dst.getUMat();

    getEdgesKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::WriteOnly(dst));
+
    return getEdgesKernel.run(2, globalsize, NULL, false);
 }

@ -254,12 +270,12 @@ void cv::Canny( InputArray _src, OutputArray _dst,
    }

    if ((aperture_size & 1) == 0 || (aperture_size != -1 && (aperture_size < 3 || aperture_size > 7)))
-        CV_Error(CV_StsBadFlag, "");
+        CV_Error(CV_StsBadFlag, "Aperture size should be odd");

    if (low_thresh > high_thresh)
        std::swap(low_thresh, high_thresh);

-    CV_OCL_RUN(_dst.isUMat() && cn == 1,
+    CV_OCL_RUN(_dst.isUMat() && (cn == 1 || cn == 3),
               ocl_Canny(_src, _dst, (float)low_thresh, (float)high_thresh, aperture_size, L2gradient, cn, size))

    Mat src = _src.getMat(), dst = _dst.getMat();
--- a/modules/imgproc/src/deriv.cpp
+++ b/modules/imgproc/src/deriv.cpp
@ -11,6 +11,7 @@
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2014, Itseez, Inc, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -40,6 +41,8 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 static IppStatus sts = ippInit();
 #endif
@ -495,6 +498,58 @@ void cv::Scharr( InputArray _src, OutputArray _dst, int ddepth, int dx, int dy,
    sepFilter2D( _src, _dst, ddepth, kx, ky, Point(-1, -1), delta, borderType );
 }

+#ifdef HAVE_OPENCL
+
+namespace cv {
+
+static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
+                           const Mat & kd, const Mat & ks, double scale, double delta,
+                           int borderType, int depth, int ddepth)
+{ // OpenCL Laplacian path: runs two separable filters (kd/ks, then ks/kd) and hands both results to the "sumConvert" kernel together with scale and delta. Returns false if the kernel cannot be used, otherwise the result of the kernel run.
+    int iscale = cvRound(scale), idelta = cvRound(delta);
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
+            floatCoeff = std::fabs(delta - idelta) > DBL_EPSILON || std::fabs(scale - iscale) > DBL_EPSILON; // true when scale or delta is not an integer, so the rounded values cannot stand in for them
+    int cn = _src.channels(), wdepth = std::max(depth, floatCoeff ? CV_32F : CV_32S), kercn = 1; // kernel working depth is at least CV_32S, and at least CV_32F when the coefficients are fractional
+
+    if (!doubleSupport && wdepth == CV_64F) // a CV_64F working depth needs double support on the device
+        return false;
+
+    char cvt[2][40]; // scratch buffers for the two convertTypeStr() results below
+    ocl::Kernel k("sumConvert", ocl::imgproc::laplacian5_oclsrc,
+                  format("-D srcT=%s -D WT=%s -D dstT=%s -D coeffT=%s -D wdepth=%d "
+                         "-D convertToWT=%s -D convertToDT=%s%s",
+                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
+                         ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
+                         ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
+                         ocl::typeToStr(wdepth), wdepth,
+                         ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
+                         ocl::convertTypeStr(wdepth, ddepth, kercn, cvt[1]),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty()) // kernel could not be created
+        return false;
+
+    UMat d2x, d2y;
+    sepFilter2D(_src, d2x, depth, kd, ks, Point(-1, -1), 0, borderType); // first pass: kd then ks, output depth 'depth'
+    sepFilter2D(_src, d2y, depth, ks, kd, Point(-1, -1), 0, borderType); // second pass: same kernels in swapped order
+
+    UMat dst = _dst.getUMat(); // NOTE(review): _dst is not created here - assumes the caller has already allocated it; confirm
+
+    ocl::KernelArg d2xarg = ocl::KernelArg::ReadOnlyNoSize(d2x),
+            d2yarg = ocl::KernelArg::ReadOnlyNoSize(d2y),
+            dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
+
+    if (wdepth >= CV_32F) // floating-point working depth: pass scale/delta as float
+        k.args(d2xarg, d2yarg, dstarg, (float)scale, (float)delta);
+    else // integer working depth: pass the rounded integer scale/delta
+        k.args(d2xarg, d2yarg, dstarg, iscale, idelta);
+
+    size_t globalsize[] = { dst.cols * cn / kercn, dst.rows }; // global work size: dst.cols * cn / kercn by dst.rows
+    return k.run(2, globalsize, NULL, false);
+}
+
+}
+
+#endif

 void cv::Laplacian( InputArray _src, OutputArray _dst, int ddepth, int ksize,
                    double scale, double delta, int borderType )
@ -531,27 +586,28 @@ void cv::Laplacian( InputArray _src, OutputArray _dst, int ddepth, int ksize,
    }
    else
    {
-        Mat src = _src.getMat(), dst = _dst.getMat();
-        const size_t STRIPE_SIZE = 1 << 14;
-
-        int depth = src.depth();
-        int ktype = std::max(CV_32F, std::max(ddepth, depth));
-        int wdepth = depth == CV_8U && ksize <= 5 ? CV_16S : depth <= CV_32F ? CV_32F : CV_64F;
-        int wtype = CV_MAKETYPE(wdepth, src.channels());
+        int ktype = std::max(CV_32F, std::max(ddepth, sdepth));
+        int wdepth = sdepth == CV_8U && ksize <= 5 ? CV_16S : sdepth <= CV_32F ? CV_32F : CV_64F;
+        int wtype = CV_MAKETYPE(wdepth, cn);
        Mat kd, ks;
        getSobelKernels( kd, ks, 2, 0, ksize, false, ktype );
-        int dtype = CV_MAKETYPE(ddepth, src.channels());

-        int dy0 = std::min(std::max((int)(STRIPE_SIZE/(getElemSize(src.type())*src.cols)), 1), src.rows);
-        Ptr<FilterEngine> fx = createSeparableLinearFilter(src.type(),
+        CV_OCL_RUN(_dst.isUMat(),
+                   ocl_Laplacian5(_src, _dst, kd, ks, scale,
+                                  delta, borderType, wdepth, ddepth))
+
+        const size_t STRIPE_SIZE = 1 << 14;
+        Ptr<FilterEngine> fx = createSeparableLinearFilter(stype,
            wtype, kd, ks, Point(-1,-1), 0, borderType, borderType, Scalar() );
-        Ptr<FilterEngine> fy = createSeparableLinearFilter(src.type(),
+        Ptr<FilterEngine> fy = createSeparableLinearFilter(stype,
            wtype, ks, kd, Point(-1,-1), 0, borderType, borderType, Scalar() );

+        Mat src = _src.getMat(), dst = _dst.getMat();
        int y = fx->start(src), dsty = 0, dy = 0;
        fy->start(src);
        const uchar* sptr = src.data + y*src.step;

+        int dy0 = std::min(std::max((int)(STRIPE_SIZE/(CV_ELEM_SIZE(stype)*src.cols)), 1), src.rows);
        Mat d2x( dy0 + kd.rows - 1, src.cols, wtype );
        Mat d2y( dy0 + kd.rows - 1, src.cols, wtype );

@ -564,7 +620,7 @@ void cv::Laplacian( InputArray _src, OutputArray _dst, int ddepth, int ksize,
                Mat dstripe = dst.rowRange(dsty, dsty + dy);
                d2x.rows = d2y.rows = dy; // modify the headers, which should work
                d2x += d2y;
-                d2x.convertTo( dstripe, dtype, scale, delta );
+                d2x.convertTo( dstripe, ddepth, scale, delta );
            }
        }
    }
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@ -42,7 +42,6 @@

 #include "precomp.hpp"
 #include "opencl_kernels.hpp"
-#include <sstream>

 /****************************************************************************************\
                                    Base Image Filter
@ -3134,7 +3133,7 @@ template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFi
 // b e h b e h 0 0
 // c f i c f i 0 0
 template <typename T>
-static int _prepareKernelFilter2D(std::vector<T>& data, const Mat &kernel)
+static int _prepareKernelFilter2D(std::vector<T> & data, const Mat & kernel)
 {
    Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value);
    int size_y_aligned = ROUNDUP(kernel.rows * 2, 4);
@ -3154,75 +3153,52 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
                   InputArray _kernel, Point anchor,
                   double delta, int borderType )
 {
-    if (abs(delta) > FLT_MIN)
+    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    ddepth = ddepth < 0 ? sdepth : ddepth;
+    int dtype = CV_MAKE_TYPE(ddepth, cn), wdepth = std::max(std::max(sdepth, ddepth), CV_32F),
+            wtype = CV_MAKE_TYPE(wdepth, cn);
+    if (cn > 4)
        return false;

-    int type = _src.type();
-    int cn = CV_MAT_CN(type);
-    if ((1 != cn) && (2 != cn) && (4 != cn))
-        return false;//TODO
-
-    int sdepth = CV_MAT_DEPTH(type);
    Size ksize = _kernel.size();
-    if( anchor.x < 0 )
+    if (anchor.x < 0)
        anchor.x = ksize.width / 2;
-    if( anchor.y < 0 )
+    if (anchor.y < 0)
        anchor.y = ksize.height / 2;
-    if( ddepth < 0 )
-        ddepth = sdepth;
-    else if (ddepth != sdepth)
-        return false;

-    bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
-    bool useDouble = (CV_64F == sdepth);
+    bool isolated = (borderType & BORDER_ISOLATED) != 0;
+    borderType &= ~BORDER_ISOLATED;
    const cv::ocl::Device &device = cv::ocl::Device::getDefault();
-    int doubleFPConfig = device.doubleFPConfig();
-    if (useDouble && (0 == doubleFPConfig))
+    bool doubleSupport = device.doubleFPConfig() > 0;
+    if (wdepth == CV_64F && !doubleSupport)
        return false;

-    const char* btype = NULL;
-    switch (borderType & ~BORDER_ISOLATED)
-    {
-    case BORDER_CONSTANT:
-        btype = "BORDER_CONSTANT";
-        break;
-    case BORDER_REPLICATE:
-        btype = "BORDER_REPLICATE";
-        break;
-    case BORDER_REFLECT:
-        btype = "BORDER_REFLECT";
-        break;
-    case BORDER_WRAP:
-        return false;
-    case BORDER_REFLECT101:
-        btype = "BORDER_REFLECT_101";
-        break;
-    }
+    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT",
+                                       "BORDER_WRAP", "BORDER_REFLECT_101" };

    cv::Mat kernelMat = _kernel.getMat();
    std::vector<float> kernelMatDataFloat;
-    std::vector<double> kernelMatDataDouble;
-    int kernel_size_y2_aligned = useDouble ?
-            _prepareKernelFilter2D<double>(kernelMatDataDouble, kernelMat)
-            : _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
+    int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);

+    cv::Size sz = _src.size(), wholeSize;
+    size_t globalsize[2] = { sz.width, sz.height }, localsize[2] = { 0, 1 };

-    cv::Size sz = _src.size();
-    size_t globalsize[2] = {sz.width, sz.height};
-    size_t localsize[2] = {0, 1};
-
-    ocl::Kernel kernel;
-    UMat src; Size wholeSize;
-    if (!isIsolatedBorder)
+    ocl::Kernel k;
+    UMat src = _src.getUMat();
+    if (!isolated)
    {
-        src = _src.getUMat();
        Point ofs;
        src.locateROI(wholeSize, ofs);
    }

-    size_t maxWorkItemSizes[32]; device.maxWorkItemSizes(maxWorkItemSizes);
+    size_t maxWorkItemSizes[32];
+    device.maxWorkItemSizes(maxWorkItemSizes);
    size_t tryWorkItems = maxWorkItemSizes[0];
-    for (;;)
+    char cvt[2][40];
+
+    String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F);
+
+    for ( ; ; )
    {
        size_t BLOCK_SIZE = tryWorkItems;
        while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2)
@ -3242,32 +3218,36 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
        int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x;
        int requiredBottom = ksize.height - 1 - anchor.y;
        int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
-        int h = isIsolatedBorder ? sz.height : wholeSize.height;
-        int w = isIsolatedBorder ? sz.width : wholeSize.width;
+        int h = isolated ? sz.height : wholeSize.height;
+        int w = isolated ? sz.width : wholeSize.width;
        bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;

        if ((w < ksize.width) || (h < ksize.height))
            return false;

-        char build_options[1024];
-        sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
-                "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
-                "-D %s -D %s -D %s",
-                (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
-                sdepth, cn, useDouble ? 1 : 0,
-                anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned,
-                btype,
-                extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
-                isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
+        String opts = format("-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D cn=%d "
+                             "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
+                             "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s "
+                             "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
+                             "-D convertToWT=%s -D convertToDstT=%s",
+                             (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, cn, anchor.x, anchor.y,
+                             ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType],
+                             extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
+                             isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
+                             doubleSupport ? " -D DOUBLE_SUPPORT" : "", kerStr.c_str(),
+                             ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
+                             ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
+                             ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
+                             ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]));

        localsize[0] = BLOCK_SIZE;
        globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE;
        globalsize[1] = DIVUP(sz.height, BLOCK_SIZE_Y);

-        cv::String errmsg;
-        if (!kernel.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, build_options))
+        if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts))
            return false;
-        size_t kernelWorkGroupSize = kernel.workGroupSize();
+
+        size_t kernelWorkGroupSize = k.workGroupSize();
        if (localsize[0] <= kernelWorkGroupSize)
            break;
        if (BLOCK_SIZE < kernelWorkGroupSize)
@ -3275,242 +3255,238 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
        tryWorkItems = kernelWorkGroupSize;
    }

-    _dst.create(sz, CV_MAKETYPE(ddepth, cn));
+    _dst.create(sz, dtype);
    UMat dst = _dst.getUMat();
-    if (src.empty())
-        src = _src.getUMat();
-
-    int idxArg = 0;
-    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
-    idxArg = kernel.set(idxArg, (int)src.step);

    int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
    int srcOffsetY = (int)(src.offset / src.step);
-    int srcEndX = (isIsolatedBorder ? (srcOffsetX + sz.width) : wholeSize.width);
-    int srcEndY = (isIsolatedBorder ? (srcOffsetY + sz.height) : wholeSize.height);
-    idxArg = kernel.set(idxArg, srcOffsetX);
-    idxArg = kernel.set(idxArg, srcOffsetY);
-    idxArg = kernel.set(idxArg, srcEndX);
-    idxArg = kernel.set(idxArg, srcEndY);
-
-    idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
-    float borderValue[4] = {0, 0, 0, 0};
-    double borderValueDouble[4] = {0, 0, 0, 0};
-    if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
-    {
-        int cnocl = (3 == cn) ? 4 : cn;
-        if (useDouble)
-            idxArg = kernel.set(idxArg, (void *)&borderValueDouble[0], sizeof(double) * cnocl);
-        else
-            idxArg = kernel.set(idxArg, (void *)&borderValue[0], sizeof(float) * cnocl);
-    }
-    if (useDouble)
-    {
-        UMat kernalDataUMat(kernelMatDataDouble, true);
-        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(kernalDataUMat));
-    }
-    else
-    {
-        UMat kernalDataUMat(kernelMatDataFloat, true);
-        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(kernalDataUMat));
-    }
-    return kernel.run(2, globalsize, localsize, true);
+    int srcEndX = (isolated ? (srcOffsetX + sz.width) : wholeSize.width);
+    int srcEndY = (isolated ? (srcOffsetY + sz.height) : wholeSize.height);
+
+    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffsetX, srcOffsetY,
+           srcEndX, srcEndY, ocl::KernelArg::WriteOnly(dst), (float)delta);
+
+    return k.run(2, globalsize, localsize, false);
 }

-static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType, bool sync)
+// Row pass of the separable OpenCL filter: builds the "row_filter" kernel
+// ("row_filter_C1_D0" when fast8uc1 is set) from filterSepRow_oclsrc and
+// enqueues it with src as read-only input and buf as write-only output
+// (the kernel is compiled with a CV_32F destination type).
+// anchor is passed to the kernel as RADIUSX; borderType may carry BORDER_ISOLATED.
+// Returns false when doubles are needed but unsupported by the device, when the
+// border mode is not one of the five entries of borderMap, or when the kernel
+// cannot be created; otherwise returns the result of the non-blocking run.
+static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
+                               int borderType, int ddepth, bool fast8uc1)
 {
-    int type = src.type();
-    int cn = CV_MAT_CN(type);
-    int sdepth = CV_MAT_DEPTH(type);
+    int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    Size bufSize = buf.size();

+    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
+        return false;
+
 #ifdef ANDROID
    size_t localsize[2] = {16, 10};
 #else
    size_t localsize[2] = {16, 16};
 #endif
+
    size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]};
-    if (CV_8U == sdepth)
-    {
-        switch (cn)
-        {
-        case 1:
-            globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];
-            break;
-        case 2:
-            globalsize[0] = DIVUP((bufSize.width + 1) >> 1, localsize[0]) * localsize[0];
-            break;
-        case 4:
-            globalsize[0] = DIVUP(bufSize.width, localsize[0]) * localsize[0];
-            break;
-        }
-    }
+    if (fast8uc1) // fast 8UC1 kernel: x range is sized from (width + 3) / 4
+        globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];

-    int radiusX = anchor;
-    int radiusY = (int)((buf.rows - src.rows) >> 1);
+    int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1;

-    bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
-    const char* btype = NULL;
-    switch (borderType & ~BORDER_ISOLATED)
-    {
-    case BORDER_CONSTANT:
-        btype = "BORDER_CONSTANT";
-        break;
-    case BORDER_REPLICATE:
-        btype = "BORDER_REPLICATE";
-        break;
-    case BORDER_REFLECT:
-        btype = "BORDER_REFLECT";
-        break;
-    case BORDER_WRAP:
-        btype = "BORDER_WRAP";
-        break;
-    case BORDER_REFLECT101:
-        btype = "BORDER_REFLECT_101";
-        break;
-    default:
-        return false;
-    }
+    bool isolated = (borderType & BORDER_ISOLATED) != 0;
+    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" };
+    // Reject border modes that have no entry in borderMap (the replaced switch
+    // did this through its default branch); indexing unchecked would read past the table.
+    const int borderIdx = borderType & ~BORDER_ISOLATED;
+    if (borderIdx < 0 || borderIdx >= (int)(sizeof(borderMap) / sizeof(borderMap[0])))
+        return false;
+    const char * const btype = borderMap[borderIdx];

    bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1;
    extra_extrapolation |= src.rows < radiusY;
    extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1;
    extra_extrapolation |= src.cols < radiusX;

-    cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s",
-        radiusX, (int)localsize[0], (int)localsize[1], cn,
-        btype,
-        extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
-        isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
+    char cvt[40];
+    cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
+                                          " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s",
+                                          radiusX, (int)localsize[0], (int)localsize[1], cn, btype,
+                                          extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
+                                          isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
+                                          ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
+                                          ocl::convertTypeStr(sdepth, CV_32F, cn, cvt),
+                                          ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F),
+                                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
    build_options += ocl::kernelToStr(kernelX, CV_32F);

    Size srcWholeSize; Point srcOffset;
    src.locateROI(srcWholeSize, srcOffset);

-    std::stringstream strKernel;
-    strKernel << "row_filter";
-    if (-1 != cn)
-        strKernel << "_C" << cn;
-    if (-1 != sdepth)
-        strKernel << "_D" << sdepth;
+    String kernelName("row_filter");
+    if (fast8uc1)
+        kernelName += "_C1_D0";

-    ocl::Kernel kernelRow;
-    if (!kernelRow.create(strKernel.str().c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
-                          build_options))
+    ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
+                  build_options);
+    if (k.empty())
        return false;

-    int idxArg = 0;
-    idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
-    idxArg = kernelRow.set(idxArg, (int)(src.step / src.elemSize()));
-
-    idxArg = kernelRow.set(idxArg, srcOffset.x);
-    idxArg = kernelRow.set(idxArg, srcOffset.y);
-    idxArg = kernelRow.set(idxArg, src.cols);
-    idxArg = kernelRow.set(idxArg, src.rows);
-    idxArg = kernelRow.set(idxArg, srcWholeSize.width);
-    idxArg = kernelRow.set(idxArg, srcWholeSize.height);
-
-    idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrWriteOnly(buf));
-    idxArg = kernelRow.set(idxArg, (int)(buf.step / buf.elemSize()));
-    idxArg = kernelRow.set(idxArg, buf.cols);
-    idxArg = kernelRow.set(idxArg, buf.rows);
-    idxArg = kernelRow.set(idxArg, radiusY);
+    if (fast8uc1) // fast kernel receives the steps in elements
+        k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x,
+               srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
+               ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()),
+               buf.cols, buf.rows, radiusY);
+    else // generic kernel receives the steps in bytes
+        k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x,
+               srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
+               ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY);

-    return kernelRow.run(2, globalsize, localsize, sync);
+    return k.run(2, globalsize, localsize, false);
 }

-static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor, bool sync)
+static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor) // column pass: runs "col_filter" on buf, writes dst, passes delta to the kernel as float
 {
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    if (dst.depth() == CV_64F && !doubleSupport) // CV_64F output requires device double support
+        return false;
+
 #ifdef ANDROID
-    size_t localsize[2] = {16, 10};
+    size_t localsize[2] = { 16, 10 };
 #else
-    size_t localsize[2] = {16, 16};
+    size_t localsize[2] = { 16, 16 };
 #endif
-    size_t globalsize[2] = {0, 0};
+    size_t globalsize[2] = { 0, 0 };

    int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype);
    Size sz = dst.size();

    globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
-
-    if (dtype == CV_8UC2)
-        globalsize[0] = DIVUP((sz.width + 1) / 2, localsize[0]) * localsize[0];
-    else
-        globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
+    globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; // dst width rounded up to a multiple of the local size

    char cvt[40];
-    cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
-                    anchor, (int)localsize[0], (int)localsize[1], cn, ocl::typeToStr(buf.type()),
-                                          ocl::typeToStr(dtype), ocl::convertTypeStr(CV_32F, ddepth, cn, cvt));
+    cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
+                                          " -D srcT=%s -D dstT=%s -D convertToDstT=%s"
+                                          " -D srcT1=%s -D dstT1=%s%s",
+                                          anchor, (int)localsize[0], (int)localsize[1], cn,
+                                          ocl::typeToStr(buf.type()), ocl::typeToStr(dtype),
+                                          ocl::convertTypeStr(CV_32F, ddepth, cn, cvt),
+                                          ocl::typeToStr(CV_32F), ocl::typeToStr(ddepth),
+                                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
    build_options += ocl::kernelToStr(kernelY, CV_32F);

-    ocl::Kernel kernelCol;
-    if (!kernelCol.create("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options))
+    ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc,
+                  build_options);
+    if (k.empty()) // kernel could not be created
        return false;

-    int idxArg = 0;
-    idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrReadOnly(buf));
-    idxArg = kernelCol.set(idxArg, (int)(buf.step / buf.elemSize()));
-    idxArg = kernelCol.set(idxArg, buf.cols);
-    idxArg = kernelCol.set(idxArg, buf.rows);
+    k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst),
+           static_cast<float>(delta));
+
+    return k.run(2, globalsize, localsize, false); // last argument is the sync flag: do not block
+}
+
+const int optimizedSepFilterLocalSize = 16; // local work size (both dimensions) of the single-pass kernel; also used in its size precondition in ocl_sepFilter2D

-    idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
-    idxArg = kernelCol.set(idxArg, (int)(dst.offset / dst.elemSize()));
-    idxArg = kernelCol.set(idxArg, (int)(dst.step / dst.elemSize()));
-    idxArg = kernelCol.set(idxArg, dst.cols);
-    idxArg = kernelCol.set(idxArg, dst.rows);
+static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, // separable filter done by one "sep_filter" kernel instead of row + column passes
+                                       Mat row_kernel, Mat col_kernel,
+                                       double delta, int borderType, int ddepth)
+{
+    Size size = _src.size(), wholeSize;
+    Point origin;
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
+            esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), CV_32F), // work depth is at least CV_32F
+            dtype = CV_MAKE_TYPE(ddepth, cn);
+    size_t src_step = _src.step(), src_offset = _src.offset();
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if ((src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
+            !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE ||
+              borderType == BORDER_REFLECT || borderType == BORDER_WRAP ||
+              borderType == BORDER_REFLECT_101))
+        return false; // x offset not element-aligned, doubles unsupported, or border mode not in the list above
+
+    size_t lt2[2] = { optimizedSepFilterLocalSize, optimizedSepFilterLocalSize };
+    size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] * (1 + (size.height - 1) / lt2[1]) }; // image size rounded up to a multiple of the local size
+
+    char cvt[2][40];
+    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
+                                       "BORDER_REFLECT_101" };
+
+    String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
+                             " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
+                             " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d", (int)lt2[0], (int)lt2[1],
+                             row_kernel.cols / 2, col_kernel.cols / 2, // RADIUSX, RADIUSY
+                             ocl::kernelToStr(row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(),
+                             ocl::kernelToStr(col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(),
+                             ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
+                             ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
+                             ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], // borderType was restricted above; assumes those enum values index borderMap in this order
+                             ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
+
+    ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts);
+    if (k.empty())
+        return false;

-    return kernelCol.run(2, globalsize, localsize, sync);
+    UMat src = _src.getUMat();
+    _dst.create(size, dtype);
+    UMat dst = _dst.getUMat();
+
+    int src_offset_x = static_cast<int>((src_offset % src_step) / esz); // ROI x offset in elements
+    int src_offset_y = static_cast<int>(src_offset / src_step); // ROI y offset in rows
+
+    src.locateROI(wholeSize, origin);
+
+    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y,
+           wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst), // note: height is passed before width
+           static_cast<float>(delta));
+
+    return k.run(2, gt2, lt2, false);
 }

 static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                      InputArray _kernelX, InputArray _kernelY, Point anchor,
                      double delta, int borderType )
 {
-    if (abs(delta)> FLT_MIN)
-        return false;
+    const ocl::Device & d = ocl::Device::getDefault(); // OpenCL separable filter: single-pass kernel when its preconditions hold, otherwise row pass into a CV_32F buffer followed by a column pass
+    Size imgSize = _src.size();

-    int type = _src.type();
-    if ( !( (type == CV_8UC1 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC4) &&
-            (ddepth == CV_32F || ddepth == CV_16S || ddepth == CV_8U || ddepth < 0) ) )
+    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    if (cn > 4) // more than 4 channels is rejected by this path
        return false;

-    int cn = CV_MAT_CN(type);
-
    Mat kernelX = _kernelX.getMat().reshape(1, 1);
-    if (1 != (kernelX.cols % 2))
+    if (kernelX.cols % 2 != 1) // only odd-length kernels are accepted
        return false;
    Mat kernelY = _kernelY.getMat().reshape(1, 1);
-    if (1 != (kernelY.cols % 2))
+    if (kernelY.cols % 2 != 1)
        return false;

-    int sdepth = CV_MAT_DEPTH(type);
-    if( anchor.x < 0 )
+    if (ddepth < 0) // negative ddepth selects the source depth
+        ddepth = sdepth;
+
+    CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
+                imgSize.width > optimizedSepFilterLocalSize + (kernelX.cols >> 1) &&
+                imgSize.height > optimizedSepFilterLocalSize + (kernelY.cols >> 1) &&
+                (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(-1, -1) &&
+                (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
+                ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
+                                           borderType & ~BORDER_ISOLATED, ddepth), true) // NOTE(review): offset() == 0 also holds for a ROI anchored at the parent's origin, while the single-pass kernel is given wholeSize - confirm BORDER_ISOLATED is honoured in that case
+
+    if (anchor.x < 0) // negative anchor component selects the kernel centre
        anchor.x = kernelX.cols >> 1;
-    if( anchor.y < 0 )
+    if (anchor.y < 0)
        anchor.y = kernelY.cols >> 1;

-    if( ddepth < 0 )
-        ddepth = sdepth;
-
    UMat src = _src.getUMat();
    Size srcWholeSize; Point srcOffset;
    src.locateROI(srcWholeSize, srcOffset);
-    if ( (0 != (srcOffset.x % 4))   ||
-         (0 != (src.cols % 4))      ||
-         (0 != ((src.step / src.elemSize()) % 4))
-       )
-        return false;
+
+    bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && // fast row kernel: 8UC1 with x offset, width and step all multiples of 4
+            src.cols % 4 == 0 && src.step % 4 == 0;

    Size srcSize = src.size();
    Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
-    UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn));
-    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, false))
+    UMat buf(bufSize, CV_32FC(cn)); // intermediate buffer: kernelY.cols - 1 rows taller than the source
+    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1))
        return false;

    _dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
    UMat dst = _dst.getUMat();
-    return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y, false);
+
+    return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y); // delta is forwarded to the column pass only
 }

 #endif
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -1917,71 +1917,73 @@ class IPPresizeInvoker :
    public ParallelLoopBody
 {
 public:
-    IPPresizeInvoker(Mat &_src, Mat &_dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
-      ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x), inv_scale_y(_inv_scale_y), mode(_mode), ok(_ok)
-      {
-          *ok = true;
-          IppiSize srcSize, dstSize;
-          int type = src.type();
-          int specSize = 0, initSize = 0;
-          srcSize.width  = src.cols;
-          srcSize.height = src.rows;
-          dstSize.width  = dst.cols;
-          dstSize.height = dst.rows;
-
-          switch (type)
-          {
-          case CV_8UC1:  SET_IPP_RESIZE_PTR(8u,C1);  break;
-          case CV_8UC3:  SET_IPP_RESIZE_PTR(8u,C3);  break;
-          case CV_8UC4:  SET_IPP_RESIZE_PTR(8u,C4);  break;
-          case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
-          case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
-          case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
-          case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
-          case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
-          case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
-          case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
-          case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
-          case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
-          case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
-          case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
-          case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
-          default: { *ok = false; return;} break;
-          }
-      }
+    IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
+        ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x), inv_scale_y(_inv_scale_y), mode(_mode), ok(_ok)
+    {
+        *ok = true;
+        IppiSize srcSize, dstSize;
+        int type = src.type();
+        int specSize = 0, initSize = 0;
+        srcSize.width  = src.cols;
+        srcSize.height = src.rows;
+        dstSize.width  = dst.cols;
+        dstSize.height = dst.rows;

-      ~IPPresizeInvoker()
-      {
-      }
+        switch (type)
+        {
+            case CV_8UC1:  SET_IPP_RESIZE_PTR(8u,C1);  break;
+            case CV_8UC3:  SET_IPP_RESIZE_PTR(8u,C3);  break;
+            case CV_8UC4:  SET_IPP_RESIZE_PTR(8u,C4);  break;
+            case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
+            case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
+            case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
+            case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
+            case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
+            case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
+            case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
+            case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
+            case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
+            case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
+            case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
+            case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
+            default: { *ok = false; return; } break;
+        }
+    }

-      virtual void operator() (const Range& range) const
-      {
-          if (*ok == false) return;
+    ~IPPresizeInvoker()
+    {
+    }

-          int cn = src.channels();
-          int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
-          int dstwidth  = min(cvRound(src.cols * inv_scale_x), dst.cols);
-          int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
+    virtual void operator() (const Range& range) const
+    {
+        if (*ok == false)
+          return;

-          IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
-          IppiSize  dstSize   = { dstwidth, dstheight - dsty };
-          int bufsize = 0, itemSize = (int)src.elemSize1();
+        int cn = src.channels();
+        int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
+        int dstwidth  = min(cvRound(src.cols * inv_scale_x), dst.cols);
+        int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);

-          CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
-          CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
+        IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
+        IppiSize  dstSize   = { dstwidth, dstheight - dsty };
+        int bufsize = 0, itemSize = (int)src.elemSize1();

-          Ipp8u* pSrc = (Ipp8u*)src.data + (int)src.step[0] * srcOffset.y + srcOffset.x * cn * itemSize;
-          Ipp8u* pDst = (Ipp8u*)dst.data + (int)dst.step[0] * dstOffset.y + dstOffset.x * cn * itemSize;
+        CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
+        CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));

-          AutoBuffer<uchar> buf(bufsize + 64);
-          uchar* bufptr = alignPtr((uchar*)buf, 32);
+        Ipp8u* pSrc = (Ipp8u*)src.data + (int)src.step[0] * srcOffset.y + srcOffset.x * cn * itemSize;
+        Ipp8u* pDst = (Ipp8u*)dst.data + (int)dst.step[0] * dstOffset.y + dstOffset.x * cn * itemSize;
+
+        AutoBuffer<uchar> buf(bufsize + 64);
+        uchar* bufptr = alignPtr((uchar*)buf, 32);
+
+        if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
+            *ok = false;
+    }

-          if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
-              *ok = false;
-      }
 private:
-    Mat &src;
-    Mat &dst;
+    const Mat & src; // read-only source; const so that it can bind the constructor's const Mat & _src
+    Mat & dst;
    double inv_scale_x;
    double inv_scale_y;
    void *pSpec;
@ -1993,12 +1995,13 @@ private:
    bool *ok;
    const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
 };
+
 #endif

 #ifdef HAVE_OPENCL

 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
-                                          float * const alpha_tab, int * const ofs_tab)
+                                      float * const alpha_tab, int * const ofs_tab)
 {
    int k = 0, dx = 0;
    for ( ; dx < dsize; dx++)
@ -2049,8 +2052,16 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
 {
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

-    double inv_fx = 1. / fx, inv_fy = 1. / fy;
+    double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
    float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
+    int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy); // per-axis integer scale: y is derived from inv_fy
+    bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON && // true only when both inverse scales are whole numbers
+        std::abs(inv_fy - iscale_y) < DBL_EPSILON;
+
+    // in case of scale_x && scale_y is equal to 2
+    // INTER_AREA (fast) also is equal to INTER_LINEAR
+    if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
+        /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower

    if( !(cn <= 4 &&
           (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
@ -2061,39 +2072,105 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
    _dst.create(dsize, type);
    UMat dst = _dst.getUMat();

+    Size ssize = src.size();
    ocl::Kernel k;
    size_t globalsize[] = { dst.cols, dst.rows };

    if (interpolation == INTER_LINEAR)
    {
-        int wdepth = std::max(depth, CV_32S);
-        int wtype = CV_MAKETYPE(wdepth, cn);
        char buf[2][32];
-        k.create("resizeLN", ocl::imgproc::resize_oclsrc,
-                 format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D PIXTYPE1=%s "
-                        "-D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d",
-                        depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
-                        ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
-                        ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
-                        cn));
+
+        // integer path is slower because of CPU part, so it's disabled
+        if (depth == CV_8U && ((void)0, 0))
+        {
+            AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
+            int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
+            short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
+            float fxx, fyy;
+            int sx, sy;
+
+            for (int dx = 0; dx < dsize.width; dx++)
+            {
+                fxx = (float)((dx+0.5)*inv_fx - 0.5);
+                sx = cvFloor(fxx);
+                fxx -= sx;
+
+                if (sx < 0)
+                    fxx = 0, sx = 0;
+
+                if (sx >= ssize.width-1)
+                    fxx = 0, sx = ssize.width-1;
+
+                xofs[dx] = sx;
+                ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
+                ialpha[dx*2 + 1] = saturate_cast<short>(fxx         * INTER_RESIZE_COEF_SCALE);
+            }
+
+            for (int dy = 0; dy < dsize.height; dy++)
+            {
+                fyy = (float)((dy+0.5)*inv_fy - 0.5);
+                sy = cvFloor(fyy);
+                fyy -= sy;
+
+                yofs[dy] = sy;
+                ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
+                ibeta[dy*2 + 1] = saturate_cast<short>(fyy         * INTER_RESIZE_COEF_SCALE);
+            }
+
+            int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
+            UMat coeffs;
+            Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
+
+            k.create("resizeLN", ocl::imgproc::resize_oclsrc,
+                     format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
+                            "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
+                            "-D INTER_RESIZE_COEF_BITS=%d",
+                            depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
+                            ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
+                            ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
+                            cn, INTER_RESIZE_COEF_BITS));
+            if (k.empty())
+                return false;
+
+            k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+                   ocl::KernelArg::PtrReadOnly(coeffs));
+        }
+        else
+        {
+            int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
+            k.create("resizeLN", ocl::imgproc::resize_oclsrc,
+                     format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
+                            "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
+                            "-D INTER_RESIZE_COEF_BITS=%d",
+                            depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
+                            ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
+                            ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
+                            cn, INTER_RESIZE_COEF_BITS));
+            if (k.empty())
+                return false;
+
+            k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+                   (float)inv_fx, (float)inv_fy);
+        }
    }
    else if (interpolation == INTER_NEAREST)
    {
        k.create("resizeNN", ocl::imgproc::resize_oclsrc,
-                 format("-D INTER_NEAREST -D PIXTYPE=%s -D PIXTYPE1=%s -D cn=%d",
+                 format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
                        ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), cn));
+        if (k.empty())
+            return false;
+
+        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+               (float)inv_fx, (float)inv_fy);
    }
    else if (interpolation == INTER_AREA)
    {
-        int iscale_x = saturate_cast<int>(inv_fx);
-        int iscale_y = saturate_cast<int>(inv_fy);
-        bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
-                        std::abs(inv_fy - iscale_y) < DBL_EPSILON;
        int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
        int wtype = CV_MAKE_TYPE(wdepth, cn);

        char cvt[2][40];
-        String buildOption = format("-D INTER_AREA -D PIXTYPE=%s -D PIXTYPE1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
+        String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
                                    ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                                    ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);

@ -2103,7 +2180,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
        if (is_area_fast)
        {
            int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
-            buildOption = buildOption + format(" -D convertToPIXTYPE=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
+            buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
                                               " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
                                               ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
                                               ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
@ -2126,12 +2203,11 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
        }
        else
        {
-            buildOption = buildOption + format(" -D convertToPIXTYPE=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
+            buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
            k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
            if (k.empty())
                return false;

-            Size ssize = src.size();
            int xytab_size = (ssize.width + ssize.height) << 1;
            int tabofs_size = dsize.height + dsize.width + 2;

@ -2161,11 +2237,6 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
        return k.run(2, globalsize, NULL, false);
    }

-    if( k.empty() )
-        return false;
-    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
-           (float)inv_fx, (float)inv_fy);
-
    return k.run(2, globalsize, 0, false);
 }

--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@ -42,7 +42,6 @@

 #include "precomp.hpp"
 #include <limits.h>
-#include <stdio.h>
 #include "opencl_kernels.hpp"

 /****************************************************************************************\
@ -1291,9 +1290,10 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
 {
    CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);

+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

-    if (_src.depth() == CV_64F && !doubleSupport)
+    if (depth == CV_64F && !doubleSupport)
        return false;

    UMat kernel8U;
@ -1324,13 +1324,14 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
        return false;

    static const char * const op2str[] = { "ERODE", "DILATE" };
-    String buildOptions = format("-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D %s%s%s -D GENTYPE=%s -D DEPTH_%d",
-                                 anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], op2str[op],
+    String buildOptions = format("-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D %s%s%s"
+                                 " -D T=%s -D DEPTH_%d -D cn=%d -D T1=%s", anchor.x, anchor.y,
+                                 (int)localThreads[0], (int)localThreads[1], op2str[op],
                                 doubleSupport ? " -D DOUBLE_SUPPORT" : "", rectKernel ? " -D RECTKERNEL" : "",
-                                 ocl::typeToStr(_src.type()), _src.depth() );
+                                 ocl::typeToStr(_src.type()), _src.depth(), cn, ocl::typeToStr(depth));

    std::vector<ocl::Kernel> kernels;
-    for (int i = 0; i<iterations; i++)
+    for (int i = 0; i < iterations; i++)
    {
        ocl::Kernel k("morph", ocl::imgproc::morph_oclsrc, buildOptions);
        if (k.empty())
@ -1341,38 +1342,35 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
    _dst.create(src.size(), src.type());
    UMat dst = _dst.getUMat();

-    if( iterations== 1 && src.u != dst.u)
+    if (iterations == 1 && src.u != dst.u)
    {
        Size wholesize;
        Point ofs;
        src.locateROI(wholesize, ofs);
        int wholecols = wholesize.width, wholerows = wholesize.height;

-        int idxArg = 0;
-        idxArg = kernels[0].set(idxArg, ocl::KernelArg::ReadOnlyNoSize(src));
-        idxArg = kernels[0].set(idxArg, ocl::KernelArg::WriteOnlyNoSize(dst));
-        idxArg = kernels[0].set(idxArg, ofs.x);
-        idxArg = kernels[0].set(idxArg, ofs.y);
-        idxArg = kernels[0].set(idxArg, src.cols);
-        idxArg = kernels[0].set(idxArg, src.rows);
-        idxArg = kernels[0].set(idxArg, ocl::KernelArg::PtrReadOnly(kernel8U));
-        idxArg = kernels[0].set(idxArg, wholecols);
-        idxArg = kernels[0].set(idxArg, wholerows);
+        kernels[0].args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst),
+                        ofs.x, ofs.y, src.cols, src.rows, ocl::KernelArg::PtrReadOnly(kernel8U),
+                        wholecols, wholerows);

        return kernels[0].run(2, globalThreads, localThreads, false);
    }

-    for(int i = 0; i< iterations; i++)
+    for (int i = 0; i < iterations; i++)
    {
        UMat source;
        Size wholesize;
        Point ofs;
-        if( i == 0)
+
+        if (i == 0)
        {
            int cols =  src.cols, rows = src.rows;
            src.locateROI(wholesize,ofs);
            src.adjustROI(ofs.y, wholesize.height - rows - ofs.y, ofs.x, wholesize.width - cols - ofs.x);
-            src.copyTo(source);
+            if(src.u != dst.u)
+                source = src;
+            else
+                src.copyTo(source);
            src.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
            source.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
        }
@ -1385,20 +1383,11 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
            dst.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
            source.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
        }
-
        source.locateROI(wholesize, ofs);
-        int wholecols = wholesize.width, wholerows = wholesize.height;

-        int idxArg = 0;
-        idxArg = kernels[i].set(idxArg, ocl::KernelArg::ReadOnlyNoSize(source));
-        idxArg = kernels[i].set(idxArg, ocl::KernelArg::WriteOnlyNoSize(dst));
-        idxArg = kernels[i].set(idxArg, ofs.x);
-        idxArg = kernels[i].set(idxArg, ofs.y);
-        idxArg = kernels[i].set(idxArg, source.cols);
-        idxArg = kernels[i].set(idxArg, source.rows);
-        idxArg = kernels[i].set(idxArg, ocl::KernelArg::PtrReadOnly(kernel8U));
-        idxArg = kernels[i].set(idxArg, wholecols);
-        idxArg = kernels[i].set(idxArg, wholerows);
+        kernels[i].args(ocl::KernelArg::ReadOnlyNoSize(source), ocl::KernelArg::WriteOnlyNoSize(dst),
+                        ofs.x, ofs.y, source.cols, source.rows, ocl::KernelArg::PtrReadOnly(kernel8U),
+                        wholesize.width, wholesize.height);

        if (!kernels[i].run(2, globalThreads, localThreads, false))
            return false;
@ -1414,7 +1403,7 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
                     int borderType, const Scalar& borderValue )
 {
 #ifdef HAVE_OPENCL
-    int src_type = _src.type(), dst_type = _dst.type(),
+    int src_type = _src.type(),
        src_cn = CV_MAT_CN(src_type), src_depth = CV_MAT_DEPTH(src_type);
 #endif

@ -1427,13 +1416,13 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
        return;
 #endif

-    if( iterations == 0 || kernel.rows*kernel.cols == 1 )
+    if (iterations == 0 || kernel.rows*kernel.cols == 1)
    {
        _src.copyTo(_dst);
        return;
    }

-    if( !kernel.data )
+    if (!kernel.data)
    {
        kernel = getStructuringElement(MORPH_RECT, Size(1+iterations*2,1+iterations*2));
        anchor = Point(iterations, iterations);
@ -1449,8 +1438,7 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
        iterations = 1;
    }

-    CV_OCL_RUN(_dst.isUMat() && _src.size() == _dst.size() && src_type == dst_type &&
-               _src.dims() <= 2 && (src_cn == 1 || src_cn == 4) &&
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && src_cn <= 4 &&
               (src_depth == CV_8U || src_depth == CV_32F || src_depth == CV_64F ) &&
               borderType == cv::BORDER_CONSTANT && borderValue == morphologyDefaultBorderValue() &&
               (op == MORPH_ERODE || op == MORPH_DILATE),
--- a/modules/imgproc/src/opencl/bilateral.cl
+++ b/modules/imgproc/src/opencl/bilateral.cl
@ -32,6 +32,28 @@
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.

+#if cn != 3
+#define loadpix(addr) *(__global const uchar_t *)(addr)
+#define storepix(val, addr)  *(__global uchar_t *)(addr) = val
+#define TSIZE cn
+#else
+#define loadpix(addr) vload3(0, (__global const uchar *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global uchar *)(addr))
+#define TSIZE 3
+#endif
+
+#if cn == 1
+#define SUM(a) a
+#elif cn == 2
+#define SUM(a) a.x + a.y
+#elif cn == 3
+#define SUM(a) a.x + a.y + a.z
+#elif cn == 4
+#define SUM(a) a.x + a.y + a.z + a.w
+#else
+#error "cn should be <= 4"
+#endif
+
 __kernel void bilateral(__global const uchar * src, int src_step, int src_offset,
                        __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                        __constant float * color_weight, __constant float * space_weight, __constant int * space_ofs)
@ -41,19 +63,23 @@ __kernel void bilateral(__global const uchar * src, int src_step, int src_offset

    if (y < dst_rows && x < dst_cols)
    {
-        int src_index = mad24(y + radius, src_step, x + radius + src_offset);
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-        float sum = 0.f, wsum = 0.f;
-        int val0 = convert_int(src[src_index]);
+        int src_index = mad24(y + radius, src_step, mad24(x + radius, TSIZE, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
+
+        float_t sum = (float_t)(0.0f);
+        float wsum = 0.0f;
+        int_t val0 = convert_int_t(loadpix(src + src_index));

        #pragma unroll
        for (int k = 0; k < maxk; k++ )
        {
-            int val = convert_int(src[src_index + space_ofs[k]]);
-            float w = space_weight[k] * color_weight[abs(val - val0)];
-            sum += (float)(val) * w;
+            int_t val = convert_int_t(loadpix(src + src_index + space_ofs[k]));
+            uint_t diff = abs(val - val0);
+            float w = space_weight[k] * color_weight[SUM(diff)];
+            sum += convert_float_t(val) * (float_t)(w);
            wsum += w;
        }
-        dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f);
+
+        storepix(convert_uchar_t(sum / (float_t)(wsum)), dst + dst_index);
    }
 }
--- a/modules/imgproc/src/opencl/boxFilter.cl
+++ b/modules/imgproc/src/opencl/boxFilter.cl
@ -47,6 +47,18 @@
 #endif
 #endif

+#if cn != 3
+#define loadpix(addr) *(__global const ST *)(addr)
+#define storepix(val, addr)  *(__global DT *)(addr) = val
+#define SRCSIZE (int)sizeof(ST)
+#define DSTSIZE (int)sizeof(DT)
+#else
+#define loadpix(addr) vload3(0, (__global const ST1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global DT1 *)(addr))
+#define SRCSIZE (int)sizeof(ST1)*cn
+#define DSTSIZE (int)sizeof(DT1)*cn
+#endif
+
 #ifdef BORDER_CONSTANT
 #elif defined BORDER_REPLICATE
 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
@ -123,8 +135,8 @@ inline WT readSrcPixel(int2 pos, __global const uchar * srcptr, int src_step, co
    if (pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
 #endif
    {
-        int src_index = mad24(pos.y, src_step, pos.x * (int)sizeof(ST));
-        WT value = convertToWT(*(__global const ST *)(srcptr + src_index));
+        int src_index = mad24(pos.y, src_step, pos.x * SRCSIZE);
+        WT value = convertToWT(loadpix(srcptr + src_index));

        return PROCESS_ELEM(value);
    }
@ -143,8 +155,8 @@ inline WT readSrcPixel(int2 pos, __global const uchar * srcptr, int src_step, co
 #endif
            srcCoords.x2, srcCoords.y2);

-        int src_index = mad24(selected_row, src_step, selected_col * (int)sizeof(ST));
-        WT value = convertToWT(*(__global const ST *)(srcptr + src_index));
+        int src_index = mad24(selected_row, src_step, selected_col * SRCSIZE);
+        WT value = convertToWT(loadpix(srcptr + src_index));

        return PROCESS_ELEM(value);
 #endif
@ -180,7 +192,7 @@ __kernel void boxFilter(__global const uchar * srcptr, int src_step, int srcOffs
    sumOfCols[local_id] = tmp_sum;
    barrier(CLK_LOCAL_MEM_FENCE);

-    int dst_index = mad24(y, dst_step, x * (int)sizeof(DT) + dst_offset);
+    int dst_index = mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset));
    __global DT * dst = (__global DT *)(dstptr + dst_index);

    int sy_index = 0; // current index in data[] array
@ -196,10 +208,11 @@ __kernel void boxFilter(__global const uchar * srcptr, int src_step, int srcOffs
                total_sum += sumOfCols[local_id + sx - ANCHOR_X];

 #ifdef NORMALIZE
-            dst[0] = convertToDT((WT)(alpha) * total_sum);
+            DT dstval = convertToDT((WT)(alpha) * total_sum);
 #else
-            dst[0] = convertToDT(total_sum);
+            DT dstval = convertToDT(total_sum);
 #endif
+            storepix(dstval, dst);
        }
        barrier(CLK_LOCAL_MEM_FENCE);

--- a/modules/imgproc/src/opencl/canny.cl
+++ b/modules/imgproc/src/opencl/canny.cl
@ -43,6 +43,18 @@
 //
 //M*/

+#ifdef OP_SOBEL
+
+#if cn != 3
+#define loadpix(addr) convertToIntT(*(__global const ucharT *)(addr))
+#define storepix(val, addr) *(__global shortT *)(addr) = convertToShortT(val)
+#define shortSize (int)sizeof(shortT)
+#else
+#define loadpix(addr) convertToIntT(vload3(0, (__global const uchar *)(addr)))
+#define storepix(val, addr) vstore3(convertToShortT(val), 0, (__global short *)(addr))
+#define shortSize (int)sizeof(short) * cn
+#endif
+
 // Smoothing perpendicular to the derivative direction with a triangle filter
 // only supports the 3x3 Sobel kernel
 // h (-1) =  1, h (0) =  2, h (1) =  1
@ -54,11 +66,9 @@
 // dx_buf	output dx buffer
 // dy_buf	output dy buffer

-__kernel void __attribute__((reqd_work_group_size(16, 16, 1)))
-calcSobelRowPass
-    (__global const uchar * src, int src_step, int src_offset, int rows, int cols,
-     __global uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
-     __global uchar * dy_buf, int dy_buf_step, int dy_buf_offset)
+__kernel void calcSobelRowPass(__global const uchar * src, int src_step, int src_offset, int rows, int cols,
+                               __global uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
+                               __global uchar * dy_buf, int dy_buf_step, int dy_buf_offset)
 {
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);
@ -66,34 +76,39 @@ calcSobelRowPass
    int lidx = get_local_id(0);
    int lidy = get_local_id(1);

-    __local int smem[16][18];
+    __local intT smem[16][18];

-    smem[lidy][lidx + 1] = src[mad24(src_step, min(gidy, rows - 1), gidx + src_offset)];
+    smem[lidy][lidx + 1] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(gidx, cn, src_offset)));
    if (lidx == 0)
    {
-        smem[lidy][0]  = src[mad24(src_step, min(gidy, rows - 1), max(gidx - 1,  0)        + src_offset)];
-        smem[lidy][17] = src[mad24(src_step, min(gidy, rows - 1), min(gidx + 16, cols - 1) + src_offset)];
+        smem[lidy][0]  = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(max(gidx - 1,  0), cn, src_offset)));
+        smem[lidy][17] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(min(gidx + 16, cols - 1), cn, src_offset)));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (gidy < rows && gidx < cols)
    {
-        *(__global short *)(dx_buf + mad24(gidy, dx_buf_step, gidx * (int)sizeof(short) + dx_buf_offset)) =
-            smem[lidy][lidx + 2] - smem[lidy][lidx];
-        *(__global short *)(dy_buf + mad24(gidy, dy_buf_step, gidx * (int)sizeof(short) + dy_buf_offset)) =
-            smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
+        storepix(smem[lidy][lidx + 2] - smem[lidy][lidx],
+                 dx_buf + mad24(gidy, dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
+        storepix(mad24(2, smem[lidy][lidx + 1], smem[lidy][lidx] + smem[lidy][lidx + 2]),
+                 dy_buf + mad24(gidy, dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
    }
 }

-inline int calc(short x, short y)
+#elif defined OP_MAG_BUF || defined OP_MAG
+
+inline intT calc(shortT x, shortT y)
 {
 #ifdef L2GRAD
-    return x * x + y * y;
+    intT intx = convertToIntT(x), inty = convertToIntT(y);
+    return intx * intx + inty * inty;
 #else
-    return (x >= 0 ? x : -x) + (y >= 0 ? y : -y);
+    return convertToIntT( (x >= (shortT)(0) ? x : -x) + (y >= (shortT)(0) ? y : -y) );
 #endif
 }

+#ifdef OP_MAG
+
 // calculate the magnitude of the filter pass combining both x and y directions
 // This is the non-buffered version(non-3x3 sobel)
 //
@ -112,18 +127,43 @@ __kernel void calcMagnitude(__global const uchar * dxptr, int dx_step, int dx_of

    if (y < rows && x < cols)
    {
-        int dx_index = mad24(dx_step, y, x * (int)sizeof(short) + dx_offset);
-        int dy_index = mad24(dy_step, y, x * (int)sizeof(short) + dy_offset);
-        int mag_index = mad24(mag_step, y + 1, (x + 1) * (int)sizeof(int) + mag_offset);
+        int dx_index = mad24(dx_step, y, mad24(x, (int)sizeof(short) * cn, dx_offset));
+        int dy_index = mad24(dy_step, y, mad24(x, (int)sizeof(short) * cn, dy_offset));
+        int mag_index = mad24(mag_step, y + 1, mad24(x + 1, (int)sizeof(int), mag_offset));

-        __global const short * dx = (__global const short *)(dxptr + dx_index);
-        __global const short * dy = (__global const short *)(dyptr + dy_index);
+        __global short * dx = (__global short *)(dxptr + dx_index);
+        __global short * dy = (__global short *)(dyptr + dy_index);
        __global int * mag = (__global int *)(magptr + mag_index);

-        mag[0] = calc(dx[0], dy[0]);
+        int cmag = calc(dx[0], dy[0]);
+#if cn > 1
+        short cx = dx[0], cy = dy[0];
+        int pmag;
+
+        #pragma unroll
+        for (int i = 1; i < cn; ++i)
+        {
+            pmag = calc(dx[i], dy[i]);
+            if (pmag > cmag)
+                cmag = pmag, cx = dx[i], cy = dy[i];
+        }
+
+        dx[0] = cx, dy[0] = cy;
+#endif
+        mag[0] = cmag;
    }
 }

+#elif defined OP_MAG_BUF
+
+#if cn != 3
+#define loadpix(addr) *(__global const shortT *)(addr)
+#define shortSize (int)sizeof(shortT)
+#else
+#define loadpix(addr) vload3(0, (__global const short *)(addr))
+#define shortSize (int)sizeof(short)*cn
+#endif
+
 // calculate the magnitude of the filter pass combining both x and y directions
 // This is the buffered version(3x3 sobel)
 //
@ -132,59 +172,64 @@ __kernel void calcMagnitude(__global const uchar * dxptr, int dx_step, int dx_of
 // dx			derivative in x direction output
 // dy			derivative in y direction output
 // mag			magnitude derivative of xy output
-__kernel void __attribute__((reqd_work_group_size(16, 16, 1)))
-calcMagnitude_buf
-    (__global const short * dx_buf, int dx_buf_step, int dx_buf_offset,
-     __global const short * dy_buf, int dy_buf_step, int dy_buf_offset,
-     __global short * dx, int dx_step, int dx_offset,
-     __global short * dy, int dy_step, int dy_offset,
-     __global int * mag, int mag_step, int mag_offset,
-     int rows, int cols)
+__kernel void calcMagnitude_buf(__global const uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
+                                __global const uchar * dy_buf, int dy_buf_step, int dy_buf_offset,
+                                __global uchar * dx, int dx_step, int dx_offset,
+                                __global uchar * dy, int dy_step, int dy_offset,
+                                __global uchar * mag, int mag_step, int mag_offset, int rows, int cols)
 {
-    dx_buf_step    /= sizeof(*dx_buf);
-    dx_buf_offset  /= sizeof(*dx_buf);
-    dy_buf_step    /= sizeof(*dy_buf);
-    dy_buf_offset  /= sizeof(*dy_buf);
-    dx_step    /= sizeof(*dx);
-    dx_offset  /= sizeof(*dx);
-    dy_step    /= sizeof(*dy);
-    dy_offset  /= sizeof(*dy);
-    mag_step   /= sizeof(*mag);
-    mag_offset /= sizeof(*mag);
-
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

    int lidx = get_local_id(0);
    int lidy = get_local_id(1);

-    __local short sdx[18][16];
-    __local short sdy[18][16];
+    __local shortT sdx[18][16];
+    __local shortT sdy[18][16];

-    sdx[lidy + 1][lidx] = dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset];
-    sdy[lidy + 1][lidx] = dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset];
+    sdx[lidy + 1][lidx] = loadpix(dx_buf + mad24(min(gidy, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
+    sdy[lidy + 1][lidx] = loadpix(dy_buf + mad24(min(gidy, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
    if (lidy == 0)
    {
-        sdx[0][lidx]  = dx_buf[gidx + min(max(gidy - 1, 0), rows - 1) * dx_buf_step + dx_buf_offset];
-        sdx[17][lidx] = dx_buf[gidx + min(gidy + 16, rows - 1)        * dx_buf_step + dx_buf_offset];
+        sdx[0][lidx]  = loadpix(dx_buf + mad24(clamp(gidy - 1, 0, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
+        sdx[17][lidx] = loadpix(dx_buf + mad24(min(gidy + 16, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));

-        sdy[0][lidx]  = dy_buf[gidx + min(max(gidy - 1, 0), rows - 1) * dy_buf_step + dy_buf_offset];
-        sdy[17][lidx] = dy_buf[gidx + min(gidy + 16, rows - 1)        * dy_buf_step + dy_buf_offset];
+        sdy[0][lidx]  = loadpix(dy_buf + mad24(clamp(gidy - 1, 0, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
+        sdy[17][lidx] = loadpix(dy_buf + mad24(min(gidy + 16, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (gidx < cols && gidy < rows)
    {
-        short x =  sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
-        short y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
+        shortT x = sdx[lidy + 1][lidx] * (shortT)(2) + sdx[lidy][lidx] + sdx[lidy + 2][lidx];
+        shortT y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
+
+#if cn == 1
+        *(__global short *)(dx + mad24(gidy, dx_step, mad24(gidx, shortSize, dx_offset))) = x;
+        *(__global short *)(dy + mad24(gidy, dy_step, mad24(gidx, shortSize, dy_offset))) = y;
+
+        *(__global int *)(mag + mad24(gidy + 1, mag_step, mad24(gidx + 1, (int)sizeof(int), mag_offset))) = calc(x, y);
+#elif cn == 3
+        intT magv = calc(x, y);
+        short cx = x.x, cy = y.x;
+        int cmag = magv.x;

-        dx[gidx + gidy * dx_step + dx_offset] = x;
-        dy[gidx + gidy * dy_step + dy_offset] = y;
+        if (cmag < magv.y)
+            cx = x.y, cy = y.y, cmag = magv.y;
+        if (cmag < magv.z)
+            cx = x.z, cy = y.z, cmag = magv.z;

-        mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y);
+        *(__global short *)(dx + mad24(gidy, dx_step, mad24(gidx, shortSize, dx_offset))) = cx;
+        *(__global short *)(dy + mad24(gidy, dy_step, mad24(gidx, shortSize, dy_offset))) = cy;
+
+        *(__global int *)(mag + mad24(gidy + 1, mag_step, mad24(gidx + 1, (int)sizeof(int), mag_offset))) = cmag;
+#endif
    }
 }

+#endif
+
+#elif defined OP_MAP

 //////////////////////////////////////////////////////////////////////////////////////////
 // 0.4142135623730950488016887242097 is tan(22.5 degrees)
@ -208,13 +253,11 @@ calcMagnitude_buf
 // mag			magnitudes calculated from calcMagnitude function
 // map			output containing raw edge types

-__kernel void __attribute__((reqd_work_group_size(16,16,1)))
-calcMap(
-    __global const uchar * dx, int dx_step, int dx_offset,
-    __global const uchar * dy, int dy_step, int dy_offset,
-    __global const uchar * mag, int mag_step, int mag_offset,
-    __global uchar * map, int map_step, int map_offset,
-    int rows, int cols, int low_thresh, int high_thresh)
+__kernel void calcMap(__global const uchar * dx, int dx_step, int dx_offset,
+                      __global const uchar * dy, int dy_step, int dy_offset,
+                      __global const uchar * mag, int mag_step, int mag_offset,
+                      __global uchar * map, int map_step, int map_offset,
+                      int rows, int cols, int low_thresh, int high_thresh)
 {
    __local int smem[18][18];

@ -227,7 +270,7 @@ calcMap(
    int grp_idx = get_global_id(0) & 0xFFFFF0;
    int grp_idy = get_global_id(1) & 0xFFFFF0;

-    int tid = lidx + lidy * 16;
+    int tid = mad24(lidy, 16, lidx);
    int lx = tid % 18;
    int ly = tid / 18;

@ -250,8 +293,8 @@ calcMap(

        if (m > low_thresh)
        {
-            short xs = *(__global const short *)(dx + mad24(gidy, dx_step, dx_offset + (int)sizeof(short) * gidx));
-            short ys = *(__global const short *)(dy + mad24(gidy, dy_step, dy_offset + (int)sizeof(short) * gidx));
+            short xs = *(__global const short *)(dx + mad24(gidy, dx_step, mad24(gidx, (int)sizeof(short) * cn, dx_offset)));
+            short ys = *(__global const short *)(dy + mad24(gidy, dy_step, mad24(gidx, (int)sizeof(short) * cn, dy_offset)));
            int x = abs(xs), y = abs(ys);

            int tg22x = x * TG22;
@ -278,13 +321,15 @@ calcMap(
                }
            }
        }
-        *(__global int *)(map + mad24(map_step, gidy + 1, (gidx + 1) * (int)sizeof(int) + map_offset)) = edge_type;
+        *(__global int *)(map + mad24(map_step, gidy + 1, mad24(gidx + 1, (int)sizeof(int), + map_offset))) = edge_type;
    }
 }

 #undef CANNY_SHIFT
 #undef TG22

+#elif defined OP_HYST_LOCAL
+
 struct PtrStepSz
 {
    __global uchar * ptr;
@ -312,11 +357,9 @@ inline void set(struct PtrStepSz data, int y, int x, int value)
 // stack	the potential edge points found in this kernel call
 // counter	the number of potential edge points

-__kernel void __attribute__((reqd_work_group_size(16,16,1)))
-edgesHysteresisLocal
-    (__global uchar * map_ptr, int map_step, int map_offset,
-     __global ushort2 * st, __global unsigned int * counter,
-    int rows, int cols)
+__kernel void edgesHysteresisLocal(__global uchar * map_ptr, int map_step, int map_offset,
+                                   __global ushort2 * st, __global unsigned int * counter,
+                                   int rows, int cols)
 {
    struct PtrStepSz map = { map_ptr + map_offset, map_step, rows + 1, cols + 1 };

@ -402,6 +445,8 @@ edgesHysteresisLocal
    }
 }

+#elif defined OP_HYST_GLOBAL
+
 __constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
 __constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};

@ -409,10 +454,9 @@ __constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
 #define stack_size 512
 #define map_index mad24(map_step, pos.y, pos.x * (int)sizeof(int))

-__kernel void __attribute__((reqd_work_group_size(128, 1, 1)))
-edgesHysteresisGlobal(__global uchar * map, int map_step, int map_offset,
-    __global ushort2 * st1, __global ushort2 * st2, __global int * counter,
-    int rows, int cols, int count)
+__kernel void edgesHysteresisGlobal(__global uchar * map, int map_step, int map_offset,
+                                    __global ushort2 * st1, __global ushort2 * st2, __global int * counter,
+                                    int rows, int cols, int count)
 {
    map += map_offset;

@ -492,6 +536,8 @@ edgesHysteresisGlobal(__global uchar * map, int map_step, int map_offset,
 #undef map_index
 #undef stack_size

+#elif defined OP_EDGES
+
 // Get the edge result. An edge type of value 2 is marked as an edge point and set to 255; otherwise it is set to 0.
 // map		edge type mappings
 // dst		edge output
@ -504,7 +550,7 @@ __kernel void getEdges(__global const uchar * mapptr, int map_step, int map_offs

    if (y < rows && x < cols)
    {
-        int map_index = mad24(map_step, y + 1, (x + 1) * (int)sizeof(int) + map_offset);
+        int map_index = mad24(map_step, y + 1, mad24(x + 1, (int)sizeof(int), map_offset));
        int dst_index = mad24(dst_step, y, x + dst_offset);

        __global const int * map = (__global const int *)(mapptr + map_index);
@ -512,3 +558,5 @@ __kernel void getEdges(__global const uchar * mapptr, int map_step, int map_offs
        dst[dst_index] = (uchar)(-(map[0] >> 1));
    }
 }
+
+#endif
--- a/modules/imgproc/src/opencl/filter2D.cl
+++ b/modules/imgproc/src/opencl/filter2D.cl
@ -122,7 +122,7 @@
    }
 #ifdef BORDER_REFLECT
 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
-#elif defined(BORDER_REFLECT_101)
+#elif defined(BORDER_REFLECT_101) || defined(BORDER_REFLECT101)
 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
 #endif
 #else
@ -142,109 +142,49 @@
    }
 #endif

-#if USE_DOUBLE
+#ifdef DOUBLE_SUPPORT
 #ifdef cl_amd_fp64
 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 #elif defined (cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
-#define FPTYPE double
-#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE)
-#else
-#define FPTYPE float
-#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE)
-#endif
-
-#if DATA_DEPTH == 0
-#define BASE_TYPE uchar
-#elif DATA_DEPTH == 1
-#define BASE_TYPE char
-#elif DATA_DEPTH == 2
-#define BASE_TYPE ushort
-#elif DATA_DEPTH == 3
-#define BASE_TYPE short
-#elif DATA_DEPTH == 4
-#define BASE_TYPE int
-#elif DATA_DEPTH == 5
-#define BASE_TYPE float
-#elif DATA_DEPTH == 6
-#define BASE_TYPE double
-#else
-#error data_depth
 #endif

-#define __CAT(x, y) x##y
-#define CAT(x, y) __CAT(x, y)
-
-#define uchar1 uchar
-#define char1 char
-#define ushort1 ushort
-#define short1 short
-#define int1 int
-#define float1 float
-#define double1 double
-
-#define convert_uchar1_sat_rte convert_uchar_sat_rte
-#define convert_char1_sat_rte convert_char_sat_rte
-#define convert_ushort1_sat_rte convert_ushort_sat_rte
-#define convert_short1_sat_rte convert_short_sat_rte
-#define convert_int1_sat_rte convert_int_sat_rte
-#define convert_float1
-#define convert_double1
-
-#if DATA_DEPTH == 5 || DATA_DEPTH == 6
-#define CONVERT_TO_TYPE CAT(CAT(convert_, BASE_TYPE), VEC_SIZE)
+#if cn != 3
+#define loadpix(addr) *(__global const srcT *)(addr)
+#define storepix(val, addr)  *(__global dstT *)(addr) = val
+#define SRCSIZE (int)sizeof(srcT)
+#define DSTSIZE (int)sizeof(dstT)
 #else
-#define CONVERT_TO_TYPE CAT(CAT(CAT(convert_, BASE_TYPE), VEC_SIZE), _sat_rte)
+#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
+#define SRCSIZE (int)sizeof(srcT1) * cn
+#define DSTSIZE (int)sizeof(dstT1) * cn
 #endif

-#define VEC_SIZE DATA_CHAN
-
-#define VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
-#define TYPE VEC_TYPE
-
-#define SCALAR_TYPE CAT(FPTYPE, VEC_SIZE)
-
-#define INTERMEDIATE_TYPE CAT(FPTYPE, VEC_SIZE)
+#define noconvert

 struct RectCoords
 {
    int x1, y1, x2, y2;
 };

-//#define DEBUG
-#ifdef DEBUG
-#define DEBUG_ONLY(x) x
-#define ASSERT(condition) do { if (!(condition)) { printf("BUG in boxFilter kernel (global=%d,%d): " #condition "\n", get_global_id(0), get_global_id(1)); } } while (0)
-#else
-#define DEBUG_ONLY(x) (void)0
-#define ASSERT(condition) (void)0
-#endif
-
-
-inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global const uchar* srcptr, int srcstep, const struct RectCoords srcCoords
-#ifdef BORDER_CONSTANT
-               , SCALAR_TYPE borderValue
-#endif
-    )
+inline WT readSrcPixel(int2 pos, __global const uchar * srcptr, int src_step, const struct RectCoords srcCoords)
 {
 #ifdef BORDER_ISOLATED
-    if(pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
+    if (pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
 #else
-    if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
+    if (pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
 #endif
    {
-        //__global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
-        __global TYPE* ptr = (__global TYPE*)(srcptr + pos.y * srcstep + pos.x * sizeof(TYPE));
-        return CONVERT_TO_FPTYPE(*ptr);
+        return convertToWT(loadpix(srcptr + mad24(pos.y, src_step, pos.x * SRCSIZE)));
    }
    else
    {
 #ifdef BORDER_CONSTANT
-        return borderValue;
+        return (WT)(0);
 #else
-        int selected_col = pos.x;
-        int selected_row = pos.y;
+        int selected_col = pos.x, selected_row = pos.y;

        EXTRAPOLATE(selected_col, selected_row,
 #ifdef BORDER_ISOLATED
@ -255,68 +195,43 @@ inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global const uchar* srcptr, in
                srcCoords.x2, srcCoords.y2
         );

-        // debug border mapping
-        //printf("pos=%d,%d --> %d, %d\n", pos.x, pos.y, selected_col, selected_row);
-
-        pos = (int2)(selected_col, selected_row);
-        if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
-        {
-            //__global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
-            __global TYPE* ptr = (__global TYPE*)(srcptr + pos.y * srcstep + pos.x * sizeof(TYPE));
-            return CONVERT_TO_FPTYPE(*ptr);
-        }
-        else
-        {
-            // for debug only
-            DEBUG_ONLY(printf("BUG in boxFilter kernel\n"));
-            return (FPTYPE)(0.0f);
-        }
+        return convertToWT(loadpix(srcptr + mad24(selected_row, src_step, selected_col * SRCSIZE)));
 #endif
    }
 }

-// INPUT PARAMETER: BLOCK_SIZE_Y (via defines)
+#define DIG(a) a,
+__constant WT1 kernelData[] = { COEFF };

-__kernel
-__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
-void filter2D(__global const uchar* srcptr, int srcstep, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,
-                __global uchar* dstptr, int dststep, int dstoffset,
-               int rows, int cols,
-#ifdef BORDER_CONSTANT
-               SCALAR_TYPE borderValue,
-#endif
-               __constant FPTYPE* kernelData // transposed: [KERNEL_SIZE_X][KERNEL_SIZE_Y2_ALIGNED]
-               )
+__kernel void filter2D(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,
+                       __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols, float delta)
 {
-    const struct RectCoords srcCoords = {srcOffsetX, srcOffsetY, srcEndX, srcEndY}; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
+    const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY }; // for non-isolated border: offsetX, offsetY, wholeX, wholeY

-    const int local_id = get_local_id(0);
-    const int x = local_id + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
-    const int y = get_global_id(1) * BLOCK_SIZE_Y;
+    int local_id = get_local_id(0);
+    int x = local_id + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
+    int y = get_global_id(1) * BLOCK_SIZE_Y;

-    INTERMEDIATE_TYPE data[KERNEL_SIZE_Y];
-    __local INTERMEDIATE_TYPE sumOfCols[LOCAL_SIZE];
+    WT data[KERNEL_SIZE_Y];
+    __local WT sumOfCols[LOCAL_SIZE];

    int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y);

    int2 pos = (int2)(x, y);
-    __global TYPE* dstPtr = (__global TYPE*)((__global char*)dstptr + pos.y * dststep + dstoffset + pos.x * sizeof(TYPE)); // Pointer can be out of bounds!
-    bool writeResult = ((local_id >= ANCHOR_X) && (local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X)) &&
-                        (pos.x >= 0) && (pos.x < cols));
+    __global dstT * dst = (__global dstT *)(dstptr + mad24(pos.y, dst_step, mad24(pos.x, DSTSIZE, dst_offset))); // Pointer can be out of bounds!
+    bool writeResult = local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&
+                        pos.x >= 0 && pos.x < cols;

 #if BLOCK_SIZE_Y > 1
    bool readAllpixels = true;
    int sy_index = 0; // current index in data[] array

    dstRowsMax = min(rows, pos.y + BLOCK_SIZE_Y);
-    for (;
-         pos.y < dstRowsMax;
-         pos.y++,
-         dstPtr = (__global TYPE*)((__global char*)dstptr + dststep))
+    for ( ;
+          pos.y < dstRowsMax;
+          pos.y++, dst = (__global dstT *)((__global uchar *)dst + dst_step))
 #endif
    {
-        ASSERT(pos.y < dstRowsMax);
-
        for (
 #if BLOCK_SIZE_Y > 1
            int sy = readAllpixels ? 0 : -1; sy < (readAllpixels ? KERNEL_SIZE_Y : 0);
@ -325,27 +240,21 @@ void filter2D(__global const uchar* srcptr, int srcstep, int srcOffsetX, int src
 #endif
            sy++, srcPos.y++)
        {
-            data[sy + sy_index] = readSrcPixel(srcPos, srcptr, srcstep, srcCoords
-#ifdef BORDER_CONSTANT
-                    , borderValue
-#endif
-                    );
+            data[sy + sy_index] = readSrcPixel(srcPos, srcptr, src_step, srcCoords);
        }

-        INTERMEDIATE_TYPE total_sum = 0;
+        WT total_sum = 0;
        for (int sx = 0; sx < KERNEL_SIZE_X; sx++)
        {
            {
-                __constant FPTYPE* k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * sx
+                __constant WT1 * k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * sx
 #if BLOCK_SIZE_Y > 1
                                                   + KERNEL_SIZE_Y - sy_index
 #endif
                                                   ];
-                INTERMEDIATE_TYPE tmp_sum = 0;
+                WT tmp_sum = 0;
                for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)
-                {
                    tmp_sum += data[sy] * k[sy];
-                }

                sumOfCols[local_id] = tmp_sum;
                barrier(CLK_LOCAL_MEM_FENCE);
@ -359,14 +268,12 @@ void filter2D(__global const uchar* srcptr, int srcstep, int srcOffsetX, int src
        }

        if (writeResult)
-        {
-            *dstPtr = CONVERT_TO_TYPE(total_sum);
-        }
+            storepix(convertToDstT(total_sum + (WT)(delta)), dst);

 #if BLOCK_SIZE_Y > 1
        readAllpixels = false;
 #if BLOCK_SIZE_Y > KERNEL_SIZE_Y
-        sy_index = (sy_index + 1 <= KERNEL_SIZE_Y) ? sy_index + 1 : 1;
+        sy_index = sy_index + 1 <= KERNEL_SIZE_Y ? sy_index + 1 : 1;
 #else
        sy_index++;
 #endif
--- a/modules/imgproc/src/opencl/filterSepCol.cl
+++ b/modules/imgproc/src/opencl/filterSepCol.cl
@ -34,47 +34,36 @@
 //
 //

+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#endif
+
 #define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)
 #define RADIUS 1
-#if CN ==1
-#define ALIGN (((RADIUS)+3)>>2<<2)
-#elif CN==2
-#define ALIGN (((RADIUS)+1)>>1<<1)
-#elif CN==3
-#define ALIGN (((RADIUS)+3)>>2<<2)
-#elif CN==4
-#define ALIGN (RADIUS)
-#define READ_TIMES_ROW ((2*(RADIUS+LSIZE0)-1)/LSIZE0)
-#endif

 #define noconvert

-/**********************************************************************************
-These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur.
-Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle
-kernel must be in the center. ROI is not supported either.
-Each kernels read 4 elements(not 4 pixels), save them to LDS and read the data needed
-from LDS to calculate the result.
-The length of the convovle kernel supported is only related to the MAX size of LDS,
-which is HW related.
-Niko
-6/29/2011
-The info above maybe obsolete.
-***********************************************************************************/
+#if CN != 3
+#define loadpix(addr) *(__global const srcT *)(addr)
+#define storepix(val, addr)  *(__global dstT *)(addr) = val
+#define SRCSIZE (int)sizeof(srcT)
+#define DSTSIZE (int)sizeof(dstT)
+#else
+#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
+#define SRCSIZE (int)sizeof(srcT1)*3
+#define DSTSIZE (int)sizeof(dstT1)*3
+#endif

 #define DIG(a) a,
 __constant float mat_kernel[] = { COEFF };

-__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
-                        (__global const GENTYPE_SRC * restrict src,
-                         const int src_step_in_pixel,
-                         const int src_whole_cols,
-                         const int src_whole_rows,
-                         __global GENTYPE_DST * dst,
-                         const int dst_offset_in_pixel,
-                         const int dst_step_in_pixel,
-                         const int dst_cols,
-                         const int dst_rows)
+__kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols,
+                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@ -82,38 +71,38 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);

-    int start_addr = mad24(y, src_step_in_pixel, x);
-    int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
+    int start_addr = mad24(y, src_step, x * SRCSIZE);
+    int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE);

-    int i;
-    GENTYPE_SRC sum, temp[READ_TIMES_COL];
-    __local GENTYPE_SRC LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];
+    srcT sum, temp[READ_TIMES_COL];
+    __local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];

-    //read pixels from src
-    for(i = 0;i<READ_TIMES_COL;i++)
+    // read pixels from src
+    for (int i = 0; i < READ_TIMES_COL; ++i)
    {
-        int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
+        int current_addr = mad24(i, LSIZE1 * src_step, start_addr);
        current_addr = current_addr < end_addr ? current_addr : 0;
-        temp[i] = src[current_addr];
-    }
-    //save pixels to lds
-    for(i = 0;i<READ_TIMES_COL;i++)
-    {
-        LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
+        temp[i] = loadpix(src + current_addr);
    }
+
+    // save pixels to lds
+    for (int i = 0; i < READ_TIMES_COL; ++i)
+        LDS_DAT[mad24(i, LSIZE1, l_y)][l_x] = temp[i];
    barrier(CLK_LOCAL_MEM_FENCE);
-    //read pixels from lds and calculate the result
-    sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
-    for(i=1;i<=RADIUSY;i++)
+
+    // read pixels from lds and calculate the result
+    sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY];
+    for (int i = 1; i <= RADIUSY; ++i)
    {
-        temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
-        temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
-        sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
+        temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
+        temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
+        sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
    }
-    //write the result to dst
-    if((x<dst_cols) & (y<dst_rows))
+
+    // write the result to dst
+    if (x < dst_cols && y < dst_rows)
    {
-        start_addr = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel);
-        dst[start_addr] = convert_to_DST(sum);
+        start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset));
+        storepix(convertToDstT(sum + (srcT)(delta)), dst + start_addr);
    }
 }
--- a/Show More
+++ b/Show More