diff --git a/cmake/FindCUDA.cmake b/cmake/FindCUDA.cmake new file mode 100644 index 0000000000..35f6497c9e --- /dev/null +++ b/cmake/FindCUDA.cmake @@ -0,0 +1,1792 @@ +#.rst: +# FindCUDA +# -------- +# +# Tools for building CUDA C files: libraries and build dependencies. +# +# This script locates the NVIDIA CUDA C tools. It should work on linux, +# windows, and mac and should be reasonably up to date with CUDA C +# releases. +# +# This script makes use of the standard find_package arguments of +# , REQUIRED and QUIET. CUDA_FOUND will report if an +# acceptable version of CUDA was found. +# +# The script will prompt the user to specify CUDA_TOOLKIT_ROOT_DIR if +# the prefix cannot be determined by the location of nvcc in the system +# path and REQUIRED is specified to find_package(). To use a different +# installed version of the toolkit set the environment variable +# CUDA_BIN_PATH before running cmake (e.g. +# CUDA_BIN_PATH=/usr/local/cuda1.0 instead of the default +# /usr/local/cuda) or set CUDA_TOOLKIT_ROOT_DIR after configuring. If +# you change the value of CUDA_TOOLKIT_ROOT_DIR, various components that +# depend on the path will be relocated. +# +# It might be necessary to set CUDA_TOOLKIT_ROOT_DIR manually on certain +# platforms, or to use a cuda runtime not installed in the default +# location. In newer versions of the toolkit the cuda library is +# included with the graphics driver- be sure that the driver version +# matches what is needed by the cuda runtime version. +# +# The following variables affect the behavior of the macros in the +# script (in alphebetical order). Note that any of these flags can be +# changed multiple times in the same directory before calling +# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX +# or CUDA_WRAP_SRCS. +# +# :: +# +# CUDA_64_BIT_DEVICE_CODE (Default matches host bit size) +# -- Set to ON to compile for 64 bit device code, OFF for 32 bit device code. 
+# Note that making this different from the host code when generating object +# or C files from CUDA code just won't work, because size_t gets defined by +# nvcc in the generated source. If you compile to PTX and then load the +# file yourself, you can mix bit sizes between device and host. +# +# +# +# :: +# +# CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE (Default ON) +# -- Set to ON if you want the custom build rule to be attached to the source +# file in Visual Studio. Turn OFF if you add the same cuda file to multiple +# targets. +# +# +# +# :: +# +# This allows the user to build the target from the CUDA file; however, bad +# things can happen if the CUDA source file is added to multiple targets. +# When performing parallel builds it is possible for the custom build +# command to be run more than once and in parallel causing cryptic build +# errors. VS runs the rules for every source file in the target, and a +# source can have only one rule no matter how many projects it is added to. +# When the rule is run from multiple targets race conditions can occur on +# the generated file. Eventually everything will get built, but if the user +# is unaware of this behavior, there may be confusion. It would be nice if +# this script could detect the reuse of source files across multiple targets +# and turn the option off for the user, but no good solution could be found. +# +# +# +# :: +# +# CUDA_BUILD_CUBIN (Default OFF) +# -- Set to ON to enable and extra compilation pass with the -cubin option in +# Device mode. The output is parsed and register, shared memory usage is +# printed during build. +# +# +# +# :: +# +# CUDA_BUILD_EMULATION (Default OFF for device mode) +# -- Set to ON for Emulation mode. -D_DEVICEEMU is defined for CUDA C files +# when CUDA_BUILD_EMULATION is TRUE. +# +# +# +# :: +# +# CUDA_GENERATED_OUTPUT_DIR (Default CMAKE_CURRENT_BINARY_DIR) +# -- Set to the path you wish to have the generated files placed. 
If it is +# blank output files will be placed in CMAKE_CURRENT_BINARY_DIR. +# Intermediate files will always be placed in +# CMAKE_CURRENT_BINARY_DIR/CMakeFiles. +# +# +# +# :: +# +# CUDA_HOST_COMPILATION_CPP (Default ON) +# -- Set to OFF for C compilation of host code. +# +# +# +# :: +# +# CUDA_HOST_COMPILER (Default CMAKE_C_COMPILER, $(VCInstallDir)/bin for VS) +# -- Set the host compiler to be used by nvcc. Ignored if -ccbin or +# --compiler-bindir is already present in the CUDA_NVCC_FLAGS or +# CUDA_NVCC_FLAGS_ variables. For Visual Studio targets +# $(VCInstallDir)/bin is a special value that expands out to the path when +# the command is run from withing VS. +# +# +# +# :: +# +# CUDA_NVCC_FLAGS +# CUDA_NVCC_FLAGS_ +# -- Additional NVCC command line arguments. NOTE: multiple arguments must be +# semi-colon delimited (e.g. --compiler-options;-Wall) +# +# +# +# :: +# +# CUDA_PROPAGATE_HOST_FLAGS (Default ON) +# -- Set to ON to propagate CMAKE_{C,CXX}_FLAGS and their configuration +# dependent counterparts (e.g. CMAKE_C_FLAGS_DEBUG) automatically to the +# host compiler through nvcc's -Xcompiler flag. This helps make the +# generated host code match the rest of the system better. Sometimes +# certain flags give nvcc problems, and this will help you turn the flag +# propagation off. This does not affect the flags supplied directly to nvcc +# via CUDA_NVCC_FLAGS or through the OPTION flags specified through +# CUDA_ADD_LIBRARY, CUDA_ADD_EXECUTABLE, or CUDA_WRAP_SRCS. Flags used for +# shared library compilation are not affected by this flag. +# +# +# +# :: +# +# CUDA_SEPARABLE_COMPILATION (Default OFF) +# -- If set this will enable separable compilation for all CUDA runtime object +# files. If used outside of CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY +# (e.g. calling CUDA_WRAP_SRCS directly), +# CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME and +# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS should be called. 
+# +# +# +# :: +# +# CUDA_VERBOSE_BUILD (Default OFF) +# -- Set to ON to see all the commands used when building the CUDA file. When +# using a Makefile generator the value defaults to VERBOSE (run make +# VERBOSE=1 to see output), although setting CUDA_VERBOSE_BUILD to ON will +# always print the output. +# +# +# +# The script creates the following macros (in alphebetical order): +# +# :: +# +# CUDA_ADD_CUFFT_TO_TARGET( cuda_target ) +# -- Adds the cufft library to the target (can be any target). Handles whether +# you are in emulation mode or not. +# +# +# +# :: +# +# CUDA_ADD_CUBLAS_TO_TARGET( cuda_target ) +# -- Adds the cublas library to the target (can be any target). Handles +# whether you are in emulation mode or not. +# +# +# +# :: +# +# CUDA_ADD_EXECUTABLE( cuda_target file0 file1 ... +# [WIN32] [MACOSX_BUNDLE] [EXCLUDE_FROM_ALL] [OPTIONS ...] ) +# -- Creates an executable "cuda_target" which is made up of the files +# specified. All of the non CUDA C files are compiled using the standard +# build rules specified by CMAKE and the cuda files are compiled to object +# files using nvcc and the host compiler. In addition CUDA_INCLUDE_DIRS is +# added automatically to include_directories(). Some standard CMake target +# calls can be used on the target after calling this macro +# (e.g. set_target_properties and target_link_libraries), but setting +# properties that adjust compilation flags will not affect code compiled by +# nvcc. Such flags should be modified before calling CUDA_ADD_EXECUTABLE, +# CUDA_ADD_LIBRARY or CUDA_WRAP_SRCS. +# +# +# +# :: +# +# CUDA_ADD_LIBRARY( cuda_target file0 file1 ... +# [STATIC | SHARED | MODULE] [EXCLUDE_FROM_ALL] [OPTIONS ...] ) +# -- Same as CUDA_ADD_EXECUTABLE except that a library is created. +# +# +# +# :: +# +# CUDA_BUILD_CLEAN_TARGET() +# -- Creates a convience target that deletes all the dependency files +# generated. You should make clean after running this target to ensure the +# dependency files get regenerated. 
+# +# +# +# :: +# +# CUDA_COMPILE( generated_files file0 file1 ... [STATIC | SHARED | MODULE] +# [OPTIONS ...] ) +# -- Returns a list of generated files from the input source files to be used +# with ADD_LIBRARY or ADD_EXECUTABLE. +# +# +# +# :: +# +# CUDA_COMPILE_PTX( generated_files file0 file1 ... [OPTIONS ...] ) +# -- Returns a list of PTX files generated from the input source files. +# +# +# +# :: +# +# CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME( output_file_var +# cuda_target +# object_files ) +# -- Compute the name of the intermediate link file used for separable +# compilation. This file name is typically passed into +# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS. output_file_var is produced +# based on cuda_target the list of objects files that need separable +# compilation as specified by object_files. If the object_files list is +# empty, then output_file_var will be empty. This function is called +# automatically for CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE. Note that +# this is a function and not a macro. +# +# +# +# :: +# +# CUDA_INCLUDE_DIRECTORIES( path0 path1 ... ) +# -- Sets the directories that should be passed to nvcc +# (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu +# files. +# +# +# +# +# +# :: +# +# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS( output_file_var cuda_target +# nvcc_flags object_files) +# +# +# +# :: +# +# -- Generates the link object required by separable compilation from the given +# object files. This is called automatically for CUDA_ADD_EXECUTABLE and +# CUDA_ADD_LIBRARY, but can be called manually when using CUDA_WRAP_SRCS +# directly. When called from CUDA_ADD_LIBRARY or CUDA_ADD_EXECUTABLE the +# nvcc_flags passed in are the same as the flags passed in via the OPTIONS +# argument. The only nvcc flag added automatically is the bitness flag as +# specified by CUDA_64_BIT_DEVICE_CODE. Note that this is a function +# instead of a macro. 
+# +# +# +# :: +# +# CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ... +# [STATIC | SHARED | MODULE] [OPTIONS ...] ) +# -- This is where all the magic happens. CUDA_ADD_EXECUTABLE, +# CUDA_ADD_LIBRARY, CUDA_COMPILE, and CUDA_COMPILE_PTX all call this +# function under the hood. +# +# +# +# :: +# +# Given the list of files (file0 file1 ... fileN) this macro generates +# custom commands that generate either PTX or linkable objects (use "PTX" or +# "OBJ" for the format argument to switch). Files that don't end with .cu +# or have the HEADER_FILE_ONLY property are ignored. +# +# +# +# :: +# +# The arguments passed in after OPTIONS are extra command line options to +# give to nvcc. You can also specify per configuration options by +# specifying the name of the configuration followed by the options. General +# options must preceed configuration specific options. Not all +# configurations need to be specified, only the ones provided will be used. +# +# +# +# :: +# +# OPTIONS -DFLAG=2 "-DFLAG_OTHER=space in flag" +# DEBUG -g +# RELEASE --use_fast_math +# RELWITHDEBINFO --use_fast_math;-g +# MINSIZEREL --use_fast_math +# +# +# +# :: +# +# For certain configurations (namely VS generating object files with +# CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE set to ON), no generated file will +# be produced for the given cuda file. This is because when you add the +# cuda file to Visual Studio it knows that this file produces an object file +# and will link in the resulting object file automatically. +# +# +# +# :: +# +# This script will also generate a separate cmake script that is used at +# build time to invoke nvcc. This is for several reasons. +# +# +# +# :: +# +# 1. nvcc can return negative numbers as return values which confuses +# Visual Studio into thinking that the command succeeded. The script now +# checks the error codes and produces errors when there was a problem. +# +# +# +# :: +# +# 2. 
nvcc has been known to not delete incomplete results when it +# encounters problems. This confuses build systems into thinking the +# target was generated when in fact an unusable file exists. The script +# now deletes the output files if there was an error. +# +# +# +# :: +# +# 3. By putting all the options that affect the build into a file and then +# make the build rule dependent on the file, the output files will be +# regenerated when the options change. +# +# +# +# :: +# +# This script also looks at optional arguments STATIC, SHARED, or MODULE to +# determine when to target the object compilation for a shared library. +# BUILD_SHARED_LIBS is ignored in CUDA_WRAP_SRCS, but it is respected in +# CUDA_ADD_LIBRARY. On some systems special flags are added for building +# objects intended for shared libraries. A preprocessor macro, +# _EXPORTS is defined when a shared library compilation is +# detected. +# +# +# +# :: +# +# Flags passed into add_definitions with -D or /D are passed along to nvcc. +# +# +# +# The script defines the following variables: +# +# :: +# +# CUDA_VERSION_MAJOR -- The major version of cuda as reported by nvcc. +# CUDA_VERSION_MINOR -- The minor version. +# CUDA_VERSION +# CUDA_VERSION_STRING -- CUDA_VERSION_MAJOR.CUDA_VERSION_MINOR +# +# +# +# :: +# +# CUDA_TOOLKIT_ROOT_DIR -- Path to the CUDA Toolkit (defined if not set). +# CUDA_SDK_ROOT_DIR -- Path to the CUDA SDK. Use this to find files in the +# SDK. This script will not directly support finding +# specific libraries or headers, as that isn't +# supported by NVIDIA. If you want to change +# libraries when the path changes see the +# FindCUDA.cmake script for an example of how to clear +# these variables. There are also examples of how to +# use the CUDA_SDK_ROOT_DIR to locate headers or +# libraries, if you so choose (at your own risk). +# CUDA_INCLUDE_DIRS -- Include directory for cuda headers. Added automatically +# for CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY. 
+# CUDA_LIBRARIES -- Cuda RT library. +# CUDA_CUFFT_LIBRARIES -- Device or emulation library for the Cuda FFT +# implementation (alternative to: +# CUDA_ADD_CUFFT_TO_TARGET macro) +# CUDA_CUBLAS_LIBRARIES -- Device or emulation library for the Cuda BLAS +# implementation (alterative to: +# CUDA_ADD_CUBLAS_TO_TARGET macro). +# CUDA_cupti_LIBRARY -- CUDA Profiling Tools Interface library. +# Only available for CUDA version 4.0+. +# CUDA_curand_LIBRARY -- CUDA Random Number Generation library. +# Only available for CUDA version 3.2+. +# CUDA_cusparse_LIBRARY -- CUDA Sparse Matrix library. +# Only available for CUDA version 3.2+. +# CUDA_npp_LIBRARY -- NVIDIA Performance Primitives library. +# Only available for CUDA version 4.0+. +# CUDA_nppc_LIBRARY -- NVIDIA Performance Primitives library (core). +# Only available for CUDA version 5.5+. +# CUDA_nppi_LIBRARY -- NVIDIA Performance Primitives library (image processing). +# Only available for CUDA version 5.5+. +# CUDA_npps_LIBRARY -- NVIDIA Performance Primitives library (signal processing). +# Only available for CUDA version 5.5+. +# CUDA_nvcuvenc_LIBRARY -- CUDA Video Encoder library. +# Only available for CUDA version 3.2+. +# Windows only. +# CUDA_nvcuvid_LIBRARY -- CUDA Video Decoder library. +# Only available for CUDA version 3.2+. +# Windows only. +# +# +# +# +# +# :: +# +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# +# +# :: +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# +# +# :: +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# +# +# :: +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### + +# FindCUDA.cmake + +# We need to have at least this version to support the VERSION_LESS argument to 'if' (2.6.2) and unset (2.6.3) +cmake_policy(PUSH) +cmake_minimum_required(VERSION 2.6.3) +cmake_policy(POP) + +# This macro helps us find the location of helper files we will need the full path to +macro(CUDA_FIND_HELPER_FILE _name _extension) + set(_full_name "${_name}.${_extension}") + # CMAKE_CURRENT_LIST_FILE contains the full path to the file currently being + # processed. Using this variable, we can pull out the current path, and + # provide a way to get access to the other files we need local to here. 
+ get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + set(CUDA_${_name} "${CMAKE_CURRENT_LIST_DIR}/FindCUDA/${_full_name}") + if(NOT EXISTS "${CUDA_${_name}}") + set(error_message "${_full_name} not found in ${CMAKE_CURRENT_LIST_DIR}/FindCUDA") + if(CUDA_FIND_REQUIRED) + message(FATAL_ERROR "${error_message}") + else() + if(NOT CUDA_FIND_QUIETLY) + message(STATUS "${error_message}") + endif() + endif() + endif() + # Set this variable as internal, so the user isn't bugged with it. + set(CUDA_${_name} ${CUDA_${_name}} CACHE INTERNAL "Location of ${_full_name}" FORCE) +endmacro() + +##################################################################### +## CUDA_INCLUDE_NVCC_DEPENDENCIES +## + +# So we want to try and include the dependency file if it exists. If +# it doesn't exist then we need to create an empty one, so we can +# include it. + +# If it does exist, then we need to check to see if all the files it +# depends on exist. If they don't then we should clear the dependency +# file and regenerate it later. This covers the case where a header +# file has disappeared or moved. + +macro(CUDA_INCLUDE_NVCC_DEPENDENCIES dependency_file) + set(CUDA_NVCC_DEPEND) + set(CUDA_NVCC_DEPEND_REGENERATE FALSE) + + + # Include the dependency file. Create it first if it doesn't exist . The + # INCLUDE puts a dependency that will force CMake to rerun and bring in the + # new info when it changes. DO NOT REMOVE THIS (as I did and spent a few + # hours figuring out why it didn't work. + if(NOT EXISTS ${dependency_file}) + file(WRITE ${dependency_file} "#FindCUDA.cmake generated file. Do not edit.\n") + endif() + # Always include this file to force CMake to run again next + # invocation and rebuild the dependencies. + #message("including dependency_file = ${dependency_file}") + include(${dependency_file}) + + # Now we need to verify the existence of all the included files + # here. 
If they aren't there we need to just blank this variable and + # make the file regenerate again. +# if(DEFINED CUDA_NVCC_DEPEND) +# message("CUDA_NVCC_DEPEND set") +# else() +# message("CUDA_NVCC_DEPEND NOT set") +# endif() + if(CUDA_NVCC_DEPEND) + #message("CUDA_NVCC_DEPEND found") + foreach(f ${CUDA_NVCC_DEPEND}) + # message("searching for ${f}") + if(NOT EXISTS ${f}) + #message("file ${f} not found") + set(CUDA_NVCC_DEPEND_REGENERATE TRUE) + endif() + endforeach() + else() + #message("CUDA_NVCC_DEPEND false") + # No dependencies, so regenerate the file. + set(CUDA_NVCC_DEPEND_REGENERATE TRUE) + endif() + + #message("CUDA_NVCC_DEPEND_REGENERATE = ${CUDA_NVCC_DEPEND_REGENERATE}") + # No incoming dependencies, so we need to generate them. Make the + # output depend on the dependency file itself, which should cause the + # rule to re-run. + if(CUDA_NVCC_DEPEND_REGENERATE) + set(CUDA_NVCC_DEPEND ${dependency_file}) + #message("Generating an empty dependency_file: ${dependency_file}") + file(WRITE ${dependency_file} "#FindCUDA.cmake generated file. Do not edit.\n") + endif() + +endmacro() + +############################################################################### +############################################################################### +# Setup variables' defaults +############################################################################### +############################################################################### + +# Allow the user to specify if the device code is supposed to be 32 or 64 bit. +if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(CUDA_64_BIT_DEVICE_CODE_DEFAULT ON) +else() + set(CUDA_64_BIT_DEVICE_CODE_DEFAULT OFF) +endif() +option(CUDA_64_BIT_DEVICE_CODE "Compile device code in 64 bit mode" ${CUDA_64_BIT_DEVICE_CODE_DEFAULT}) + +# Attach the build rule to the source file in VS. This option +option(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE "Attach the build rule to the CUDA source file. 
Enable only when the CUDA source file is added to at most one target." ON) + +# Prints out extra information about the cuda file during compilation +option(CUDA_BUILD_CUBIN "Generate and parse .cubin files in Device mode." OFF) + +# Set whether we are using emulation or device mode. +option(CUDA_BUILD_EMULATION "Build in Emulation mode" OFF) + +# Where to put the generated output. +set(CUDA_GENERATED_OUTPUT_DIR "" CACHE PATH "Directory to put all the output files. If blank it will default to the CMAKE_CURRENT_BINARY_DIR") + +# Parse HOST_COMPILATION mode. +option(CUDA_HOST_COMPILATION_CPP "Generated file extension" ON) + +# Extra user settable flags +set(CUDA_NVCC_FLAGS "" CACHE STRING "Semi-colon delimit multiple arguments.") + +if(CMAKE_GENERATOR MATCHES "Visual Studio") + set(CUDA_HOST_COMPILER "$(VCInstallDir)bin" CACHE FILEPATH "Host side compiler used by NVCC") +else() + set(CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}" CACHE FILEPATH "Host side compiler used by NVCC") +endif() + +# Propagate the host flags to the host compiler via -Xcompiler +option(CUDA_PROPAGATE_HOST_FLAGS "Propage C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON) + +# Enable CUDA_SEPARABLE_COMPILATION +option(CUDA_SEPARABLE_COMPILATION "Compile CUDA objects with separable compilation enabled. Requires CUDA 5.0+" OFF) + +# Specifies whether the commands used when compiling the .cu file will be printed out. +option(CUDA_VERBOSE_BUILD "Print out the commands run while compiling the CUDA source file. With the Makefile generator this defaults to VERBOSE variable specified on the command line, but can be forced on with this option." OFF) + +mark_as_advanced( + CUDA_64_BIT_DEVICE_CODE + CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE + CUDA_GENERATED_OUTPUT_DIR + CUDA_HOST_COMPILATION_CPP + CUDA_NVCC_FLAGS + CUDA_PROPAGATE_HOST_FLAGS + ) + +# Makefile and similar generators don't define CMAKE_CONFIGURATION_TYPES, so we +# need to add another entry for the CMAKE_BUILD_TYPE. 
We also need to add the +# standerd set of 4 build types (Debug, MinSizeRel, Release, and RelWithDebInfo) +# for completeness. We need run this loop in order to accomodate the addition +# of extra configuration types. Duplicate entries will be removed by +# REMOVE_DUPLICATES. +set(CUDA_configuration_types ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} Debug MinSizeRel Release RelWithDebInfo) +list(REMOVE_DUPLICATES CUDA_configuration_types) +foreach(config ${CUDA_configuration_types}) + string(TOUPPER ${config} config_upper) + set(CUDA_NVCC_FLAGS_${config_upper} "" CACHE STRING "Semi-colon delimit multiple arguments.") + mark_as_advanced(CUDA_NVCC_FLAGS_${config_upper}) +endforeach() + +############################################################################### +############################################################################### +# Locate CUDA, Set Build Type, etc. +############################################################################### +############################################################################### + +macro(cuda_unset_include_and_libraries) + unset(CUDA_TOOLKIT_INCLUDE CACHE) + unset(CUDA_CUDART_LIBRARY CACHE) + unset(CUDA_CUDA_LIBRARY CACHE) + # Make sure you run this before you unset CUDA_VERSION. 
+ if(CUDA_VERSION VERSION_EQUAL "3.0") + # This only existed in the 3.0 version of the CUDA toolkit + unset(CUDA_CUDARTEMU_LIBRARY CACHE) + endif() + unset(CUDA_cupti_LIBRARY CACHE) + unset(CUDA_cublas_LIBRARY CACHE) + unset(CUDA_cublasemu_LIBRARY CACHE) + unset(CUDA_cufft_LIBRARY CACHE) + unset(CUDA_cufftemu_LIBRARY CACHE) + unset(CUDA_curand_LIBRARY CACHE) + unset(CUDA_cusparse_LIBRARY CACHE) + unset(CUDA_npp_LIBRARY CACHE) + unset(CUDA_nppc_LIBRARY CACHE) + unset(CUDA_nppi_LIBRARY CACHE) + unset(CUDA_npps_LIBRARY CACHE) + unset(CUDA_nvcuvenc_LIBRARY CACHE) + unset(CUDA_nvcuvid_LIBRARY CACHE) +endmacro() + +# Check to see if the CUDA_TOOLKIT_ROOT_DIR and CUDA_SDK_ROOT_DIR have changed, +# if they have then clear the cache variables, so that will be detected again. +if(NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}") + unset(CUDA_TOOLKIT_TARGET_DIR CACHE) + unset(CUDA_NVCC_EXECUTABLE CACHE) + unset(CUDA_VERSION CACHE) + cuda_unset_include_and_libraries() +endif() + +if(NOT "${CUDA_TOOLKIT_TARGET_DIR}" STREQUAL "${CUDA_TOOLKIT_TARGET_DIR_INTERNAL}") + cuda_unset_include_and_libraries() +endif() + +if(NOT "${CUDA_SDK_ROOT_DIR}" STREQUAL "${CUDA_SDK_ROOT_DIR_INTERNAL}") + # No specific variables to catch. Use this kind of code before calling + # find_package(CUDA) to clean up any variables that may depend on this path. + + # unset(MY_SPECIAL_CUDA_SDK_INCLUDE_DIR CACHE) + # unset(MY_SPECIAL_CUDA_SDK_LIBRARY CACHE) +endif() + +# Search for the cuda distribution. +if(NOT CUDA_TOOLKIT_ROOT_DIR) + + # Search in the CUDA_BIN_PATH first. + find_path(CUDA_TOOLKIT_ROOT_DIR + NAMES nvcc nvcc.exe + PATHS + ENV CUDA_PATH + ENV CUDA_BIN_PATH + PATH_SUFFIXES bin bin64 + DOC "Toolkit location." + NO_DEFAULT_PATH + ) + # Now search default paths + find_path(CUDA_TOOLKIT_ROOT_DIR + NAMES nvcc nvcc.exe + PATHS /usr/local/bin + /usr/local/cuda/bin + DOC "Toolkit location." 
+ ) + + if (CUDA_TOOLKIT_ROOT_DIR) + string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR}) + # We need to force this back into the cache. + set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR} CACHE PATH "Toolkit location." FORCE) + endif() + if (NOT EXISTS ${CUDA_TOOLKIT_ROOT_DIR}) + if(CUDA_FIND_REQUIRED) + message(FATAL_ERROR "Specify CUDA_TOOLKIT_ROOT_DIR") + elseif(NOT CUDA_FIND_QUIETLY) + message("CUDA_TOOLKIT_ROOT_DIR not found or specified") + endif() + endif () +endif () + +# CUDA_NVCC_EXECUTABLE +find_program(CUDA_NVCC_EXECUTABLE + NAMES nvcc + PATHS "${CUDA_TOOLKIT_ROOT_DIR}" + ENV CUDA_PATH + ENV CUDA_BIN_PATH + PATH_SUFFIXES bin bin64 + NO_DEFAULT_PATH + ) +# Search default search paths, after we search our own set of paths. +find_program(CUDA_NVCC_EXECUTABLE nvcc) +mark_as_advanced(CUDA_NVCC_EXECUTABLE) + +if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION) + # Compute the version. + execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${NVCC_OUT}) + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${NVCC_OUT}) + set(CUDA_VERSION "${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}" CACHE STRING "Version of CUDA as computed from nvcc.") + mark_as_advanced(CUDA_VERSION) +else() + # Need to set these based off of the cached value + string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR "${CUDA_VERSION}") + string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR "${CUDA_VERSION}") +endif() + +# Always set this convenience variable +set(CUDA_VERSION_STRING "${CUDA_VERSION}") + +# Support for arm cross compilation with CUDA 5.5 +if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf") + set(CUDA_TOOLKIT_TARGET_DIR 
"${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf" CACHE PATH "Toolkit target location.") +else() + set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}" CACHE PATH "Toolkit target location.") +endif() +mark_as_advanced(CUDA_TOOLKIT_TARGET_DIR) + +# Target CPU architecture +if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") + set(_cuda_target_cpu_arch_initial "ARM") +else() + set(_cuda_target_cpu_arch_initial "") +endif() +set(CUDA_TARGET_CPU_ARCH ${_cuda_target_cpu_arch_initial} CACHE STRING "Specify the name of the class of CPU architecture for which the input files must be compiled.") +mark_as_advanced(CUDA_TARGET_CPU_ARCH) + +# CUDA_TOOLKIT_INCLUDE +find_path(CUDA_TOOLKIT_INCLUDE + device_functions.h # Header included in toolkit + PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}" + ENV CUDA_PATH + ENV CUDA_INC_PATH + PATH_SUFFIXES include + NO_DEFAULT_PATH + ) +# Search default search paths, after we search our own set of paths. +find_path(CUDA_TOOLKIT_INCLUDE device_functions.h) +mark_as_advanced(CUDA_TOOLKIT_INCLUDE) + +# Set the user list of include dir to nothing to initialize it. +set (CUDA_NVCC_INCLUDE_ARGS_USER "") +set (CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + +macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext ) + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + # CUDA 3.2+ on Windows moved the library directories, so we need the new + # and old paths. + set(_cuda_64bit_lib_dir "${_path_ext}lib/x64" "${_path_ext}lib64" "${_path_ext}libx64" ) + endif() + # CUDA 3.2+ on Windows moved the library directories, so we need to new + # (lib/Win32) and the old path (lib). 
+ find_library(${_var} + NAMES ${_names} + PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}" + ENV CUDA_PATH + ENV CUDA_LIB_PATH + PATH_SUFFIXES ${_cuda_64bit_lib_dir} "${_path_ext}lib/Win32" "${_path_ext}lib" "${_path_ext}libWin32" + DOC ${_doc} + NO_DEFAULT_PATH + ) + # Search default search paths, after we search our own set of paths. + find_library(${_var} + NAMES ${_names} + PATHS "/usr/lib/nvidia-current" + DOC ${_doc} + ) +endmacro() + +macro(cuda_find_library_local_first _var _names _doc) + cuda_find_library_local_first_with_path_ext( "${_var}" "${_names}" "${_doc}" "" ) +endmacro() + +macro(find_library_local_first _var _names _doc ) + cuda_find_library_local_first( "${_var}" "${_names}" "${_doc}" "" ) +endmacro() + + +# CUDA_LIBRARIES +cuda_find_library_local_first(CUDA_CUDART_LIBRARY cudart "\"cudart\" library") +if(CUDA_VERSION VERSION_EQUAL "3.0") + # The cudartemu library only existed for the 3.0 version of CUDA. + cuda_find_library_local_first(CUDA_CUDARTEMU_LIBRARY cudartemu "\"cudartemu\" library") + mark_as_advanced( + CUDA_CUDARTEMU_LIBRARY + ) +endif() + +# CUPTI library showed up in cuda toolkit 4.0 +if(NOT CUDA_VERSION VERSION_LESS "4.0") + cuda_find_library_local_first_with_path_ext(CUDA_cupti_LIBRARY cupti "\"cupti\" library" "extras/CUPTI/") + mark_as_advanced(CUDA_cupti_LIBRARY) +endif() + +# If we are using emulation mode and we found the cudartemu library then use +# that one instead of cudart. +if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY) + set(CUDA_LIBRARIES ${CUDA_CUDARTEMU_LIBRARY}) +else() + set(CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY}) +endif() +if(APPLE) + # We need to add the path to cudart to the linker using rpath, since the + # library name for the cuda libraries is prepended with @rpath. 
+ if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY) + get_filename_component(_cuda_path_to_cudart "${CUDA_CUDARTEMU_LIBRARY}" PATH) + else() + get_filename_component(_cuda_path_to_cudart "${CUDA_CUDART_LIBRARY}" PATH) + endif() + if(_cuda_path_to_cudart) + list(APPEND CUDA_LIBRARIES -Wl,-rpath "-Wl,${_cuda_path_to_cudart}") + endif() +endif() + +# 1.1 toolkit on linux doesn't appear to have a separate library on +# some platforms. +cuda_find_library_local_first(CUDA_CUDA_LIBRARY cuda "\"cuda\" library (older versions only).") + +mark_as_advanced( + CUDA_CUDA_LIBRARY + CUDA_CUDART_LIBRARY + ) + +####################### +# Look for some of the toolkit helper libraries +macro(FIND_CUDA_HELPER_LIBS _name) + cuda_find_library_local_first(CUDA_${_name}_LIBRARY ${_name} "\"${_name}\" library") + mark_as_advanced(CUDA_${_name}_LIBRARY) +endmacro() + +####################### +# Disable emulation for v3.1 onward +if(CUDA_VERSION VERSION_GREATER "3.0") + if(CUDA_BUILD_EMULATION) + message(FATAL_ERROR "CUDA_BUILD_EMULATION is not supported in version 3.1 and onwards. You must disable it to proceed. You have version ${CUDA_VERSION}.") + endif() +endif() + +# Search for additional CUDA toolkit libraries. +if(CUDA_VERSION VERSION_LESS "3.1") + # Emulation libraries aren't available in version 3.1 onward. + find_cuda_helper_libs(cufftemu) + find_cuda_helper_libs(cublasemu) +endif() +find_cuda_helper_libs(cufft) +find_cuda_helper_libs(cublas) +if(NOT CUDA_VERSION VERSION_LESS "3.2") + # cusparse showed up in version 3.2 + find_cuda_helper_libs(cusparse) + find_cuda_helper_libs(curand) + if (WIN32) + find_cuda_helper_libs(nvcuvenc) + find_cuda_helper_libs(nvcuvid) + endif() +endif() +if(CUDA_VERSION VERSION_GREATER "5.0") + # In CUDA 5.5 NPP was split into 3 separate libraries. 
+ find_cuda_helper_libs(nppc) + find_cuda_helper_libs(nppi) + find_cuda_helper_libs(npps) + set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}") +elseif(NOT CUDA_VERSION VERSION_LESS "4.0") + find_cuda_helper_libs(npp) +endif() + +if (CUDA_BUILD_EMULATION) + set(CUDA_CUFFT_LIBRARIES ${CUDA_cufftemu_LIBRARY}) + set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublasemu_LIBRARY}) +else() + set(CUDA_CUFFT_LIBRARIES ${CUDA_cufft_LIBRARY}) + set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY}) +endif() + +######################## +# Look for the SDK stuff. As of CUDA 3.0 NVSDKCUDA_ROOT has been replaced with +# NVSDKCOMPUTE_ROOT with the old CUDA C contents moved into the C subdirectory +find_path(CUDA_SDK_ROOT_DIR common/inc/cutil.h + HINTS + "$ENV{NVSDKCOMPUTE_ROOT}/C" + ENV NVSDKCUDA_ROOT + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\NVIDIA Corporation\\Installed Products\\NVIDIA SDK 10\\Compute;InstallDir]" + PATHS + "/Developer/GPU\ Computing/C" + ) + +# Keep the CUDA_SDK_ROOT_DIR first in order to be able to override the +# environment variables. +set(CUDA_SDK_SEARCH_PATH + "${CUDA_SDK_ROOT_DIR}" + "${CUDA_TOOLKIT_ROOT_DIR}/local/NVSDK0.2" + "${CUDA_TOOLKIT_ROOT_DIR}/NVSDK0.2" + "${CUDA_TOOLKIT_ROOT_DIR}/NV_CUDA_SDK" + "$ENV{HOME}/NVIDIA_CUDA_SDK" + "$ENV{HOME}/NVIDIA_CUDA_SDK_MACOSX" + "/Developer/CUDA" + ) + +# Example of how to find an include file from the CUDA_SDK_ROOT_DIR + +# find_path(CUDA_CUT_INCLUDE_DIR +# cutil.h +# PATHS ${CUDA_SDK_SEARCH_PATH} +# PATH_SUFFIXES "common/inc" +# DOC "Location of cutil.h" +# NO_DEFAULT_PATH +# ) +# # Now search system paths +# find_path(CUDA_CUT_INCLUDE_DIR cutil.h DOC "Location of cutil.h") + +# mark_as_advanced(CUDA_CUT_INCLUDE_DIR) + + +# Example of how to find a library in the CUDA_SDK_ROOT_DIR + +# # cutil library is called cutil64 for 64 bit builds on windows. We don't want +# # to get these confused, so we are setting the name based on the word size of +# # the build. 
+ +# if(CMAKE_SIZEOF_VOID_P EQUAL 8) +# set(cuda_cutil_name cutil64) +# else() +# set(cuda_cutil_name cutil32) +# endif() + +# find_library(CUDA_CUT_LIBRARY +# NAMES cutil ${cuda_cutil_name} +# PATHS ${CUDA_SDK_SEARCH_PATH} +# # The new version of the sdk shows up in common/lib, but the old one is in lib +# PATH_SUFFIXES "common/lib" "lib" +# DOC "Location of cutil library" +# NO_DEFAULT_PATH +# ) +# # Now search system paths +# find_library(CUDA_CUT_LIBRARY NAMES cutil ${cuda_cutil_name} DOC "Location of cutil library") +# mark_as_advanced(CUDA_CUT_LIBRARY) +# set(CUDA_CUT_LIBRARIES ${CUDA_CUT_LIBRARY}) + + + +############################# +# Check for required components +set(CUDA_FOUND TRUE) + +set(CUDA_TOOLKIT_ROOT_DIR_INTERNAL "${CUDA_TOOLKIT_ROOT_DIR}" CACHE INTERNAL + "This is the value of the last time CUDA_TOOLKIT_ROOT_DIR was set successfully." FORCE) +set(CUDA_TOOLKIT_TARGET_DIR_INTERNAL "${CUDA_TOOLKIT_TARGET_DIR}" CACHE INTERNAL + "This is the value of the last time CUDA_TOOLKIT_TARGET_DIR was set successfully." FORCE) +set(CUDA_SDK_ROOT_DIR_INTERNAL "${CUDA_SDK_ROOT_DIR}" CACHE INTERNAL + "This is the value of the last time CUDA_SDK_ROOT_DIR was set successfully." FORCE) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CUDA + REQUIRED_VARS + CUDA_TOOLKIT_ROOT_DIR + CUDA_NVCC_EXECUTABLE + CUDA_INCLUDE_DIRS + CUDA_CUDART_LIBRARY + VERSION_VAR + CUDA_VERSION + ) + + + +############################################################################### +############################################################################### +# Macros +############################################################################### +############################################################################### + +############################################################################### +# Add include directories to pass to the nvcc command. 
+macro(CUDA_INCLUDE_DIRECTORIES) + foreach(dir ${ARGN}) + list(APPEND CUDA_NVCC_INCLUDE_ARGS_USER -I${dir}) + endforeach() +endmacro() + + +############################################################################## +cuda_find_helper_file(parse_cubin cmake) +cuda_find_helper_file(make2cmake cmake) +cuda_find_helper_file(run_nvcc cmake) + +############################################################################## +# Separate the OPTIONS out from the sources +# +macro(CUDA_GET_SOURCES_AND_OPTIONS _sources _cmake_options _options) + set( ${_sources} ) + set( ${_cmake_options} ) + set( ${_options} ) + set( _found_options FALSE ) + foreach(arg ${ARGN}) + if(arg STREQUAL "OPTIONS") + set( _found_options TRUE ) + elseif( + arg STREQUAL "WIN32" OR + arg STREQUAL "MACOSX_BUNDLE" OR + arg STREQUAL "EXCLUDE_FROM_ALL" OR + arg STREQUAL "STATIC" OR + arg STREQUAL "SHARED" OR + arg STREQUAL "MODULE" + ) + list(APPEND ${_cmake_options} ${arg}) + else() + if ( _found_options ) + list(APPEND ${_options} ${arg}) + else() + # Assume this is a file + list(APPEND ${_sources} ${arg}) + endif() + endif() + endforeach() +endmacro() + +############################################################################## +# Parse the OPTIONS from ARGN and set the variables prefixed by _option_prefix +# +macro(CUDA_PARSE_NVCC_OPTIONS _option_prefix) + set( _found_config ) + foreach(arg ${ARGN}) + # Determine if we are dealing with a perconfiguration flag + foreach(config ${CUDA_configuration_types}) + string(TOUPPER ${config} config_upper) + if (arg STREQUAL "${config_upper}") + set( _found_config _${arg}) + # Set arg to nothing to keep it from being processed further + set( arg ) + endif() + endforeach() + + if ( arg ) + list(APPEND ${_option_prefix}${_found_config} "${arg}") + endif() + endforeach() +endmacro() + +############################################################################## +# Helper to add the include directory for CUDA only once +function(CUDA_ADD_CUDA_INCLUDE_ONCE) + 
get_directory_property(_include_directories INCLUDE_DIRECTORIES) + set(_add TRUE) + if(_include_directories) + foreach(dir ${_include_directories}) + if("${dir}" STREQUAL "${CUDA_INCLUDE_DIRS}") + set(_add FALSE) + endif() + endforeach() + endif() + if(_add) + include_directories(${CUDA_INCLUDE_DIRS}) + endif() +endfunction() + +function(CUDA_BUILD_SHARED_LIBRARY shared_flag) + set(cmake_args ${ARGN}) + # If SHARED, MODULE, or STATIC aren't already in the list of arguments, then + # add SHARED or STATIC based on the value of BUILD_SHARED_LIBS. + list(FIND cmake_args SHARED _cuda_found_SHARED) + list(FIND cmake_args MODULE _cuda_found_MODULE) + list(FIND cmake_args STATIC _cuda_found_STATIC) + if( _cuda_found_SHARED GREATER -1 OR + _cuda_found_MODULE GREATER -1 OR + _cuda_found_STATIC GREATER -1) + set(_cuda_build_shared_libs) + else() + if (BUILD_SHARED_LIBS) + set(_cuda_build_shared_libs SHARED) + else() + set(_cuda_build_shared_libs STATIC) + endif() + endif() + set(${shared_flag} ${_cuda_build_shared_libs} PARENT_SCOPE) +endfunction() + +############################################################################## +# Helper to avoid clashes of files with the same basename but different paths. +# This doesn't attempt to do exactly what CMake internals do, which is to only +# add this path when there is a conflict, since by the time a second collision +# in names is detected it's already too late to fix the first one. For +# consistency sake the relative path will be added to all files. +function(CUDA_COMPUTE_BUILD_PATH path build_path) + #message("CUDA_COMPUTE_BUILD_PATH([${path}] ${build_path})") + # Only deal with CMake style paths from here on out + file(TO_CMAKE_PATH "${path}" bpath) + if (IS_ABSOLUTE "${bpath}") + # Absolute paths are generally unnessary, especially if something like + # file(GLOB_RECURSE) is used to pick up the files. 
+ + string(FIND "${bpath}" "${CMAKE_CURRENT_BINARY_DIR}" _binary_dir_pos) + if (_binary_dir_pos EQUAL 0) + file(RELATIVE_PATH bpath "${CMAKE_CURRENT_BINARY_DIR}" "${bpath}") + else() + file(RELATIVE_PATH bpath "${CMAKE_CURRENT_SOURCE_DIR}" "${bpath}") + endif() + endif() + + # This recipie is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the + # CMake source. + + # Remove leading / + string(REGEX REPLACE "^[/]+" "" bpath "${bpath}") + # Avoid absolute paths by removing ':' + string(REPLACE ":" "_" bpath "${bpath}") + # Avoid relative paths that go up the tree + string(REPLACE "../" "__/" bpath "${bpath}") + # Avoid spaces + string(REPLACE " " "_" bpath "${bpath}") + + # Strip off the filename. I wait until here to do it, since removin the + # basename can make a path that looked like path/../basename turn into + # path/.. (notice the trailing slash). + get_filename_component(bpath "${bpath}" PATH) + + set(${build_path} "${bpath}" PARENT_SCOPE) + #message("${build_path} = ${bpath}") +endfunction() + +############################################################################## +# This helper macro populates the following variables and setups up custom +# commands and targets to invoke the nvcc compiler to generate C or PTX source +# dependent upon the format parameter. The compiler is invoked once with -M +# to generate a dependency file and a second time with -cuda or -ptx to generate +# a .cpp or .ptx file. +# INPUT: +# cuda_target - Target name +# format - PTX or OBJ +# FILE1 .. FILEN - The remaining arguments are the sources to be wrapped. 
+# OPTIONS - Extra options to NVCC +# OUTPUT: +# generated_files - List of generated files +############################################################################## +############################################################################## + +macro(CUDA_WRAP_SRCS cuda_target format generated_files) + + # If CMake doesn't support separable compilation, complain + if(CUDA_SEPARABLE_COMPILATION AND CMAKE_VERSION VERSION_LESS "2.8.10.1") + message(SEND_ERROR "CUDA_SEPARABLE_COMPILATION isn't supported for CMake versions less than 2.8.10.1") + endif() + + # Set up all the command line flags here, so that they can be overridden on a per target basis. + + set(nvcc_flags "") + + # Emulation if the card isn't present. + if (CUDA_BUILD_EMULATION) + # Emulation. + set(nvcc_flags ${nvcc_flags} --device-emulation -D_DEVICEEMU -g) + else() + # Device mode. No flags necessary. + endif() + + if(CUDA_HOST_COMPILATION_CPP) + set(CUDA_C_OR_CXX CXX) + else() + if(CUDA_VERSION VERSION_LESS "3.0") + set(nvcc_flags ${nvcc_flags} --host-compilation C) + else() + message(WARNING "--host-compilation flag is deprecated in CUDA version >= 3.0. Removing --host-compilation C flag" ) + endif() + set(CUDA_C_OR_CXX C) + endif() + + set(generated_extension ${CMAKE_${CUDA_C_OR_CXX}_OUTPUT_EXTENSION}) + + if(CUDA_64_BIT_DEVICE_CODE) + set(nvcc_flags ${nvcc_flags} -m64) + else() + set(nvcc_flags ${nvcc_flags} -m32) + endif() + + if(CUDA_TARGET_CPU_ARCH) + set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}") + endif() + + # This needs to be passed in at this stage, because VS needs to fill out the + # value of VCInstallDir from within VS. Note that CCBIN is only used if + # -ccbin or --compiler-bindir isn't used and CUDA_HOST_COMPILER matches + # $(VCInstallDir)/bin. 
+ if(CMAKE_GENERATOR MATCHES "Visual Studio") + set(ccbin_flags -D "\"CCBIN:PATH=$(VCInstallDir)bin\"" ) + else() + set(ccbin_flags) + endif() + + # Figure out which configure we will use and pass that in as an argument to + # the script. We need to defer the decision until compilation time, because + # for VS projects we won't know if we are making a debug or release build + # until build time. + if(CMAKE_GENERATOR MATCHES "Visual Studio") + set( CUDA_build_configuration "$(ConfigurationName)" ) + else() + set( CUDA_build_configuration "${CMAKE_BUILD_TYPE}") + endif() + + # Initialize our list of includes with the user ones followed by the CUDA system ones. + set(CUDA_NVCC_INCLUDE_ARGS ${CUDA_NVCC_INCLUDE_ARGS_USER} "-I${CUDA_INCLUDE_DIRS}") + # Get the include directories for this directory and use them for our nvcc command. + # Remove duplicate entries which may be present since include_directories + # in CMake >= 2.8.8 does not remove them. + get_directory_property(CUDA_NVCC_INCLUDE_DIRECTORIES INCLUDE_DIRECTORIES) + list(REMOVE_DUPLICATES CUDA_NVCC_INCLUDE_DIRECTORIES) + if(CUDA_NVCC_INCLUDE_DIRECTORIES) + foreach(dir ${CUDA_NVCC_INCLUDE_DIRECTORIES}) + list(APPEND CUDA_NVCC_INCLUDE_ARGS -I${dir}) + endforeach() + endif() + + # Reset these variables + set(CUDA_WRAP_OPTION_NVCC_FLAGS) + foreach(config ${CUDA_configuration_types}) + string(TOUPPER ${config} config_upper) + set(CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper}) + endforeach() + + CUDA_GET_SOURCES_AND_OPTIONS(_cuda_wrap_sources _cuda_wrap_cmake_options _cuda_wrap_options ${ARGN}) + CUDA_PARSE_NVCC_OPTIONS(CUDA_WRAP_OPTION_NVCC_FLAGS ${_cuda_wrap_options}) + + # Figure out if we are building a shared library. BUILD_SHARED_LIBS is + # respected in CUDA_ADD_LIBRARY. 
+  # Decide whether we are generating objects destined for a shared library.
+  # STATIC wins if it appears together with SHARED/MODULE in the cmake options.
+  set(_cuda_build_shared_libs FALSE)
+  # SHARED, MODULE
+  list(FIND _cuda_wrap_cmake_options SHARED _cuda_found_SHARED)
+  list(FIND _cuda_wrap_cmake_options MODULE _cuda_found_MODULE)
+  if(_cuda_found_SHARED GREATER -1 OR _cuda_found_MODULE GREATER -1)
+    set(_cuda_build_shared_libs TRUE)
+  endif()
+  # STATIC
+  list(FIND _cuda_wrap_cmake_options STATIC _cuda_found_STATIC)
+  if(_cuda_found_STATIC GREATER -1)
+    set(_cuda_build_shared_libs FALSE)
+  endif()
+
+  # CUDA_HOST_FLAGS
+  if(_cuda_build_shared_libs)
+    # If we are setting up code for a shared library, then we need to add extra flags for
+    # compiling objects for shared libraries.
+    set(CUDA_HOST_SHARED_FLAGS ${CMAKE_SHARED_LIBRARY_${CUDA_C_OR_CXX}_FLAGS})
+  else()
+    set(CUDA_HOST_SHARED_FLAGS)
+  endif()
+  # Only add the CMAKE_{C,CXX}_FLAGS if we are propagating host flags. We
+  # always need to set the SHARED_FLAGS, though.
+  # Note: _cuda_host_flags holds a CMake *script snippet* that is later
+  # written into the per-file run_nvcc script via configure_file.
+  if(CUDA_PROPAGATE_HOST_FLAGS)
+    set(_cuda_host_flags "set(CMAKE_HOST_FLAGS ${CMAKE_${CUDA_C_OR_CXX}_FLAGS} ${CUDA_HOST_SHARED_FLAGS})")
+  else()
+    set(_cuda_host_flags "set(CMAKE_HOST_FLAGS ${CUDA_HOST_SHARED_FLAGS})")
+  endif()
+
+  set(_cuda_nvcc_flags_config "# Build specific configuration flags")
+  # Loop over all the configuration types to generate appropriate flags for run_nvcc.cmake
+  foreach(config ${CUDA_configuration_types})
+    string(TOUPPER ${config} config_upper)
+    # CMAKE_FLAGS are strings and not lists. By not putting quotes around CMAKE_FLAGS
+    # we convert the strings to lists (like we want).
+ + if(CUDA_PROPAGATE_HOST_FLAGS) + # nvcc chokes on -g3 in versions previous to 3.0, so replace it with -g + set(_cuda_fix_g3 FALSE) + + if(CMAKE_COMPILER_IS_GNUCC) + if (CUDA_VERSION VERSION_LESS "3.0" OR + CUDA_VERSION VERSION_EQUAL "4.1" OR + CUDA_VERSION VERSION_EQUAL "4.2" + ) + set(_cuda_fix_g3 TRUE) + endif() + endif() + if(_cuda_fix_g3) + string(REPLACE "-g3" "-g" _cuda_C_FLAGS "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}") + else() + set(_cuda_C_FLAGS "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}") + endif() + + set(_cuda_host_flags "${_cuda_host_flags}\nset(CMAKE_HOST_FLAGS_${config_upper} ${_cuda_C_FLAGS})") + endif() + + # Note that if we ever want CUDA_NVCC_FLAGS_ to be string (instead of a list + # like it is currently), we can remove the quotes around the + # ${CUDA_NVCC_FLAGS_${config_upper}} variable like the CMAKE_HOST_FLAGS_ variable. + set(_cuda_nvcc_flags_config "${_cuda_nvcc_flags_config}\nset(CUDA_NVCC_FLAGS_${config_upper} ${CUDA_NVCC_FLAGS_${config_upper}} ;; ${CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper}})") + endforeach() + + # Get the list of definitions from the directory property + get_directory_property(CUDA_NVCC_DEFINITIONS COMPILE_DEFINITIONS) + if(CUDA_NVCC_DEFINITIONS) + foreach(_definition ${CUDA_NVCC_DEFINITIONS}) + list(APPEND nvcc_flags "-D${_definition}") + endforeach() + endif() + + if(_cuda_build_shared_libs) + list(APPEND nvcc_flags "-D${cuda_target}_EXPORTS") + endif() + + # Reset the output variable + set(_cuda_wrap_generated_files "") + + # Iterate over the macro arguments and create custom + # commands for all the .cu files. + foreach(file ${ARGN}) + # Ignore any file marked as a HEADER_FILE_ONLY + get_source_file_property(_is_header ${file} HEADER_FILE_ONLY) + if(${file} MATCHES ".*\\.cu$" AND NOT _is_header) + + # Allow per source file overrides of the format. 
+ get_source_file_property(_cuda_source_format ${file} CUDA_SOURCE_PROPERTY_FORMAT) + if(NOT _cuda_source_format) + set(_cuda_source_format ${format}) + endif() + + if( ${_cuda_source_format} MATCHES "PTX" ) + set( compile_to_ptx ON ) + elseif( ${_cuda_source_format} MATCHES "OBJ") + set( compile_to_ptx OFF ) + else() + message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS for file '${file}': '${_cuda_source_format}'. Use OBJ or PTX.") + endif() + + + if(compile_to_ptx) + # Don't use any of the host compilation flags for PTX targets. + set(CUDA_HOST_FLAGS) + set(CUDA_NVCC_FLAGS_CONFIG) + else() + set(CUDA_HOST_FLAGS ${_cuda_host_flags}) + set(CUDA_NVCC_FLAGS_CONFIG ${_cuda_nvcc_flags_config}) + endif() + + # Determine output directory + cuda_compute_build_path("${file}" cuda_build_path) + set(cuda_compile_intermediate_directory "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${cuda_target}.dir/${cuda_build_path}") + if(CUDA_GENERATED_OUTPUT_DIR) + set(cuda_compile_output_dir "${CUDA_GENERATED_OUTPUT_DIR}") + else() + if ( compile_to_ptx ) + set(cuda_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}") + else() + set(cuda_compile_output_dir "${cuda_compile_intermediate_directory}") + endif() + endif() + + # Add a custom target to generate a c or ptx file. ###################### + + get_filename_component( basename ${file} NAME ) + if( compile_to_ptx ) + set(generated_file_path "${cuda_compile_output_dir}") + set(generated_file_basename "${cuda_target}_generated_${basename}.ptx") + set(format_flag "-ptx") + file(MAKE_DIRECTORY "${cuda_compile_output_dir}") + else() + set(generated_file_path "${cuda_compile_output_dir}/${CMAKE_CFG_INTDIR}") + set(generated_file_basename "${cuda_target}_generated_${basename}${generated_extension}") + if(CUDA_SEPARABLE_COMPILATION) + set(format_flag "-dc") + else() + set(format_flag "-c") + endif() + endif() + + # Set all of our file names. 
Make sure that whatever filenames that have + # generated_file_path in them get passed in through as a command line + # argument, so that the ${CMAKE_CFG_INTDIR} gets expanded at run time + # instead of configure time. + set(generated_file "${generated_file_path}/${generated_file_basename}") + set(cmake_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.depend") + set(NVCC_generated_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.NVCC-depend") + set(generated_cubin_file "${generated_file_path}/${generated_file_basename}.cubin.txt") + set(custom_target_script "${cuda_compile_intermediate_directory}/${generated_file_basename}.cmake") + + # Setup properties for obj files: + if( NOT compile_to_ptx ) + set_source_files_properties("${generated_file}" + PROPERTIES + EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked. + ) + endif() + + # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path. + get_filename_component(file_path "${file}" PATH) + if(IS_ABSOLUTE "${file_path}") + set(source_file "${file}") + else() + set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}") + endif() + + if( NOT compile_to_ptx AND CUDA_SEPARABLE_COMPILATION) + list(APPEND ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS "${generated_file}") + endif() + + # Bring in the dependencies. 
Creates a variable CUDA_NVCC_DEPEND ####### + cuda_include_nvcc_dependencies(${cmake_dependency_file}) + + # Convience string for output ########################################### + if(CUDA_BUILD_EMULATION) + set(cuda_build_type "Emulation") + else() + set(cuda_build_type "Device") + endif() + + # Build the NVCC made dependency file ################################### + set(build_cubin OFF) + if ( NOT CUDA_BUILD_EMULATION AND CUDA_BUILD_CUBIN ) + if ( NOT compile_to_ptx ) + set ( build_cubin ON ) + endif() + endif() + + # Configure the build script + configure_file("${CUDA_run_nvcc}" "${custom_target_script}" @ONLY) + + # So if a user specifies the same cuda file as input more than once, you + # can have bad things happen with dependencies. Here we check an option + # to see if this is the behavior they want. + if(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE) + set(main_dep MAIN_DEPENDENCY ${source_file}) + else() + set(main_dep DEPENDS ${source_file}) + endif() + + if(CUDA_VERBOSE_BUILD) + set(verbose_output ON) + elseif(CMAKE_GENERATOR MATCHES "Makefiles") + set(verbose_output "$(VERBOSE)") + else() + set(verbose_output OFF) + endif() + + # Create up the comment string + file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}") + if(compile_to_ptx) + set(cuda_build_comment_string "Building NVCC ptx file ${generated_file_relative_path}") + else() + set(cuda_build_comment_string "Building NVCC (${cuda_build_type}) object ${generated_file_relative_path}") + endif() + + # Build the generated file and dependency file ########################## + add_custom_command( + OUTPUT ${generated_file} + # These output files depend on the source_file and the contents of cmake_dependency_file + ${main_dep} + DEPENDS ${CUDA_NVCC_DEPEND} + DEPENDS ${custom_target_script} + # Make sure the output directory exists before trying to write to it. 
+ COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}" + COMMAND ${CMAKE_COMMAND} ARGS + -D verbose:BOOL=${verbose_output} + ${ccbin_flags} + -D build_configuration:STRING=${CUDA_build_configuration} + -D "generated_file:STRING=${generated_file}" + -D "generated_cubin_file:STRING=${generated_cubin_file}" + -P "${custom_target_script}" + WORKING_DIRECTORY "${cuda_compile_intermediate_directory}" + COMMENT "${cuda_build_comment_string}" + ) + + # Make sure the build system knows the file is generated. + set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE) + + list(APPEND _cuda_wrap_generated_files ${generated_file}) + + # Add the other files that we want cmake to clean on a cleanup ########## + list(APPEND CUDA_ADDITIONAL_CLEAN_FILES "${cmake_dependency_file}") + list(REMOVE_DUPLICATES CUDA_ADDITIONAL_CLEAN_FILES) + set(CUDA_ADDITIONAL_CLEAN_FILES ${CUDA_ADDITIONAL_CLEAN_FILES} CACHE INTERNAL "List of intermediate files that are part of the cuda dependency scanning.") + + endif() + endforeach() + + # Set the return parameter + set(${generated_files} ${_cuda_wrap_generated_files}) +endmacro() + +function(_cuda_get_important_host_flags important_flags flag_string) + if(CMAKE_GENERATOR MATCHES "Visual Studio") + string(REGEX MATCHALL "/M[DT][d]?" 
flags ${flag_string}) + list(APPEND ${important_flags} ${flags}) + else() + string(REGEX MATCHALL "-fPIC" flags ${flag_string}) + list(APPEND ${important_flags} ${flags}) + endif() + set(${important_flags} ${${important_flags}} PARENT_SCOPE) +endfunction() + +############################################################################### +############################################################################### +# Separable Compilation Link +############################################################################### +############################################################################### + +# Compute the filename to be used by CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS +function(CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME output_file_var cuda_target object_files) + if (object_files) + set(generated_extension ${CMAKE_${CUDA_C_OR_CXX}_OUTPUT_EXTENSION}) + set(output_file "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${cuda_target}.dir/${CMAKE_CFG_INTDIR}/${cuda_target}_intermediate_link${generated_extension}") + else() + set(output_file) + endif() + + set(${output_file_var} "${output_file}" PARENT_SCOPE) +endfunction() + +# Setup the build rule for the separable compilation intermediate link file. +function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options object_files) + if (object_files) + + set_source_files_properties("${output_file}" + PROPERTIES + EXTERNAL_OBJECT TRUE # This is an object file not to be compiled, but only + # be linked. + GENERATED TRUE # This file is generated during the build + ) + + # For now we are ignoring all the configuration specific flags. + set(nvcc_flags) + CUDA_PARSE_NVCC_OPTIONS(nvcc_flags ${options}) + if(CUDA_64_BIT_DEVICE_CODE) + list(APPEND nvcc_flags -m64) + else() + list(APPEND nvcc_flags -m32) + endif() + # If -ccbin, --compiler-bindir has been specified, don't do anything. Otherwise add it here. 
+    # If the user already supplied -ccbin/--compiler-bindir, leave it alone;
+    # otherwise point nvcc at the configured host compiler.
+    list( FIND nvcc_flags "-ccbin" ccbin_found0 )
+    list( FIND nvcc_flags "--compiler-bindir" ccbin_found1 )
+    if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+      list(APPEND nvcc_flags -ccbin "\"${CUDA_HOST_COMPILER}\"")
+    endif()
+    set(flags)
+    foreach(config ${CUDA_configuration_types})
+      string(TOUPPER ${config} config_upper)
+      set(important_host_flags)
+      _cuda_get_important_host_flags(important_host_flags ${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}})
+      # Wrap each important host flag in a $<CONFIG:...> generator expression
+      # so that -Xcompiler <flag> is only emitted for the matching build
+      # configuration.  (The bare "$<$:...>" form is invalid genex syntax; the
+      # conditional "$<CONFIG:${config}>" must appear inside the expression.)
+      foreach(f ${important_host_flags})
+        list(APPEND flags $<$<CONFIG:${config}>:-Xcompiler> $<$<CONFIG:${config}>:${f}>)
+      endforeach()
+    endforeach()
+    file(RELATIVE_PATH output_file_relative_path "${CMAKE_BINARY_DIR}" "${output_file}")
+
+    # Some generators don't handle the multiple levels of custom command
+    # dependencies correctly (obj1 depends on file1, obj2 depends on obj1), so
+    # we work around that issue by compiling the intermediate link object as a
+    # pre-link custom command in that situation.
+    set(do_obj_build_rule TRUE)
+    if (MSVC_VERSION GREATER 1599)
+      # VS 2010 and 2012 have this problem. If future versions fix this issue,
+      # it should still work, it just won't be as nice as the other method.
+ set(do_obj_build_rule FALSE) + endif() + + if (do_obj_build_rule) + add_custom_command( + OUTPUT ${output_file} + DEPENDS ${object_files} + COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} -dlink ${object_files} -o ${output_file} + ${flags} + COMMENT "Building NVCC intermediate link file ${output_file_relative_path}" + ) + else() + add_custom_command( + TARGET ${cuda_target} + PRE_LINK + COMMAND ${CMAKE_COMMAND} -E echo "Building NVCC intermediate link file ${output_file_relative_path}" + COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} ${flags} -dlink ${object_files} -o "${output_file}" + ) + endif() + endif() +endfunction() + +############################################################################### +############################################################################### +# ADD LIBRARY +############################################################################### +############################################################################### +macro(CUDA_ADD_LIBRARY cuda_target) + + CUDA_ADD_CUDA_INCLUDE_ONCE() + + # Separate the sources from the options + CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN}) + CUDA_BUILD_SHARED_LIBRARY(_cuda_shared_flag ${ARGN}) + # Create custom commands and targets for each file. + CUDA_WRAP_SRCS( ${cuda_target} OBJ _generated_files ${_sources} + ${_cmake_options} ${_cuda_shared_flag} + OPTIONS ${_options} ) + + # Compute the file name of the intermedate link file used for separable + # compilation. + CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}") + + # Add the library. + add_library(${cuda_target} ${_cmake_options} + ${_generated_files} + ${_sources} + ${link_file} + ) + + # Add a link phase for the separable compilation if it has been enabled. If + # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS + # variable will have been defined. 
+ CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}") + + target_link_libraries(${cuda_target} + ${CUDA_LIBRARIES} + ) + + # We need to set the linker language based on what the expected generated file + # would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP. + set_target_properties(${cuda_target} + PROPERTIES + LINKER_LANGUAGE ${CUDA_C_OR_CXX} + ) + +endmacro() + + +############################################################################### +############################################################################### +# ADD EXECUTABLE +############################################################################### +############################################################################### +macro(CUDA_ADD_EXECUTABLE cuda_target) + + CUDA_ADD_CUDA_INCLUDE_ONCE() + + # Separate the sources from the options + CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN}) + # Create custom commands and targets for each file. + CUDA_WRAP_SRCS( ${cuda_target} OBJ _generated_files ${_sources} OPTIONS ${_options} ) + + # Compute the file name of the intermedate link file used for separable + # compilation. + CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}") + + # Add the library. + add_executable(${cuda_target} ${_cmake_options} + ${_generated_files} + ${_sources} + ${link_file} + ) + + # Add a link phase for the separable compilation if it has been enabled. If + # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS + # variable will have been defined. + CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}") + + target_link_libraries(${cuda_target} + ${CUDA_LIBRARIES} + ) + + # We need to set the linker language based on what the expected generated file + # would be. 
CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP. + set_target_properties(${cuda_target} + PROPERTIES + LINKER_LANGUAGE ${CUDA_C_OR_CXX} + ) + +endmacro() + + +############################################################################### +############################################################################### +# CUDA COMPILE +############################################################################### +############################################################################### +macro(CUDA_COMPILE generated_files) + + # Separate the sources from the options + CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN}) + # Create custom commands and targets for each file. + CUDA_WRAP_SRCS( cuda_compile OBJ _generated_files ${_sources} ${_cmake_options} + OPTIONS ${_options} ) + + set( ${generated_files} ${_generated_files}) + +endmacro() + + +############################################################################### +############################################################################### +# CUDA COMPILE PTX +############################################################################### +############################################################################### +macro(CUDA_COMPILE_PTX generated_files) + + # Separate the sources from the options + CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN}) + # Create custom commands and targets for each file. 
+  CUDA_WRAP_SRCS( cuda_compile_ptx PTX _generated_files ${_sources} ${_cmake_options}
+    OPTIONS ${_options} )
+
+  # Return the list of generated .ptx files to the caller.
+  set( ${generated_files} ${_generated_files})
+
+endmacro()
+
+###############################################################################
+###############################################################################
+# CUDA ADD CUFFT TO TARGET
+###############################################################################
+###############################################################################
+# Links the appropriate cufft library (emulation or device variant, depending
+# on CUDA_BUILD_EMULATION) into an existing target.
+macro(CUDA_ADD_CUFFT_TO_TARGET target)
+  if (CUDA_BUILD_EMULATION)
+    target_link_libraries(${target} ${CUDA_cufftemu_LIBRARY})
+  else()
+    target_link_libraries(${target} ${CUDA_cufft_LIBRARY})
+  endif()
+endmacro()
+
+###############################################################################
+###############################################################################
+# CUDA ADD CUBLAS TO TARGET
+###############################################################################
+###############################################################################
+# Links the appropriate cublas library (emulation or device variant, depending
+# on CUDA_BUILD_EMULATION) into an existing target.
+macro(CUDA_ADD_CUBLAS_TO_TARGET target)
+  if (CUDA_BUILD_EMULATION)
+    target_link_libraries(${target} ${CUDA_cublasemu_LIBRARY})
+  else()
+    target_link_libraries(${target} ${CUDA_cublas_LIBRARY})
+  endif()
+endmacro()
+
+###############################################################################
+###############################################################################
+# CUDA BUILD CLEAN TARGET
+###############################################################################
+###############################################################################
+macro(CUDA_BUILD_CLEAN_TARGET)
+  # Call this after you add all your CUDA targets, and you will get a convenience
+  # target. You should also make clean after running this target to get the
+  # build system to generate all the code again.
+ + set(cuda_clean_target_name clean_cuda_depends) + if (CMAKE_GENERATOR MATCHES "Visual Studio") + string(TOUPPER ${cuda_clean_target_name} cuda_clean_target_name) + endif() + add_custom_target(${cuda_clean_target_name} + COMMAND ${CMAKE_COMMAND} -E remove ${CUDA_ADDITIONAL_CLEAN_FILES}) + + # Clear out the variable, so the next time we configure it will be empty. + # This is useful so that the files won't persist in the list after targets + # have been removed. + set(CUDA_ADDITIONAL_CLEAN_FILES "" CACHE INTERNAL "List of intermediate files that are part of the cuda dependency scanning.") +endmacro() diff --git a/cmake/FindCUDA/make2cmake.cmake b/cmake/FindCUDA/make2cmake.cmake new file mode 100644 index 0000000000..1b53d177d0 --- /dev/null +++ b/cmake/FindCUDA/make2cmake.cmake @@ -0,0 +1,93 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# + +####################################################################### +# This converts a file written in makefile syntax into one that can be included +# by CMake. + +file(READ ${input_file} depend_text) + +if (${depend_text} MATCHES ".+") + + # message("FOUND DEPENDS") + + # Remember, four backslashes is escaped to one backslash in the string. + string(REGEX REPLACE "\\\\ " " " depend_text ${depend_text}) + + # This works for the nvcc -M generated dependency files. + string(REGEX REPLACE "^.* : " "" depend_text ${depend_text}) + string(REGEX REPLACE "[ \\\\]*\n" ";" depend_text ${depend_text}) + + set(dependency_list "") + + foreach(file ${depend_text}) + + string(REGEX REPLACE "^ +" "" file ${file}) + + # OK, now if we had a UNC path, nvcc has a tendency to only output the first '/' + # instead of '//'. Here we will test to see if the file exists, if it doesn't then + # try to prepend another '/' to the path and test again. If it still fails remove the + # path. + + if(NOT EXISTS "${file}") + if (EXISTS "/${file}") + set(file "/${file}") + else() + message(WARNING " Removing non-existent dependency file: ${file}") + set(file "") + endif() + endif() + + if(NOT IS_DIRECTORY "${file}") + # If softlinks start to matter, we should change this to REALPATH. For now we need + # to flatten paths, because nvcc can generate stuff like /bin/../include instead of + # just /include. 
+ get_filename_component(file_absolute "${file}" ABSOLUTE) + list(APPEND dependency_list "${file_absolute}") + endif() + + endforeach() + +else() + # message("FOUND NO DEPENDS") +endif() + +# Remove the duplicate entries and sort them. +list(REMOVE_DUPLICATES dependency_list) +list(SORT dependency_list) + +foreach(file ${dependency_list}) + set(cuda_nvcc_depend "${cuda_nvcc_depend} \"${file}\"\n") +endforeach() + +file(WRITE ${output_file} "# Generated by: make2cmake.cmake\nSET(CUDA_NVCC_DEPEND\n ${cuda_nvcc_depend})\n\n") diff --git a/cmake/FindCUDA/parse_cubin.cmake b/cmake/FindCUDA/parse_cubin.cmake new file mode 100644 index 0000000000..e1905cfc66 --- /dev/null +++ b/cmake/FindCUDA/parse_cubin.cmake @@ -0,0 +1,110 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# + +####################################################################### +# Parses a .cubin file produced by nvcc and reports statistics about the file. + + +file(READ ${input_file} file_text) + +if (${file_text} MATCHES ".+") + + # Remember, four backslashes is escaped to one backslash in the string. + string(REGEX REPLACE ";" "\\\\;" file_text ${file_text}) + string(REGEX REPLACE "\ncode" ";code" file_text ${file_text}) + + list(LENGTH file_text len) + + foreach(line ${file_text}) + + # Only look at "code { }" blocks. + if(line MATCHES "^code") + + # Break into individual lines. + string(REGEX REPLACE "\n" ";" line ${line}) + + foreach(entry ${line}) + + # Extract kernel names. + if (${entry} MATCHES "[^g]name = ([^ ]+)") + string(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry}) + + # Check to see if the kernel name starts with "_" + set(skip FALSE) + # if (${entry} MATCHES "^_") + # Skip the rest of this block. 
+ # message("Skipping ${entry}") + # set(skip TRUE) + # else () + message("Kernel: ${entry}") + # endif () + + endif() + + # Skip the rest of the block if necessary + if(NOT skip) + + # Registers + if (${entry} MATCHES "reg([ ]+)=([ ]+)([^ ]+)") + string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry}) + message("Registers: ${entry}") + endif() + + # Local memory + if (${entry} MATCHES "lmem([ ]+)=([ ]+)([^ ]+)") + string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry}) + message("Local: ${entry}") + endif() + + # Shared memory + if (${entry} MATCHES "smem([ ]+)=([ ]+)([^ ]+)") + string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry}) + message("Shared: ${entry}") + endif() + + if (${entry} MATCHES "^}") + message("") + endif() + + endif() + + + endforeach() + + endif() + + endforeach() + +else() + # message("FOUND NO DEPENDS") +endif() diff --git a/cmake/FindCUDA/run_nvcc.cmake b/cmake/FindCUDA/run_nvcc.cmake new file mode 100644 index 0000000000..f0aac8487a --- /dev/null +++ b/cmake/FindCUDA/run_nvcc.cmake @@ -0,0 +1,288 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path +set(source_file "@source_file@") # path +set(NVCC_generated_dependency_file "@NVCC_generated_dependency_file@") # path +set(cmake_dependency_file "@cmake_dependency_file@") # path +set(CUDA_make2cmake "@CUDA_make2cmake@") # path +set(CUDA_parse_cubin "@CUDA_parse_cubin@") # path +set(build_cubin @build_cubin@) # bool +set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. +set(generated_file_path "@generated_file_path@") # path +set(generated_file_internal "@generated_file@") # path +set(generated_cubin_file_internal "@generated_cubin_file@") # path + +set(CUDA_NVCC_EXECUTABLE "@CUDA_NVCC_EXECUTABLE@") # path +set(CUDA_NVCC_FLAGS @CUDA_NVCC_FLAGS@ ;; @CUDA_WRAP_OPTION_NVCC_FLAGS@) # list +@CUDA_NVCC_FLAGS_CONFIG@ +set(nvcc_flags @nvcc_flags@) # list +set(CUDA_NVCC_INCLUDE_ARGS "@CUDA_NVCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly). +set(format_flag "@format_flag@") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. +@CUDA_HOST_FLAGS@ + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. 
+if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. +# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. 
(command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. +set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION @CUDA_VERSION@) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. 
+set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. +cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + 
${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. + cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake index 173bee3602..87dc4d178a 100644 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@ -8,8 +8,24 @@ if(CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Cl return() endif() +set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) + +foreach(var INCLUDE LIBRARY PROGRAM) + set(__old_frpm_${var} "${CMAKE_FIND_ROOT_PATH_MODE_${var}}") +endforeach() + +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) + find_package(CUDA "${MIN_VER_CUDA}" QUIET) +foreach(var INCLUDE LIBRARY PROGRAM) + set(CMAKE_FIND_ROOT_PATH_MODE_${var} "${__old_frpm_${var}}") +endforeach() + +list(REMOVE_AT CMAKE_MODULE_PATH 0) + if(CUDA_FOUND) set(HAVE_CUDA 1) @@ -21,47 +37,6 @@ if(CUDA_FOUND) set(HAVE_CUBLAS 1) endif() - if(${CUDA_VERSION} VERSION_LESS 
"5.5") - find_cuda_helper_libs(npp) - else() - # hack for CUDA 5.5 - if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm") - unset(CUDA_TOOLKIT_INCLUDE CACHE) - unset(CUDA_CUDART_LIBRARY CACHE) - unset(CUDA_cublas_LIBRARY CACHE) - unset(CUDA_cufft_LIBRARY CACHE) - unset(CUDA_npp_LIBRARY CACHE) - - if(SOFTFP) - set(cuda_arm_path "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabi") - else() - set(cuda_arm_path "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf") - endif() - - set(CUDA_TOOLKIT_INCLUDE "${cuda_arm_path}/include" CACHE PATH "include path") - set(CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) - - set(cuda_arm_library_path "${cuda_arm_path}/lib") - - set(CUDA_CUDART_LIBRARY "${cuda_arm_library_path}/libcudart.so" CACHE FILEPATH "cudart library") - set(CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY}) - set(CUDA_cublas_LIBRARY "${cuda_arm_library_path}/libcublas.so" CACHE FILEPATH "cublas library") - set(CUDA_cufft_LIBRARY "${cuda_arm_library_path}/libcufft.so" CACHE FILEPATH "cufft library") - set(CUDA_nppc_LIBRARY "${cuda_arm_library_path}/libnppc.so" CACHE FILEPATH "nppc library") - set(CUDA_nppi_LIBRARY "${cuda_arm_library_path}/libnppi.so" CACHE FILEPATH "nppi library") - set(CUDA_npps_LIBRARY "${cuda_arm_library_path}/libnpps.so" CACHE FILEPATH "npps library") - set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}" CACHE STRING "npp library") - else() - unset(CUDA_npp_LIBRARY CACHE) - - find_cuda_helper_libs(nppc) - find_cuda_helper_libs(nppi) - find_cuda_helper_libs(npps) - - set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}" CACHE STRING "npp library") - endif() - endif() - if(WITH_NVCUVID) find_cuda_helper_libs(nvcuvid) if(WIN32) @@ -166,10 +141,6 @@ if(CUDA_FOUND) set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}") endforeach() - if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --target-cpu-architecture=ARM") - endif() - # These vars will be 
processed in other scripts set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA}) set(OpenCV_CUDA_CC "${NVCC_FLAGS_EXTRA}") diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index ed5acc76bd..6f2258d97b 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -499,7 +499,7 @@ macro(ocv_glob_module_sources) source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs}) file(GLOB cl_kernels "src/opencl/*.cl") - if(HAVE_OPENCL AND cl_kernels) + if(HAVE_opencv_ocl AND cl_kernels) ocv_include_directories(${OPENCL_INCLUDE_DIRS}) add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp" diff --git a/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst b/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst index 78566e7d28..df18e19c57 100644 --- a/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst +++ b/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst @@ -48,10 +48,10 @@ The structure of package contents looks as follows: :: - OpenCV-2.4.6-android-sdk + OpenCV-2.4.7-android-sdk |_ apk - | |_ OpenCV_2.4.6_binary_pack_armv7a.apk - | |_ OpenCV_2.4.6_Manager_2.9_XXX.apk + | |_ OpenCV_2.4.7_binary_pack_armv7a.apk + | |_ OpenCV_2.4.7_Manager_2.13_XXX.apk | |_ doc |_ samples @@ -157,10 +157,10 @@ Get the OpenCV4Android SDK .. code-block:: bash - unzip ~/Downloads/OpenCV-2.4.6-android-sdk.zip + unzip ~/Downloads/OpenCV-2.4.7-android-sdk.zip -.. |opencv_android_bin_pack| replace:: :file:`OpenCV-2.4.6-android-sdk.zip` -.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.6/OpenCV-2.4.6-android-sdk.zip/download +.. |opencv_android_bin_pack| replace:: :file:`OpenCV-2.4.7-android-sdk.zip` +.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.7/OpenCV-2.4.7-android-sdk.zip/download .. |opencv_android_bin_pack_url| replace:: |opencv_android_bin_pack| .. 
|seven_zip| replace:: 7-Zip .. _seven_zip: http://www.7-zip.org/ @@ -295,7 +295,7 @@ Well, running samples from Eclipse is very simple: .. code-block:: sh :linenos: - /platform-tools/adb install /apk/OpenCV_2.4.6_Manager_2.9_armv7a-neon.apk + /platform-tools/adb install /apk/OpenCV_2.4.7_Manager_2.13_armv7a-neon.apk .. note:: ``armeabi``, ``armv7a-neon``, ``arm7a-neon-android8``, ``mips`` and ``x86`` stand for platform targets: diff --git a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst index 243dc35dd8..12b602ceb9 100644 --- a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst +++ b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst @@ -55,14 +55,14 @@ Manager to access OpenCV libraries externally installed in the target system. :guilabel:`File -> Import -> Existing project in your workspace`. Press :guilabel:`Browse` button and locate OpenCV4Android SDK - (:file:`OpenCV-2.4.6-android-sdk/sdk`). + (:file:`OpenCV-2.4.7-android-sdk/sdk`). .. image:: images/eclipse_opencv_dependency0.png :alt: Add dependency from OpenCV library :align: center #. In application project add a reference to the OpenCV Java SDK in - :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.6``. + :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.7``. .. image:: images/eclipse_opencv_dependency1.png :alt: Add dependency from OpenCV library @@ -128,27 +128,27 @@ described above. #. Add the OpenCV library project to your workspace the same way as for the async initialization above. Use menu :guilabel:`File -> Import -> Existing project in your workspace`, press :guilabel:`Browse` button and select OpenCV SDK path - (:file:`OpenCV-2.4.6-android-sdk/sdk`). + (:file:`OpenCV-2.4.7-android-sdk/sdk`). .. 
image:: images/eclipse_opencv_dependency0.png :alt: Add dependency from OpenCV library :align: center #. In the application project add a reference to the OpenCV4Android SDK in - :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.6``; + :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.7``; .. image:: images/eclipse_opencv_dependency1.png :alt: Add dependency from OpenCV library :align: center #. If your application project **doesn't have a JNI part**, just copy the corresponding OpenCV - native libs from :file:`/sdk/native/libs/` to your + native libs from :file:`/sdk/native/libs/` to your project directory to folder :file:`libs/`. In case of the application project **with a JNI part**, instead of manual libraries copying you need to modify your ``Android.mk`` file: add the following two code lines after the ``"include $(CLEAR_VARS)"`` and before - ``"include path_to_OpenCV-2.4.6-android-sdk/sdk/native/jni/OpenCV.mk"`` + ``"include path_to_OpenCV-2.4.7-android-sdk/sdk/native/jni/OpenCV.mk"`` .. code-block:: make :linenos: @@ -221,7 +221,7 @@ taken: .. 
code-block:: make - include C:\Work\OpenCV4Android\OpenCV-2.4.6-android-sdk\sdk\native\jni\OpenCV.mk + include C:\Work\OpenCV4Android\OpenCV-2.4.7-android-sdk\sdk\native\jni\OpenCV.mk Should be inserted into the :file:`jni/Android.mk` file **after** this line: diff --git a/modules/highgui/test/test_ffmpeg.cpp b/modules/highgui/test/test_ffmpeg.cpp index 01afa8301e..85ee0be994 100644 --- a/modules/highgui/test/test_ffmpeg.cpp +++ b/modules/highgui/test/test_ffmpeg.cpp @@ -84,64 +84,63 @@ public: for (size_t j = 0; j < n; ++j) { - int tag = tags[j]; - stringstream s; - s << tag; + int tag = tags[j]; + stringstream s; + s << tag; - const string filename = "output_"+s.str()+".avi"; + const string filename = "output_"+s.str()+".avi"; - try - { - double fps = fps0; - Size frame_s = Size(img_c, img_r); - - if( tag == VideoWriter::fourcc('H', '2', '6', '1') ) - frame_s = Size(352, 288); - else if( tag == VideoWriter::fourcc('H', '2', '6', '3') ) - frame_s = Size(704, 576); - /*else if( tag == CV_FOURCC('M', 'J', 'P', 'G') || - tag == CV_FOURCC('j', 'p', 'e', 'g') ) - frame_s = Size(1920, 1080);*/ - - if( tag == VideoWriter::fourcc('M', 'P', 'E', 'G') ) + try { - frame_s = Size(720, 576); - fps = 25; - } - - VideoWriter writer(filename, tag, fps, frame_s); + double fps = fps0; + Size frame_s = Size(img_c, img_r); + + if( tag == VideoWriter::fourcc('H', '2', '6', '1') ) + frame_s = Size(352, 288); + else if( tag == VideoWriter::fourcc('H', '2', '6', '3') ) + frame_s = Size(704, 576); + /*else if( tag == CV_FOURCC('M', 'J', 'P', 'G') || + tag == CV_FOURCC('j', 'p', 'e', 'g') ) + frame_s = Size(1920, 1080);*/ + + if( tag == VideoWriter::fourcc('M', 'P', 'E', 'G') ) + { + frame_s = Size(720, 576); + fps = 25; + } - if (writer.isOpened() == false) - { - ts->printf(ts->LOG, "\n\nFile name: %s\n", filename.c_str()); - ts->printf(ts->LOG, "Codec id: %d Codec tag: %c%c%c%c\n", j, - tag & 255, (tag >> 8) & 255, (tag >> 16) & 255, (tag >> 24) & 255); - ts->printf(ts->LOG, "Error: 
cannot create video file."); - ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT); - } - else - { - Mat img(frame_s, CV_8UC3, Scalar::all(0)); - const int coeff = cvRound(min(frame_s.width, frame_s.height)/(fps0 * time_sec)); + VideoWriter writer(filename, tag, fps, frame_s); - for (int i = 0 ; i < static_cast(fps * time_sec); i++ ) + if (writer.isOpened() == false) { - //circle(img, Point2i(img_c / 2, img_r / 2), min(img_r, img_c) / 2 * (i + 1), Scalar(255, 0, 0, 0), 2); - rectangle(img, Point2i(coeff * i, coeff * i), Point2i(coeff * (i + 1), coeff * (i + 1)), - Scalar::all(255 * (1.0 - static_cast(i) / (fps * time_sec * 2) )), -1); - writer << img; + ts->printf(ts->LOG, "\n\nFile name: %s\n", filename.c_str()); + ts->printf(ts->LOG, "Codec id: %d Codec tag: %c%c%c%c\n", j, + tag & 255, (tag >> 8) & 255, (tag >> 16) & 255, (tag >> 24) & 255); + ts->printf(ts->LOG, "Error: cannot create video file."); + ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT); + } + else + { + Mat img(frame_s, CV_8UC3, Scalar::all(0)); + const int coeff = cvRound(min(frame_s.width, frame_s.height)/(fps0 * time_sec)); + + for (int i = 0 ; i < static_cast(fps * time_sec); i++ ) + { + //circle(img, Point2i(img_c / 2, img_r / 2), min(img_r, img_c) / 2 * (i + 1), Scalar(255, 0, 0, 0), 2); + rectangle(img, Point2i(coeff * i, coeff * i), Point2i(coeff * (i + 1), coeff * (i + 1)), + Scalar::all(255 * (1.0 - static_cast(i) / (fps * time_sec * 2) )), -1); + writer << img; + } + + if (!created) created = true; + else remove(filename.c_str()); } - - if (!created) created = true; - else remove(filename.c_str()); } - } - catch(...) - { - ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT); - } - ts->set_failed_test_info(cvtest::TS::OK); - + catch(...) 
+ { + ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT); + } + ts->set_failed_test_info(cvtest::TS::OK); } } }; diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index d5f9413058..c5a2e9b78d 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -3175,8 +3175,8 @@ public: int sx = cvRound(sX[x1]*INTER_TAB_SIZE); int sy = cvRound(sY[x1]*INTER_TAB_SIZE); int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1)); - XY[x1*2] = (short)(sx >> INTER_BITS); - XY[x1*2+1] = (short)(sy >> INTER_BITS); + XY[x1*2] = saturate_cast(sx >> INTER_BITS); + XY[x1*2+1] = saturate_cast(sy >> INTER_BITS); A[x1] = (ushort)v; } } @@ -3189,8 +3189,8 @@ public: int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE); int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE); int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1)); - XY[x1*2] = (short)(sx >> INTER_BITS); - XY[x1*2+1] = (short)(sy >> INTER_BITS); + XY[x1*2] = saturate_cast(sx >> INTER_BITS); + XY[x1*2+1] = saturate_cast(sy >> INTER_BITS); A[x1] = (ushort)v; } } @@ -3404,8 +3404,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { int ix = saturate_cast(src1f[x]*INTER_TAB_SIZE); int iy = saturate_cast(src2f[x]*INTER_TAB_SIZE); - dst1[x*2] = (short)(ix >> INTER_BITS); - dst1[x*2+1] = (short)(iy >> INTER_BITS); + dst1[x*2] = saturate_cast(ix >> INTER_BITS); + dst1[x*2+1] = saturate_cast(iy >> INTER_BITS); dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); } } @@ -3422,8 +3422,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { int ix = saturate_cast(src1f[x*2]*INTER_TAB_SIZE); int iy = saturate_cast(src1f[x*2+1]*INTER_TAB_SIZE); - dst1[x*2] = (short)(ix >> INTER_BITS); - dst1[x*2+1] = (short)(iy >> INTER_BITS); + dst1[x*2] = saturate_cast(ix >> INTER_BITS); + dst1[x*2+1] = saturate_cast(iy >> INTER_BITS); dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); 
} } diff --git a/modules/java/generator/src/java/android+OpenCVLoader.java b/modules/java/generator/src/java/android+OpenCVLoader.java index a76471eac9..a130ae30fa 100644 --- a/modules/java/generator/src/java/android+OpenCVLoader.java +++ b/modules/java/generator/src/java/android+OpenCVLoader.java @@ -32,6 +32,11 @@ public class OpenCVLoader */ public static final String OPENCV_VERSION_2_4_6 = "2.4.6"; + /** + * OpenCV Library version 2.4.7. + */ + public static final String OPENCV_VERSION_2_4_7 = "2.4.7"; + /** * Loads and initializes OpenCV library from current application package. Roughly, it's an analog of system.loadLibrary("opencv_java"). diff --git a/modules/nonfree/src/sift.cpp b/modules/nonfree/src/sift.cpp index 68216f58bb..e67138c8a0 100644 --- a/modules/nonfree/src/sift.cpp +++ b/modules/nonfree/src/sift.cpp @@ -543,6 +543,8 @@ static void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float sc float exp_scale = -1.f/(d * d * 0.5f); float hist_width = SIFT_DESCR_SCL_FCTR * scl; int radius = cvRound(hist_width * 1.4142135623730951f * (d + 1) * 0.5f); + // Clip the radius to the diagonal of the image to avoid autobuffer too large exception + radius = std::min(radius, (int) sqrt((double) img.cols*img.cols + img.rows*img.rows)); cos_t /= hist_width; sin_t /= hist_width; diff --git a/modules/ocl/doc/image_filtering.rst b/modules/ocl/doc/image_filtering.rst index bf468024bd..e020dc74e8 100644 --- a/modules/ocl/doc/image_filtering.rst +++ b/modules/ocl/doc/image_filtering.rst @@ -133,7 +133,7 @@ Creates a normalized 2D box filter. .. ocv:function:: Ptr ocl::getBoxFilter_GPU(int srcType, int dstType, const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT) - :param srcType: Input image type supporting ``CV_8UC1`` and ``CV_8UC4`` . + :param srcType: Input image type. :param dstType: Output image type. It supports only the same values as the source type. @@ -141,9 +141,7 @@ Creates a normalized 2D box filter. 
:param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center. - :param borderType: Supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP. - -.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it. + :param borderType: Border type. .. seealso:: :ocv:func:`boxFilter` @@ -153,21 +151,19 @@ Smooths the image using the normalized box filter. .. ocv:function:: void ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT) - :param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported. + :param src: Input image. :param dst: Output image type. The size and type is the same as ``src`` . - :param ddepth: Output image depth. If -1, the output image has the same depth as the input one. The only values allowed here are ``CV_8U`` and -1. + :param ddepth: Desired depth of the destination image. If it is negative, it is the same as ``src.depth()`` . It supports only the same depth as the source image depth. :param ksize: Kernel size. :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center. - :param borderType: Supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP. - -Smoothes image using box filter.Supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4. + :param borderType: Border type. -.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it. +Smoothes image using box filter. ocl::blur ------------- @@ -175,7 +171,7 @@ Acts as a synonym for the normalized box filter. .. ocv:function:: void ocl::blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_CONSTANT) - :param src: Input image. 
``CV_8UC1`` and ``CV_8UC4`` source types are supported. + :param src: Input image. :param dst: Output image type with the same size and type as ``src`` . @@ -183,9 +179,7 @@ Acts as a synonym for the normalized box filter. :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center. - :param borderType: Supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP. - -.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it. + :param borderType: Border type. .. seealso:: :ocv:func:`blur`, :ocv:func:`ocl::boxFilter` @@ -217,11 +211,11 @@ Creates a non-separable linear filter. .. ocv:function:: Ptr ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT) - :param srcType: Input image type. Supports ``CV_8U`` , ``CV_16U`` and ``CV_32F`` one and four channel image. + :param srcType: Input image type.. :param dstType: Output image type. The same type as ``src`` is supported. - :param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:func:`ocl::convolve`. + :param kernel: 2D array of filter coefficients. :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center. @@ -234,9 +228,9 @@ ocl::filter2D ----------------- Applies the non-separable 2D linear filter to an image. -.. ocv:function:: void ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT) +.. 
ocv:function:: void ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor = Point(-1, -1), double delta = 0.0, int borderType = BORDER_DEFAULT) - :param src: Source image. Supports ``CV_8U`` , ``CV_16U`` and ``CV_32F`` one and four channel image. + :param src: Source image. :param dst: Destination image. The size and the number of channels is the same as ``src`` . @@ -246,9 +240,9 @@ Applies the non-separable 2D linear filter to an image. :param anchor: Anchor of the kernel that indicates the relative position of a filtered point within the kernel. The anchor resides within the kernel. The special default value (-1,-1) means that the anchor is at the kernel center. - :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` . + :param delta: optional value added to the filtered pixels before storing them in ``dst``. Value '0' is supported only. - :param stream: Stream for the asynchronous version. + :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` . ocl::getLinearRowFilter_GPU ------------------------------- @@ -447,7 +441,7 @@ ocl::Laplacian ------------------ Returns void -.. ocv:function:: void ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1) +.. ocv:function:: void ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1, double delta = 0, int borderType = BORDER_DEFAULT) :param src: The source image @@ -459,6 +453,10 @@ Returns void :param scale: The optional scale factor for the computed Laplacian values (by default, no scaling is applied + :param delta: Optional delta value that is added to the results prior to storing them in ``dst`` . Supported value is 0 only. + + :param bordertype: Pixel extrapolation method. + The function calculates the Laplacian of the source image by adding up the second x and y derivatives calculated using the Sobel operator. 
ocl::ConvolveBuf diff --git a/modules/ocl/doc/ml_machine_learning.rst b/modules/ocl/doc/ml_machine_learning.rst index 321cec9dba..eb72cbeef4 100644 --- a/modules/ocl/doc/ml_machine_learning.rst +++ b/modules/ocl/doc/ml_machine_learning.rst @@ -85,4 +85,28 @@ Finds centers of clusters and groups input samples around the clusters. * **KMEANS_USE_INITIAL_LABELS** During the first (and possibly the only) attempt, use the user-supplied labels instead of computing them from the initial centers. For the second and further attempts, use the random or semi-random centers. Use one of ``KMEANS_*_CENTERS`` flag to specify the exact method. - :param centers: Output matrix of the cluster centers, one row per each cluster center. \ No newline at end of file + :param centers: Output matrix of the cluster centers, one row per each cluster center. + +ocl::distanceToCenters +---------------------- +For each samples in ``source``, find its closest neighour in ``centers``. + +.. ocv:function:: void ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat ¢ers, int distType = NORM_L2SQR, const oclMat &indices = oclMat()) + + :param dists: The output distances calculated from each sample to the best matched center. + + :param labels: The output index of best matched center for each row of sample. + + :param src: Floating-point matrix of input samples. One row per sample. + + :param centers: Floating-point matrix of center candidates. One row per center. + + :param distType: Distance metric to calculate distances. Supports ``NORM_L1`` and ``NORM_L2SQR``. + + :param indices: Optional source indices. If not empty: + + * only the indexed source samples will be processed + * outputs, i.e., ``dists`` and ``labels``, have the same size of indices + * outputs are in the same order of indices instead of the order of src + +The method is a utility function which maybe used for multiple clustering algorithms such as K-means. 
diff --git a/modules/ocl/include/opencv2/ocl.hpp b/modules/ocl/include/opencv2/ocl.hpp index 3f0fb290ce..b8c26b2c9a 100644 --- a/modules/ocl/include/opencv2/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl.hpp @@ -23,7 +23,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -111,6 +111,7 @@ namespace cv bool haveDoubleSupport; bool isUnifiedMemory; // 1 means integrated GPU, otherwise this value is 0 + bool isIntelDevice; std::string compilationExtraOptions; @@ -154,7 +155,8 @@ namespace cv { FEATURE_CL_DOUBLE = 1, FEATURE_CL_UNIFIED_MEM, - FEATURE_CL_VER_1_2 + FEATURE_CL_VER_1_2, + FEATURE_CL_INTEL_DEVICE }; // Represents OpenCL context, interface @@ -737,11 +739,12 @@ namespace cv CV_EXPORTS Ptr createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT ); //! applies Laplacian operator to the image - // supports only ksize = 1 and ksize = 3 8UC1 8UC4 32FC1 32FC4 data type - CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1); + // supports only ksize = 1 and ksize = 3 + CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1, + double delta=0, int borderType=BORDER_DEFAULT); //! 
returns 2D box filter - // supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type + // dst type must be the same as source type CV_EXPORTS Ptr getBoxFilter_GPU(int srcType, int dstType, const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); @@ -750,17 +753,16 @@ namespace cv const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); //! returns 2D filter with the specified kernel - // supports CV_8UC1 and CV_8UC4 types + // supports: dst type must be the same as source type CV_EXPORTS Ptr getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize, const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); //! returns the non-separable linear filter engine + // supports: dst type must be the same as source type CV_EXPORTS Ptr createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); //! smooths the image using the normalized box filter - // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 - // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP CV_EXPORTS void boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); @@ -776,8 +778,6 @@ namespace cv const Point &anchor = Point(-1, -1), int iterations = 1); //! a synonym for normalized box filter - // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 - // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101 static inline void blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_CONSTANT) { @@ -785,10 +785,8 @@ namespace cv } //! 
applies non-separable 2D linear filter to the image - // Note, at the moment this function only works when anchor point is in the kernel center - // and kernel size supported is either 3x3 or 5x5; otherwise the function will fail to output valid result CV_EXPORTS void filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, - Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); + Point anchor = Point(-1, -1), double delta = 0.0, int borderType = BORDER_DEFAULT); //! applies separable 2D linear filter to the image CV_EXPORTS void sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, @@ -897,7 +895,10 @@ namespace cv //! Compute closest centers for each lines in source and lable it after center's index // supports CV_32FC1/CV_32FC2/CV_32FC4 data type - CV_EXPORTS void distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat ¢ers); + // supports NORM_L1 and NORM_L2 distType + // if indices is provided, only the indexed rows will be calculated and their results are in the same + // order of indices + CV_EXPORTS void distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat ¢ers, int distType = NORM_L2SQR, const oclMat &indices = oclMat()); //!Does k-means procedure on GPU // supports CV_32FC1/CV_32FC2/CV_32FC4 data type @@ -964,12 +965,12 @@ namespace cv struct CV_EXPORTS CannyBuf { - CannyBuf() : counter(NULL) {} + CannyBuf() : counter(1, 1, CV_32S) { } ~CannyBuf() { release(); } - explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(NULL) + explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(1, 1, CV_32S) { create(image_size, apperture_size); } @@ -981,7 +982,7 @@ namespace cv oclMat dx_buf, dy_buf; oclMat magBuf, mapBuf; oclMat trackBuf1, trackBuf2; - void *counter; + oclMat counter; Ptr filterDX, filterDY; }; @@ -1618,7 +1619,12 @@ namespace cv float pos, oclMat &newFrame, oclMat &buf); //! 
computes moments of the rasterized shape or a vector of points - CV_EXPORTS Moments ocl_moments(InputArray _array, bool binaryImage); + //! _array should be a vector a points standing for the contour + CV_EXPORTS Moments ocl_moments(InputArray contour); + //! src should be a general image uploaded to the GPU. + //! the supported oclMat type are CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 and CV_64FC1 + //! to use type of CV_64FC1, the GPU should support CV_64FC1 + CV_EXPORTS Moments ocl_moments(oclMat& src, bool binary); class CV_EXPORTS StereoBM_OCL { diff --git a/modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp b/modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp index beb3d27525..e3845446e0 100644 --- a/modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp +++ b/modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp @@ -21,7 +21,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --git a/modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp b/modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp index 70c45d3dde..08f980fc35 100644 --- a/modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp +++ b/modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp @@ -21,7 +21,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. +// and/or other materials provided with the distribution. 
// // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp index 670b03c2ef..efb684cc2a 100644 --- a/modules/ocl/include/opencv2/ocl/private/util.hpp +++ b/modules/ocl/include/opencv2/ocl/private/util.hpp @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -103,7 +103,11 @@ CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName); CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, const char *build_options); +CV_EXPORTS cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, + String kernelName, int channels, int depth, const char *build_options); CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads); +CV_EXPORTS void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3], + size_t localThreads[3], std::vector< std::pair > &args); CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, String kernelName, std::vector< std::pair > &args, int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1); CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, diff --git 
a/modules/ocl/perf/perf_arithm.cpp b/modules/ocl/perf/perf_arithm.cpp index 025221b4ee..d71901e89d 100644 --- a/modules/ocl/perf/perf_arithm.cpp +++ b/modules/ocl/perf/perf_arithm.cpp @@ -342,7 +342,7 @@ PERF_TEST_P(CartToPolarFixture, CartToPolar, OCL_TYPICAL_MAT_SIZES) if (srcSize == OCL_SIZE_4000) declare.time(3.6); - if (RUN_OCL_IMPL) + if (RUN_OCL_IMPL) { ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst1(srcSize, src1.type()), oclDst2(srcSize, src1.type()); @@ -374,7 +374,7 @@ PERF_TEST_P(PolarToCartFixture, PolarToCart, OCL_TYPICAL_MAT_SIZES) { const Size srcSize = GetParam(); - Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1), + Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1), dst1(srcSize, CV_32FC1), dst2(srcSize, CV_32FC1); declare.in(src1, src2).out(dst1, dst2); randu(src1, 0, 256); @@ -421,7 +421,7 @@ PERF_TEST_P(MagnitudeFixture, Magnitude, OCL_TYPICAL_MAT_SIZES) randu(src2, 0, 1); declare.in(src1, src2).out(dst); - if (RUN_OCL_IMPL) + if (RUN_OCL_IMPL) { ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type()); @@ -457,7 +457,7 @@ PERF_TEST_P(TransposeFixture, Transpose, Mat src(srcSize, type), dst(srcSize, type); declare.in(src, WARMUP_RNG).out(dst); - if (RUN_OCL_IMPL) + if (RUN_OCL_IMPL) { ocl::oclMat oclSrc(src), oclDst(srcSize, type); @@ -562,7 +562,7 @@ PERF_TEST_P(minMaxLocFixture, minMaxLoc, ::testing::Combine(OCL_TYPICAL_MAT_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_32FC1))) { - const Size_MatType_t params = GetParam(); + const Size_MatType_t params = GetParam(); const Size srcSize = get<0>(params); const int type = get<1>(params); @@ -607,7 +607,7 @@ PERF_TEST_P(SumFixture, Sum, const Size srcSize = get<0>(params); const int type = get<1>(params); - Mat src(srcSize, type); + Mat src(srcSize, type); Scalar result; randu(src, 0, 60); declare.in(src); @@ -708,16 +708,16 @@ PERF_TEST_P(BitwiseAndFixture, bitwise_and, ::testing::Combine(OCL_TYPICAL_MAT_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_32SC1))) { - const Size_MatType_t params = 
GetParam(); + const Size_MatType_t params = GetParam(); const Size srcSize = get<0>(params); const int type = get<1>(params); - Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type); + Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type); declare.in(src1, src2).out(dst); randu(src1, 0, 256); randu(src2, 0, 256); - if (RUN_OCL_IMPL) + if (RUN_OCL_IMPL) { ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type()); @@ -737,6 +737,80 @@ PERF_TEST_P(BitwiseAndFixture, bitwise_and, OCL_PERF_ELSE } +///////////// bitwise_xor //////////////////////// + +typedef Size_MatType BitwiseXorFixture; + +PERF_TEST_P(BitwiseXorFixture, bitwise_xor, + ::testing::Combine(OCL_TYPICAL_MAT_SIZES, + OCL_PERF_ENUM(CV_8UC1, CV_32SC1))) +{ + const Size_MatType_t params = GetParam(); + const Size srcSize = get<0>(params); + const int type = get<1>(params); + + Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type); + declare.in(src1, src2).out(dst); + randu(src1, 0, 256); + randu(src2, 0, 256); + + if (RUN_OCL_IMPL) + { + ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type()); + + OCL_TEST_CYCLE() cv::ocl::bitwise_xor(oclSrc1, oclSrc2, oclDst); + + oclDst.download(dst); + + SANITY_CHECK(dst); + } + else if (RUN_PLAIN_IMPL) + { + TEST_CYCLE() cv::bitwise_xor(src1, src2, dst); + + SANITY_CHECK(dst); + } + else + OCL_PERF_ELSE +} + +///////////// bitwise_or //////////////////////// + +typedef Size_MatType BitwiseOrFixture; + +PERF_TEST_P(BitwiseOrFixture, bitwise_or, + ::testing::Combine(OCL_TYPICAL_MAT_SIZES, + OCL_PERF_ENUM(CV_8UC1, CV_32SC1))) +{ + const Size_MatType_t params = GetParam(); + const Size srcSize = get<0>(params); + const int type = get<1>(params); + + Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type); + declare.in(src1, src2).out(dst); + randu(src1, 0, 256); + randu(src2, 0, 256); + + if (RUN_OCL_IMPL) + { + ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type()); + + OCL_TEST_CYCLE() 
cv::ocl::bitwise_or(oclSrc1, oclSrc2, oclDst); + + oclDst.download(dst); + + SANITY_CHECK(dst); + } + else if (RUN_PLAIN_IMPL) + { + TEST_CYCLE() cv::bitwise_or(src1, src2, dst); + + SANITY_CHECK(dst); + } + else + OCL_PERF_ELSE +} + ///////////// bitwise_not//////////////////////// typedef Size_MatType BitwiseNotFixture; diff --git a/modules/ocl/perf/perf_blend.cpp b/modules/ocl/perf/perf_blend.cpp index a5e057ffca..6f611bbc34 100644 --- a/modules/ocl/perf/perf_blend.cpp +++ b/modules/ocl/perf/perf_blend.cpp @@ -47,48 +47,61 @@ #include "perf_precomp.hpp" using namespace perf; +using namespace cv; +using std::tr1::get; ///////////// blend //////////////////////// template -static void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, - const cv::Mat &weights1, const cv::Mat &weights2, - cv::Mat &result_gold) +static void blendLinearGold(const Mat &img1, const Mat &img2, + const Mat &weights1, const Mat &weights2, + Mat &result_gold) { + CV_Assert(img1.size() == img2.size() && img1.type() == img2.type()); + CV_Assert(weights1.size() == weights2.size() && weights1.size() == img1.size() && + weights1.type() == CV_32FC1 && weights2.type() == CV_32FC1); + result_gold.create(img1.size(), img1.type()); int cn = img1.channels(); + int step1 = img1.cols * img1.channels(); for (int y = 0; y < img1.rows; ++y) { - const float *weights1_row = weights1.ptr(y); - const float *weights2_row = weights2.ptr(y); - const T *img1_row = img1.ptr(y); - const T *img2_row = img2.ptr(y); - T *result_gold_row = result_gold.ptr(y); + const float * const weights1_row = weights1.ptr(y); + const float * const weights2_row = weights2.ptr(y); + const T * const img1_row = img1.ptr(y); + const T * const img2_row = img2.ptr(y); + T * const result_gold_row = result_gold.ptr(y); - for (int x = 0; x < img1.cols * cn; ++x) + for (int x = 0; x < step1; ++x) { - int x1 = x * cn; - float w1 = weights1_row[x]; - float w2 = weights2_row[x]; - result_gold_row[x] = static_cast((img1_row[x1] * w1 - + 
img2_row[x1] * w2) / (w1 + w2 + 1e-5f)); + int x1 = x / cn; + float w1 = weights1_row[x1], w2 = weights2_row[x1]; + result_gold_row[x] = saturate_cast(((float)img1_row[x] * w1 + + (float)img2_row[x] * w2) / (w1 + w2 + 1e-5f)); } } } -typedef TestBaseWithParam blendLinearFixture; +typedef void (*blendFunction)(const Mat &img1, const Mat &img2, + const Mat &weights1, const Mat &weights2, + Mat &result_gold); + +typedef Size_MatType blendLinearFixture; -PERF_TEST_P(blendLinearFixture, blendLinear, OCL_TYPICAL_MAT_SIZES) +PERF_TEST_P(blendLinearFixture, blendLinear, ::testing::Combine( + OCL_TYPICAL_MAT_SIZES, testing::Values(CV_8UC1, CV_8UC3, CV_32FC1))) { - const Size srcSize = GetParam(); - const int type = CV_8UC1; + Size_MatType_t params = GetParam(); + const Size srcSize = get<0>(params); + const int srcType = get<1>(params); + const double eps = CV_MAT_DEPTH(srcType) <= CV_32S ? 1.0 : 0.2; - Mat src1(srcSize, type), src2(srcSize, CV_8UC1), dst; + Mat src1(srcSize, srcType), src2(srcSize, srcType), dst(srcSize, srcType); Mat weights1(srcSize, CV_32FC1), weights2(srcSize, CV_32FC1); - declare.in(src1, src2, WARMUP_RNG); + declare.in(src1, src2, WARMUP_RNG).out(dst); randu(weights1, 0.0f, 1.0f); randu(weights2, 0.0f, 1.0f); @@ -97,17 +110,20 @@ PERF_TEST_P(blendLinearFixture, blendLinear, OCL_TYPICAL_MAT_SIZES) ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst; ocl::oclMat oclWeights1(weights1), oclWeights2(weights2); - OCL_TEST_CYCLE() cv::ocl::blendLinear(oclSrc1, oclSrc2, oclWeights1, oclWeights2, oclDst); + OCL_TEST_CYCLE() ocl::blendLinear(oclSrc1, oclSrc2, oclWeights1, oclWeights2, oclDst); oclDst.download(dst); - SANITY_CHECK(dst); + SANITY_CHECK(dst, eps); } else if (RUN_PLAIN_IMPL) { - TEST_CYCLE() blendLinearGold(src1, src2, weights1, weights2, dst); + blendFunction funcs[] = { (blendFunction)blendLinearGold, (blendFunction)blendLinearGold }; + int funcIdx = CV_MAT_DEPTH(srcType) == CV_8UC1 ? 
0 : 1; + + TEST_CYCLE() (funcs[funcIdx])(src1, src2, weights1, weights2, dst); - SANITY_CHECK(dst); + SANITY_CHECK(dst, eps); } else OCL_PERF_ELSE diff --git a/modules/ocl/perf/perf_brute_force_matcher.cpp b/modules/ocl/perf/perf_brute_force_matcher.cpp index 86c0a3c70d..d124428e9d 100644 --- a/modules/ocl/perf/perf_brute_force_matcher.cpp +++ b/modules/ocl/perf/perf_brute_force_matcher.cpp @@ -53,8 +53,8 @@ using namespace perf; typedef TestBaseWithParam BruteForceMatcherFixture; -PERF_TEST_P(BruteForceMatcherFixture, DISABLED_match, - OCL_BFMATCHER_TYPICAL_MAT_SIZES) // TODO too big difference between implementations +PERF_TEST_P(BruteForceMatcherFixture, match, + OCL_BFMATCHER_TYPICAL_MAT_SIZES) { const Size srcSize = GetParam(); @@ -82,14 +82,14 @@ PERF_TEST_P(BruteForceMatcherFixture, DISABLED_match, oclMatcher.matchDownload(oclTrainIdx, oclDistance, matches); - SANITY_CHECK_MATCHES(matches); + SANITY_CHECK_MATCHES(matches, 1e-5); } else OCL_PERF_ELSE } -PERF_TEST_P(BruteForceMatcherFixture, DISABLED_knnMatch, - OCL_BFMATCHER_TYPICAL_MAT_SIZES) // TODO too big difference between implementations +PERF_TEST_P(BruteForceMatcherFixture, knnMatch, + OCL_BFMATCHER_TYPICAL_MAT_SIZES) { const Size srcSize = GetParam(); @@ -123,8 +123,8 @@ PERF_TEST_P(BruteForceMatcherFixture, DISABLED_knnMatch, oclMatcher.knnMatchDownload(oclTrainIdx, oclDistance, matches); std::vector & matches0 = matches[0], & matches1 = matches[1]; - SANITY_CHECK_MATCHES(matches0); - SANITY_CHECK_MATCHES(matches1); + SANITY_CHECK_MATCHES(matches0, 1e-5); + SANITY_CHECK_MATCHES(matches1, 1e-5); } else OCL_PERF_ELSE diff --git a/modules/ocl/perf/perf_hough.cpp b/modules/ocl/perf/perf_hough.cpp index f259bd1f04..e90356acb0 100644 --- a/modules/ocl/perf/perf_hough.cpp +++ b/modules/ocl/perf/perf_hough.cpp @@ -22,7 +22,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other 
oclMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp index 7a9eab5d3b..1a8406e6d7 100644 --- a/modules/ocl/perf/perf_imgproc.cpp +++ b/modules/ocl/perf/perf_imgproc.cpp @@ -56,6 +56,7 @@ typedef TestBaseWithParam equalizeHistFixture; PERF_TEST_P(equalizeHistFixture, equalizeHist, OCL_TYPICAL_MAT_SIZES) { const Size srcSize = GetParam(); + const double eps = 1 + DBL_EPSILON; Mat src(srcSize, CV_8UC1), dst(srcSize, CV_8UC1); declare.in(src, WARMUP_RNG).out(dst); @@ -68,13 +69,13 @@ PERF_TEST_P(equalizeHistFixture, equalizeHist, OCL_TYPICAL_MAT_SIZES) oclDst.download(dst); - SANITY_CHECK(dst, 1 + DBL_EPSILON); + SANITY_CHECK(dst, eps); } else if (RUN_PLAIN_IMPL) { TEST_CYCLE() cv::equalizeHist(src, dst); - SANITY_CHECK(dst, 1 + DBL_EPSILON); + SANITY_CHECK(dst, eps); } else OCL_PERF_ELSE @@ -82,15 +83,20 @@ PERF_TEST_P(equalizeHistFixture, equalizeHist, OCL_TYPICAL_MAT_SIZES) /////////// CopyMakeBorder ////////////////////// -typedef Size_MatType CopyMakeBorderFixture; +CV_ENUM(Border, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, + BORDER_WRAP, BORDER_REFLECT_101) + +typedef tuple CopyMakeBorderParamType; +typedef TestBaseWithParam CopyMakeBorderFixture; PERF_TEST_P(CopyMakeBorderFixture, CopyMakeBorder, ::testing::Combine(OCL_TYPICAL_MAT_SIZES, - OCL_PERF_ENUM(CV_8UC1, CV_8UC4))) + OCL_PERF_ENUM(CV_8UC1, CV_8UC4), + Border::all())) { - const Size_MatType_t params = GetParam(); + const CopyMakeBorderParamType params = GetParam(); const Size srcSize = get<0>(params); - const int type = get<1>(params), borderType = BORDER_CONSTANT; + const int type = get<1>(params), borderType = get<2>(params); Mat src(srcSize, type), dst; const Size dstSize = srcSize + Size(12, 12); @@ 
-360,20 +366,23 @@ PERF_TEST_P(resizeFixture, resize, ///////////// threshold//////////////////////// -CV_ENUM(ThreshType, THRESH_BINARY, THRESH_TRUNC) +CV_ENUM(ThreshType, THRESH_BINARY, THRESH_TOZERO_INV) -typedef tuple ThreshParams; +typedef tuple ThreshParams; typedef TestBaseWithParam ThreshFixture; PERF_TEST_P(ThreshFixture, threshold, ::testing::Combine(OCL_TYPICAL_MAT_SIZES, + OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC4, CV_32FC1), ThreshType::all())) { const ThreshParams params = GetParam(); const Size srcSize = get<0>(params); - const int threshType = get<1>(params); + const int srcType = get<1>(params); + const int threshType = get<2>(params); + const double maxValue = 220.0, threshold = 50; - Mat src(srcSize, CV_8U), dst(srcSize, CV_8U); + Mat src(srcSize, srcType), dst(srcSize, srcType); randu(src, 0, 100); declare.in(src).out(dst); @@ -381,7 +390,7 @@ PERF_TEST_P(ThreshFixture, threshold, { ocl::oclMat oclSrc(src), oclDst(srcSize, CV_8U); - OCL_TEST_CYCLE() cv::ocl::threshold(oclSrc, oclDst, 50.0, 0.0, threshType); + OCL_TEST_CYCLE() cv::ocl::threshold(oclSrc, oclDst, threshold, maxValue, threshType); oclDst.download(dst); @@ -389,7 +398,7 @@ PERF_TEST_P(ThreshFixture, threshold, } else if (RUN_PLAIN_IMPL) { - TEST_CYCLE() cv::threshold(src, dst, 50.0, 0.0, threshType); + TEST_CYCLE() cv::threshold(src, dst, threshold, maxValue, threshType); SANITY_CHECK(dst); } @@ -860,3 +869,64 @@ PERF_TEST_P(columnSumFixture, columnSum, OCL_TYPICAL_MAT_SIZES) else OCL_PERF_ELSE } + +//////////////////////////////distanceToCenters//////////////////////////////////////////////// + +CV_ENUM(DistType, NORM_L1, NORM_L2SQR); +typedef tuple distanceToCentersParameters; +typedef TestBaseWithParam distanceToCentersFixture; + +static void distanceToCentersPerfTest(Mat& src, Mat& centers, Mat& dists, Mat& labels, int distType) +{ + Mat batch_dists; + cv::batchDistance(src,centers,batch_dists, CV_32FC1, noArray(), distType); + std::vector dists_v; + std::vector 
labels_v; + for(int i = 0; i(GetParam()); + int distType = get<1>(GetParam()); + Mat src(size, CV_32FC1); + Mat centers(size, CV_32FC1); + Mat dists(cv::Size(src.rows,1), CV_32FC1); + Mat labels(cv::Size(src.rows,1), CV_32SC1); + declare.in(src, centers, WARMUP_RNG).out(dists, labels); + if (RUN_OCL_IMPL) + { + ocl::oclMat ocl_src(src); + ocl::oclMat ocl_centers(centers); + ocl::oclMat ocl_dists(dists); + ocl::oclMat ocl_labels(labels); + + OCL_TEST_CYCLE() ocl::distanceToCenters(ocl_dists,ocl_labels,ocl_src, ocl_centers, distType); + + ocl_dists.download(dists); + ocl_labels.download(labels); + + SANITY_CHECK(dists, 1e-6, ERROR_RELATIVE); + SANITY_CHECK(labels); + } + else if (RUN_PLAIN_IMPL) + { + TEST_CYCLE() distanceToCentersPerfTest(src,centers,dists,labels,distType); + SANITY_CHECK(dists, 1e-6, ERROR_RELATIVE); + SANITY_CHECK(labels); + } + else + OCL_PERF_ELSE +} diff --git a/modules/ocl/perf/perf_matrix_operation.cpp b/modules/ocl/perf/perf_matrix_operation.cpp index 3035c97f04..f2baa7ffc5 100644 --- a/modules/ocl/perf/perf_matrix_operation.cpp +++ b/modules/ocl/perf/perf_matrix_operation.cpp @@ -156,15 +156,17 @@ PERF_TEST_P(setToFixture, setTo, OCL_PERF_ELSE } +#if 0 + /////////////////// upload /////////////////////////// -typedef tuple uploadParams; +typedef tuple uploadParams; typedef TestBaseWithParam uploadFixture; PERF_TEST_P(uploadFixture, upload, testing::Combine( OCL_TYPICAL_MAT_SIZES, - testing::Range(CV_8U, CV_64F), + testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F), testing::Range(1, 5))) { const uploadParams params = GetParam(); @@ -200,7 +202,7 @@ typedef TestBaseWithParam downloadFixture; PERF_TEST_P(downloadFixture, download, testing::Combine( OCL_TYPICAL_MAT_SIZES, - testing::Range(CV_8U, CV_64F), + testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F), testing::Range(1, 5))) { const uploadParams params = GetParam(); @@ -228,3 +230,5 @@ PERF_TEST_P(downloadFixture, download, SANITY_CHECK_NOTHING(); } + +#endif diff 
--git a/modules/ocl/perf/perf_moments.cpp b/modules/ocl/perf/perf_moments.cpp index a36e1a13ed..c5d616f83d 100644 --- a/modules/ocl/perf/perf_moments.cpp +++ b/modules/ocl/perf/perf_moments.cpp @@ -26,7 +26,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. +// and/or other Materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -49,41 +49,42 @@ using namespace perf; using std::tr1::tuple; using std::tr1::get; +using namespace cv; +using namespace cv::ocl; +using namespace cvtest; +using namespace testing; +using namespace std; -///////////// Moments //////////////////////// -typedef Size_MatType MomentsFixture; +///////////// Moments //////////////////////// +//*! performance of image +typedef tuple MomentsParamType; +typedef TestBaseWithParam MomentsFixture; -PERF_TEST_P(MomentsFixture, DISABLED_Moments, - ::testing::Combine(OCL_TYPICAL_MAT_SIZES, - OCL_PERF_ENUM(CV_8UC1, CV_16SC1, CV_32FC1, CV_64FC1))) // TODO does not work properly (see below) +PERF_TEST_P(MomentsFixture, Moments, + ::testing::Combine(OCL_TYPICAL_MAT_SIZES, + OCL_PERF_ENUM(CV_8UC1, CV_16SC1, CV_16UC1, CV_32FC1), ::testing::Bool())) { - const Size_MatType_t params = GetParam(); + const MomentsParamType params = GetParam(); const Size srcSize = get<0>(params); const int type = get<1>(params); + const bool binaryImage = get<2>(params); - Mat src(srcSize, type), dst(7, 1, CV_64F); - const bool binaryImage = false; - cv::Moments mom; - - declare.in(src, WARMUP_RNG).out(dst); + Mat src(srcSize, type), dst(7, 1, CV_64F); + randu(src, 0, 255); + oclMat src_d(src); + cv::Moments mom; if (RUN_OCL_IMPL) { - ocl::oclMat oclSrc(src); - - OCL_TEST_CYCLE() mom = cv::ocl::ocl_moments(oclSrc, 
binaryImage); // TODO Use oclSrc - cv::HuMoments(mom, dst); - - SANITY_CHECK(dst); + OCL_TEST_CYCLE() mom = cv::ocl::ocl_moments(src_d, binaryImage); } else if (RUN_PLAIN_IMPL) { TEST_CYCLE() mom = cv::moments(src, binaryImage); - cv::HuMoments(mom, dst); - - SANITY_CHECK(dst); } else OCL_PERF_ELSE + cv::HuMoments(mom, dst); + SANITY_CHECK(dst, 2e-1); } diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 6bfa7333a4..5bcfbe1af2 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -474,9 +474,13 @@ static void arithmetic_minMax_run(const oclMat &src, const oclMat & mask, cl_mem std::ostringstream stream; stream << "-D T=" << typeMap[src.depth()] << channelMap[src.channels()]; - stream << " -D MAX_VAL=" << (WT)std::numeric_limits::max(); - stream << " -D MIN_VAL=" << (std::numeric_limits::is_integer ? - (WT)std::numeric_limits::min() : -(WT)(std::numeric_limits::max())); + if (std::numeric_limits::is_integer) + { + stream << " -D MAX_VAL=" << (WT)std::numeric_limits::max(); + stream << " -D MIN_VAL=" << (WT)std::numeric_limits::min(); + } + else + stream << " -D DEPTH_" << src.depth(); std::string buildOptions = stream.str(); std::vector > args; @@ -684,7 +688,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType) break; } if (isRelative) - r = r / norm(src2, normType); + r = r / (norm(src2, normType) + DBL_EPSILON); return r; } @@ -693,83 +697,47 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType) ////////////////////////////////// flip ////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, String kernelName) -{ - int channels = dst.oclchannels(); - int depth = dst.depth(); - - int vector_lengths[4][7] = {{4, 4, 4, 4, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 1} - }; +enum { FLIP_COLS = 1 << 0, 
FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS }; - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1); - - int cols = divUp(dst.cols * channels + offset_cols, vector_length); - int rows = divUp(dst.rows, 2); - - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, rows, 1 }; - - int dst_step1 = dst.cols * dst.elemSize(); - std::vector > args; - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset )); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 )); - - openCLExecuteKernel(src.clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args, -1, depth); -} - -static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, String kernelName, bool isVertical) +static void arithmetic_flip_run(const oclMat &src, oclMat &dst, String kernelName, int flipType) { - int channels = dst.oclchannels(); - int depth = dst.depth(); + int cols = dst.cols, rows = dst.rows; + if ((cols == 1 && flipType == FLIP_COLS) || + (rows == 1 && flipType == FLIP_ROWS) || + (rows == 1 && cols == 1 && flipType == FLIP_BOTH)) + { + src.copyTo(dst); + return; + } - int vector_lengths[4][7] = {{1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1} - }; + cols = flipType == FLIP_COLS ? 
divUp(cols, 2) : cols; + rows = flipType & FLIP_ROWS ? divUp(rows, 2) : rows; - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1); - int cols = divUp(dst.cols + offset_cols, vector_length); - cols = isVertical ? cols : divUp(cols, 2); - int rows = isVertical ? divUp(dst.rows, 2) : dst.rows; + const char * const channelMap[] = { "", "", "2", "4", "4" }; + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + std::string buildOptions = format("-D T=%s%s", typeMap[dst.depth()], channelMap[dst.oclchannels()]); size_t localThreads[3] = { 64, 4, 1 }; size_t globalThreads[3] = { cols, rows, 1 }; - int dst_step1 = dst.cols * dst.elemSize(); + int elemSize = src.elemSize(); + int src_step = src.step / elemSize, src_offset = src.offset / elemSize; + int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize; + std::vector > args; args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset )); args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset )); args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows )); args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols )); - if 
(isVertical) - args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows )); - else - args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols )); - - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 )); - - const cv::ocl::ProgramEntry* source = isVertical ? &arithm_flip_rc : &arithm_flip; - - openCLExecuteKernel(src.clCxt, source, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth); + openCLExecuteKernel(src.clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args, + -1, -1, buildOptions.c_str()); } void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode) @@ -783,11 +751,11 @@ void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode) dst.create(src.size(), src.type()); if (flipCode == 0) - arithmetic_flip_rows_run(src, dst, "arithm_flip_rows"); + arithmetic_flip_run(src, dst, "arithm_flip_rows", FLIP_ROWS); else if (flipCode > 0) - arithmetic_flip_cols_run(src, dst, "arithm_flip_cols", false); + arithmetic_flip_run(src, dst, "arithm_flip_cols", FLIP_COLS); else - arithmetic_flip_cols_run(src, dst, "arithm_flip_rc", true); + arithmetic_flip_run(src, dst, "arithm_flip_rows_cols", FLIP_BOTH); } ////////////////////////////////////////////////////////////////////////////// diff --git a/modules/ocl/src/blend.cpp b/modules/ocl/src/blend.cpp index c9bba13c94..39f09c47bd 100644 --- a/modules/ocl/src/blend.cpp +++ b/modules/ocl/src/blend.cpp @@ -49,35 +49,51 @@ using namespace cv; using namespace cv::ocl; -void cv::ocl::blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &weights1, const oclMat &weights2, - oclMat &result) +void cv::ocl::blendLinear(const oclMat &src1, const oclMat &src2, const oclMat &weights1, const oclMat &weights2, + oclMat &dst) { - cv::ocl::Context *ctx = img1.clCxt; - CV_Assert(ctx == img2.clCxt && ctx == weights1.clCxt && ctx == weights2.clCxt); - int channels = img1.oclchannels(); - int depth = img1.depth(); - int rows = img1.rows; - int cols = img1.cols; - int 
istep = img1.step1(); - int wstep = weights1.step1(); - size_t globalSize[] = {cols * channels / 4, rows, 1}; - size_t localSize[] = {256, 1, 1}; + CV_Assert(src1.depth() <= CV_32F); + CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + CV_Assert(weights1.size() == weights2.size() && weights1.size() == src1.size() && + weights1.type() == CV_32FC1 && weights2.type() == CV_32FC1); + + dst.create(src1.size(), src1.type()); + + size_t globalSize[] = { dst.cols, dst.rows, 1}; + size_t localSize[] = { 16, 16, 1 }; + + int depth = dst.depth(), ocn = dst.oclchannels(); + int src1_step = src1.step / src1.elemSize(), src1_offset = src1.offset / src1.elemSize(); + int src2_step = src2.step / src2.elemSize(), src2_offset = src2.offset / src2.elemSize(); + int weight1_step = weights1.step / weights1.elemSize(), weight1_offset = weights1.offset / weights1.elemSize(); + int weight2_step = weights2.step / weights2.elemSize(), weight2_offset = weights2.offset / weights2.elemSize(); + int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize(); + + const char * const channelMap[] = { "", "", "2", "4", "4" }; + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + std::string buildOptions = format("-D T=%s%s -D convertToT=convert_%s%s%s -D FT=float%s -D convertToFT=convert_float%s", + typeMap[depth], channelMap[ocn], typeMap[depth], channelMap[ocn], + depth >= CV_32S ? 
"" : "_sat_rte", channelMap[ocn], channelMap[ocn]); std::vector< std::pair > args; - result.create(img1.size(), CV_MAKE_TYPE(depth,img1.channels())); - if(globalSize[0] != 0) - { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data )); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&img1.data )); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&img2.data )); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights1.data )); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights2.data )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&istep )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&wstep )); - String kernelName = "BlendLinear"; + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_offset )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step )); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_offset )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step )); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights1.data )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight1_offset )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight1_step )); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights2.data )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight2_offset )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight2_step )); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step )); + 
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols )); - openCLExecuteKernel(ctx, &blend_linear, kernelName, globalSize, localSize, args, channels, depth); - } + openCLExecuteKernel(src1.clCxt, &blend_linear, "blendLinear", globalSize, localSize, args, + -1, -1, buildOptions.c_str()); } diff --git a/modules/ocl/src/canny.cpp b/modules/ocl/src/canny.cpp index 3f5de52748..8c68d8baca 100644 --- a/modules/ocl/src/canny.cpp +++ b/modules/ocl/src/canny.cpp @@ -49,7 +49,7 @@ using namespace cv; using namespace cv::ocl; -cv::ocl::CannyBuf::CannyBuf(const oclMat &dx_, const oclMat &dy_) : dx(dx_), dy(dy_), counter(NULL) +cv::ocl::CannyBuf::CannyBuf(const oclMat &dx_, const oclMat &dy_) : dx(dx_), dy(dy_), counter(1, 1, CV_32SC1) { CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size()); @@ -81,17 +81,8 @@ void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size) ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, magBuf); ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, mapBuf); - ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1); - ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2); - - int counter_i [1] = { 0 }; - int err = 0; - if(counter) - { - openCLFree(counter); - } - counter = clCreateBuffer( *((cl_context*)getClContextPtr()), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err ); - openCLSafeCall(err); + ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf1); + ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf2); } void cv::ocl::CannyBuf::release() @@ -104,11 +95,6 @@ void cv::ocl::CannyBuf::release() mapBuf.release(); trackBuf1.release(); trackBuf2.release(); - if(counter) - { - openCLFree(counter); - counter = NULL; - } } namespace cv @@ -124,9 +110,9 @@ namespace cv void calcMap_gpu(oclMat &dx, oclMat &dy, 
oclMat &mag, oclMat &map, int rows, int cols, float low_thresh, float high_thresh); - void edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols); + void edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, oclMat& counter, int rows, int cols); - void edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols); + void edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, oclMat& counter, int rows, int cols); void getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols); } @@ -320,54 +306,61 @@ void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int ro openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); } -void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols) +void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, oclMat& counter, int rows, int cols) { Context *clCxt = map.clCxt; - String kernelName = "edgesHysteresisLocal"; std::vector< std::pair > args; + Mat counterMat(counter.rows, counter.cols, counter.type()); + counterMat.at(0, 0) = 0; + counter.upload(counterMat); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data)); args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter)); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter.data)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset)); + cl_int stepBytes = map.step; + args.push_back( std::make_pair( sizeof(cl_int), (void *)&stepBytes)); + cl_int offsetBytes = map.offset; + args.push_back( std::make_pair( sizeof(cl_int), (void *)&offsetBytes)); size_t globalThreads[3] = {cols, rows, 
1}; size_t localThreads[3] = {16, 16, 1}; - openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisLocal", globalThreads, localThreads, args, -1, -1); } -void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols) +void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, oclMat& counter, int rows, int cols) { - unsigned int count; - openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL)); Context *clCxt = map.clCxt; - String kernelName = "edgesHysteresisGlobal"; std::vector< std::pair > args; size_t localThreads[3] = {128, 1, 1}; - int count_i[1] = {0}; - while(count > 0) + while(1 > 0) { - openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL)); + Mat counterMat; counter.download(counterMat); + int count = counterMat.at(0, 0); + CV_Assert(count >= 0); + if (count == 0) + break; + + counterMat.at(0, 0) = 0; + counter.upload(counterMat); args.clear(); - size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1}; + size_t globalThreads[3] = {std::min((unsigned)count, 65535u) * 128, divUp(count, 65535), 1}; args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data)); args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data)); args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st2.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter)); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter.data)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&count)); args.push_back( std::make_pair( sizeof(cl_int), (void 
*)&map.step)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset)); - openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); - openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL)); + openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisGlobal", globalThreads, localThreads, args, -1, -1); std::swap(st1, st2); } } diff --git a/modules/ocl/src/cl_context.cpp b/modules/ocl/src/cl_context.cpp index 2b3129d05e..bf5eaae50c 100644 --- a/modules/ocl/src/cl_context.cpp +++ b/modules/ocl/src/cl_context.cpp @@ -448,6 +448,17 @@ static int initializeOpenCLDevices() { deviceInfo.info.haveDoubleSupport = false; } + + size_t intel_platform = platformInfo.info.platformVendor.find("Intel"); + if(intel_platform != std::string::npos) + { + deviceInfo.info.compilationExtraOptions += " -D INTEL_DEVICE"; + deviceInfo.info.isIntelDevice = true; + } + else + { + deviceInfo.info.isIntelDevice = false; + } } } } @@ -471,7 +482,7 @@ DeviceInfo::DeviceInfo() deviceVendorId(-1), maxWorkGroupSize(0), maxComputeUnits(0), localMemorySize(0), maxMemAllocSize(0), deviceVersionMajor(0), deviceVersionMinor(0), - haveDoubleSupport(false), isUnifiedMemory(false), + haveDoubleSupport(false), isUnifiedMemory(false),isIntelDevice(false), platform(NULL) { // nothing @@ -572,6 +583,8 @@ bool ContextImpl::supportsFeature(FEATURE_TYPE featureType) const { switch (featureType) { + case FEATURE_CL_INTEL_DEVICE: + return deviceInfo.isIntelDevice; case FEATURE_CL_DOUBLE: return deviceInfo.haveDoubleSupport; case FEATURE_CL_UNIFIED_MEM: diff --git a/modules/ocl/src/cl_operations.cpp b/modules/ocl/src/cl_operations.cpp index f83220dae7..5910d05366 100644 --- a/modules/ocl/src/cl_operations.cpp +++ b/modules/ocl/src/cl_operations.cpp @@ -109,6 +109,31 @@ cl_mem openCLCreateBuffer(Context *ctx, size_t flag , size_t size) return buffer; } +//#define 
CHECK_MEMORY_CORRUPTION +#ifdef CHECK_MEMORY_CORRUPTION +//#define CHECK_MEMORY_CORRUPTION_PRINT_ERROR +#define CHECK_MEMORY_CORRUPTION_RAISE_ERROR +static const int __memory_corruption_check_bytes = 1024*1024; +static const int __memory_corruption_check_pattern = 0x14326547; // change pattern for sizeof(int)==8 +struct CheckBuffers +{ + cl_mem mainBuffer; + size_t size; + size_t widthInBytes, height; + CheckBuffers() + : mainBuffer(NULL), size(0), widthInBytes(0), height(0) + { + // nothing + } + CheckBuffers(cl_mem _mainBuffer, size_t _size, size_t _widthInBytes, size_t _height) + : mainBuffer(_mainBuffer), size(_size), widthInBytes(_widthInBytes), height(_height) + { + // notihng + } +}; +static std::map __check_buffers; +#endif + void openCLMallocPitch(Context *ctx, void **dev_ptr, size_t *pitch, size_t widthInBytes, size_t height) { @@ -119,9 +144,34 @@ void openCLMallocPitchEx(Context *ctx, void **dev_ptr, size_t *pitch, size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type) { cl_int status; + size_t size = widthInBytes * height; +#ifndef CHECK_MEMORY_CORRUPTION *dev_ptr = clCreateBuffer(getClContext(ctx), gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type], - widthInBytes * height, 0, &status); + size, 0, &status); + openCLVerifyCall(status); +#else + size_t allocSize = size + __memory_corruption_check_bytes * 2; + cl_mem mainBuffer = clCreateBuffer(getClContext(ctx), gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type], + allocSize, 0, &status); + openCLVerifyCall(status); + cl_buffer_region r = {__memory_corruption_check_bytes, size}; + *dev_ptr = clCreateSubBuffer(mainBuffer, + gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type], + CL_BUFFER_CREATE_TYPE_REGION, &r, + &status); openCLVerifyCall(status); + std::vector tmp(__memory_corruption_check_bytes / sizeof(int), + __memory_corruption_check_pattern); + CV_Assert(tmp.size() * sizeof(int) == __memory_corruption_check_bytes); + 
openCLVerifyCall(clEnqueueWriteBuffer(getClCommandQueue(ctx), + mainBuffer, CL_TRUE, 0, __memory_corruption_check_bytes, &tmp[0], + 0, NULL, NULL)); + openCLVerifyCall(clEnqueueWriteBuffer(getClCommandQueue(ctx), + mainBuffer, CL_TRUE, __memory_corruption_check_bytes + size, __memory_corruption_check_bytes, &tmp[0], + 0, NULL, NULL)); + CheckBuffers data(mainBuffer, size, widthInBytes, height); + __check_buffers.insert(std::pair((cl_mem)*dev_ptr, data)); +#endif *pitch = widthInBytes; } @@ -174,7 +224,59 @@ void openCLCopyBuffer2D(Context *ctx, void *dst, size_t dpitch, int dst_offset, void openCLFree(void *devPtr) { +#ifdef CHECK_MEMORY_CORRUPTION + bool failBefore = false, failAfter = false; + CheckBuffers data; + std::map::iterator i = __check_buffers.find((cl_mem)devPtr); + if (i != __check_buffers.end()) + { + data = i->second; + Context* ctx = Context::getContext(); + std::vector checkBefore(__memory_corruption_check_bytes); + std::vector checkAfter(__memory_corruption_check_bytes); + openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(ctx), + data.mainBuffer, CL_TRUE, 0, __memory_corruption_check_bytes, &checkBefore[0], + 0, NULL, NULL)); + openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(ctx), + data.mainBuffer, CL_TRUE, __memory_corruption_check_bytes + data.size, __memory_corruption_check_bytes, &checkAfter[0], + 0, NULL, NULL)); + + std::vector tmp(__memory_corruption_check_bytes / sizeof(int), + __memory_corruption_check_pattern); + + if (memcmp(&checkBefore[0], &tmp[0], __memory_corruption_check_bytes) != 0) + { + failBefore = true; + } + if (memcmp(&checkAfter[0], &tmp[0], __memory_corruption_check_bytes) != 0) + { + failAfter = true; + } + openCLSafeCall(clReleaseMemObject(data.mainBuffer)); + __check_buffers.erase(i); + } +#endif openCLSafeCall(clReleaseMemObject((cl_mem)devPtr)); +#ifdef CHECK_MEMORY_CORRUPTION + if (failBefore) + { +#ifdef CHECK_MEMORY_CORRUPTION_PRINT_ERROR + std::cerr << "ERROR: Memory corruption detected: before 
buffer: " << cv::format("widthInBytes=%d height=%d", (int)data.widthInBytes, (int)data.height) << std::endl; +#endif +#ifdef CHECK_MEMORY_CORRUPTION_RAISE_ERROR + CV_Error(CV_StsInternal, "Memory corruption detected: before buffer"); +#endif + } + if (failAfter) + { +#ifdef CHECK_MEMORY_CORRUPTION_PRINT_ERROR + std::cerr << "ERROR: Memory corruption detected: after buffer: " << cv::format("widthInBytes=%d height=%d", (int)data.widthInBytes, (int)data.height) << std::endl; +#endif +#ifdef CHECK_MEMORY_CORRUPTION_RAISE_ERROR + CV_Error(CV_StsInternal, "Memory corruption detected: after buffer"); +#endif + } +#endif } cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName) @@ -234,8 +336,7 @@ static std::string removeDuplicatedWhiteSpaces(const char * buildOptions) return opt; } -void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], - size_t localThreads[3], std::vector< std::pair > &args, int channels, +cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, int channels, int depth, const char *build_options) { //construct kernel name @@ -248,10 +349,14 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, Str idxStr << "_D" << depth; kernelName = kernelName + idxStr.str(); - cl_kernel kernel; std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options); - kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str()); + cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str()); + return kernel; +} +void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3], + size_t localThreads[3], std::vector< std::pair > &args) +{ if ( localThreads != NULL) { globalThreads[0] = roundUp(globalThreads[0], localThreads[0]); @@ -297,6 +402,15 @@ void openCLExecuteKernel_(Context *ctx, const 
cv::ocl::ProgramEntry* source, Str openCLSafeCall(clReleaseKernel(kernel)); } +void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], + size_t localThreads[3], std::vector< std::pair > &args, int channels, + int depth, const char *build_options) +{ + cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, channels, depth, build_options); + + openCLExecuteKernel(ctx, kernel, globalThreads, localThreads, args); +} + void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth) diff --git a/modules/ocl/src/cl_programcache.cpp b/modules/ocl/src/cl_programcache.cpp index 1254e30634..245bf3330a 100644 --- a/modules/ocl/src/cl_programcache.cpp +++ b/modules/ocl/src/cl_programcache.cpp @@ -428,7 +428,7 @@ struct ProgramFileCache if(status != CL_SUCCESS) { - if(status == CL_BUILD_PROGRAM_FAILURE) + if (status == CL_BUILD_PROGRAM_FAILURE || status == CL_INVALID_BUILD_OPTIONS) { size_t buildLogSize = 0; openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx), diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 816988db78..305c723afd 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -11,7 +11,7 @@ // For Open Source Computer Vision Library // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // @Authors @@ -69,37 +69,14 @@ inline void normalizeAnchor(Point &anchor, const Size &ksize) normalizeAnchor(anchor.y, ksize.height); } -inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, const Size &src_size) +inline void normalizeROI(Rect &roi, const Size &ksize, const Point &/*anchor*/, const Size &src_size) { if (roi == Rect(0, 0, -1, -1)) roi = Rect(0, 0, src_size.width, src_size.height); CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); - CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1)); CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height); } - - -inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8U, int *nDivisor = 0, bool reverse = false) -{ - int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1; - - if (nDivisor) - *nDivisor = scale; - Mat temp(kernel.size(), type); - kernel.convertTo(temp, type, scale); - Mat cont_krnl = temp.reshape(1, 1); - - if (reverse) - { - int count = cont_krnl.cols >> 1; - - for (int i = 0; i < count; ++i) - std::swap(cont_krnl.at(0, i), cont_krnl.at(0, cont_krnl.cols - 1 - i)); - } - - gpu_krnl.upload(cont_krnl); -} } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -168,7 +145,7 @@ typedef void (*GPUMorfFilter_t)(const oclMat & , oclMat & , oclMat & , Size &, c class MorphFilter_GPU : public BaseFilter_GPU { public: - MorphFilter_GPU(const Size &ksize_, const Point &anchor_, const oclMat &kernel_, GPUMorfFilter_t func_) : + MorphFilter_GPU(const Size &ksize_, const Point &anchor_, const Mat &kernel_, GPUMorfFilter_t func_) : BaseFilter_GPU(ksize_, anchor_, BORDER_CONSTANT), kernel(kernel_), func(func_), rectKernel(false) {} virtual void operator()(const oclMat &src, oclMat &dst) @@ -345,27 +322,22 @@ static void 
GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); } -Ptr cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize, Point anchor) +Ptr cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat &_kernel, const Size &ksize, Point anchor) { - static const GPUMorfFilter_t GPUMorfFilter_callers[2][5] = - { - {0, GPUErode, 0, GPUErode, GPUErode }, - {0, GPUDilate, 0, GPUDilate, GPUDilate} - }; - CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE); CV_Assert(type == CV_8UC1 || type == CV_8UC3 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC3 || type == CV_32FC4); - oclMat gpu_krnl; - normalizeKernel(kernel, gpu_krnl); normalizeAnchor(anchor, ksize); + Mat kernel8U; + _kernel.convertTo(kernel8U, CV_8U); + Mat kernel = kernel8U.reshape(1, 1); bool noZero = true; for(int i = 0; i < kernel.rows * kernel.cols; ++i) - if(kernel.data[i] != 1) + if(kernel.at(i) != 1) noZero = false; - MorphFilter_GPU* mfgpu = new MorphFilter_GPU(ksize, anchor, gpu_krnl, GPUMorfFilter_callers[op][CV_MAT_CN(type)]); + MorphFilter_GPU* mfgpu = new MorphFilter_GPU(ksize, anchor, kernel, op == MORPH_ERODE ? 
GPUErode : GPUDilate); if(noZero) mfgpu->rectKernel = true; @@ -445,14 +417,15 @@ void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point a else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols) { anchor = Point(anchor.x * iterations, anchor.y * iterations); - kernel = getStructuringElement(MORPH_RECT, Size(ksize.width + iterations * (ksize.width - 1), - ksize.height + iterations * (ksize.height - 1)), anchor); + kernel = getStructuringElement(MORPH_RECT, Size(ksize.width + (iterations - 1) * (ksize.width - 1), + ksize.height + (iterations - 1) * (ksize.height - 1)), anchor); iterations = 1; } else kernel = _kernel; - Ptr f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations); + Ptr f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations) + .staticCast(); f->apply(src, dst); } @@ -525,12 +498,12 @@ void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &ke namespace { -typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , const oclMat & , const Size &, const Point&, const int); +typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , const Mat & , const Size &, const Point&, const int); class LinearFilter_GPU : public BaseFilter_GPU { public: - LinearFilter_GPU(const Size &ksize_, const Point &anchor_, const oclMat &kernel_, GPUFilter2D_t func_, + LinearFilter_GPU(const Size &ksize_, const Point &anchor_, const Mat &kernel_, GPUFilter2D_t func_, int borderType_) : BaseFilter_GPU(ksize_, anchor_, borderType_), kernel(kernel_), func(func_) {} @@ -539,123 +512,217 @@ public: func(src, dst, kernel, ksize, anchor, borderType) ; } - oclMat kernel; + Mat kernel; GPUFilter2D_t func; }; } -static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel, +// prepare kernel: transpose and make double rows (+align). 
Returns size of aligned row +// Samples: +// a b c +// Input: d e f +// g h i +// Output, last two zeros is the alignment: +// a d g a d g 0 0 +// b e h b e h 0 0 +// c f i c f i 0 0 +template +static int _prepareKernelFilter2D(std::vector& data, const Mat &kernel) +{ + Mat _kernel; kernel.convertTo(_kernel, DataDepth::value); + int size_y_aligned = roundUp(kernel.rows * 2, 4); + data.clear(); data.resize(size_y_aligned * kernel.cols, 0); + for (int x = 0; x < kernel.cols; x++) + { + for (int y = 0; y < kernel.rows; y++) + { + data[x * size_y_aligned + y] = _kernel.at(y, x); + data[x * size_y_aligned + y + kernel.rows] = _kernel.at(y, x); + } + } + return size_y_aligned; +} + +static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel, const Size &ksize, const Point& anchor, const int borderType) { CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && (src.rows == dst.rows)); - CV_Assert((src.oclchannels() == dst.oclchannels())); - CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); - CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1)); - CV_Assert(ksize.width == ksize.height); - Context *clCxt = src.clCxt; - - int filterWidth = ksize.width; - bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4 && src.type() != CV_32FC3; // CV_32FC4 is not tuned up with filter2d_3x3 kernel + CV_Assert(src.oclchannels() == dst.oclchannels()); - String kernelName = ksize_3x3 ? 
"filter2D_3x3" : "filter2D"; + CV_Assert(kernel.cols == ksize.width && kernel.rows == ksize.height); + CV_Assert(kernel.channels() == 1); - size_t src_offset_x = (src.offset % src.step) / src.elemSize(); - size_t src_offset_y = src.offset / src.step; + CV_Assert(anchor.x >= 0 && anchor.x < kernel.cols); + CV_Assert(anchor.y >= 0 && anchor.y < kernel.rows); - size_t dst_offset_x = (dst.offset % dst.step) / dst.elemSize(); - size_t dst_offset_y = dst.offset / dst.step; + bool useDouble = src.depth() == CV_64F; - int paddingPixels = filterWidth & (-2); + std::vector kernelDataFloat; + std::vector kernelDataDouble; + int kernel_size_y2_aligned = useDouble ? + _prepareKernelFilter2D(kernelDataDouble, kernel) + : _prepareKernelFilter2D(kernelDataFloat, kernel); + oclMat oclKernelParameter; + if (useDouble) + { + oclKernelParameter.createEx(1, kernelDataDouble.size(), CV_64FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT); + openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataDouble.size()*sizeof(double), + &kernelDataDouble[0], kernelDataDouble.size()*sizeof(double), + kernelDataDouble.size()*sizeof(double), 1, clMemcpyHostToDevice); + } + else + { + oclKernelParameter.createEx(1, kernelDataFloat.size(), CV_32FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT); + openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataFloat.size()*sizeof(float), + &kernelDataFloat[0], kernelDataFloat.size()*sizeof(float), + kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice); + } - size_t localThreads[3] = {ksize_3x3 ? 256 : 16, ksize_3x3 ? 
1 : 16, 1}; - size_t globalThreads[3] = {src.wholecols, src.wholerows, 1}; + size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; + do { + size_t BLOCK_SIZE = tryWorkItems; + while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2) + BLOCK_SIZE /= 2; +#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices + size_t BLOCK_SIZE_Y = 1; +#else + size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices + while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) + BLOCK_SIZE_Y *= 2; +#endif + + CV_Assert((size_t)ksize.width <= BLOCK_SIZE); + + bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; + + std::vector > args; + + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data)); + cl_uint stepBytes = src.step; + args.push_back( std::make_pair( sizeof(cl_uint), (void *)&stepBytes)); + int offsetXBytes = src.offset % src.step; + int offsetX = offsetXBytes / src.elemSize(); + CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes); + int offsetY = src.offset / src.step; + int endX = (offsetX + src.cols); + int endY = (offsetY + src.rows); + cl_int rect[4] = {offsetX, offsetY, endX, endY}; + if (!isIsolatedBorder) + { + rect[2] = src.wholecols; + rect[3] = src.wholerows; + } + args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&rect[0])); + + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data)); + cl_uint _stepBytes = dst.step; + args.push_back( std::make_pair( sizeof(cl_uint), (void *)&_stepBytes)); + int _offsetXBytes = dst.offset % dst.step; + int _offsetX = _offsetXBytes / dst.elemSize(); + CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); + int _offsetY = dst.offset / dst.step; + int _endX = (_offsetX + dst.cols); + int _endY = (_offsetY + dst.rows); + cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; + 
args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); + + float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) + { + if (useDouble) + args.push_back( std::make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); + else + args.push_back( std::make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); + } - int cn = src.oclchannels(); - int src_step = (int)(src.step/src.elemSize()); - int dst_step = (int)(dst.step/src.elemSize()); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data)); - int localWidth = localThreads[0] + paddingPixels; - int localHeight = localThreads[1] + paddingPixels; + const char* btype = NULL; - size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize(); + switch (borderType & ~BORDER_ISOLATED) + { + case BORDER_CONSTANT: + btype = "BORDER_CONSTANT"; + break; + case BORDER_REPLICATE: + btype = "BORDER_REPLICATE"; + break; + case BORDER_REFLECT: + btype = "BORDER_REFLECT"; + break; + case BORDER_WRAP: + CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); + return; + case BORDER_REFLECT101: + btype = "BORDER_REFLECT_101"; + break; + } - int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4}, - {4, 4, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 4} - }; - int cols = dst.cols + ((dst_offset_x) & (vector_lengths[cn - 1][src.depth()] - 1)); + int requiredTop = anchor.y; + int requiredLeft = BLOCK_SIZE; // not this: anchor.x; + int requiredBottom = ksize.height - 1 - anchor.y; + int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; + int h = isIsolatedBorder ? src.rows : src.wholerows; + int w = isIsolatedBorder ? 
src.cols : src.wholecols; + bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; + + char build_options[1024]; + sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d " + "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d " + "-D %s -D %s -D %s", + (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, + src.depth(), src.oclchannels(), useDouble ? 1 : 0, + anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned, + btype, + extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", + isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); + + size_t lt[3] = {BLOCK_SIZE, 1, 1}; + size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}; + + cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options); + + size_t kernelWorkGroupSize; + openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt), + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); + if (lt[0] > kernelWorkGroupSize) + { + clReleaseKernel(kernel); + CV_Assert(BLOCK_SIZE > kernelWorkGroupSize); + tryWorkItems = kernelWorkGroupSize; + continue; + } - std::vector< std::pair > args; - args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_step)); - args.push_back(std::make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); - args.push_back(std::make_pair(localMemSize, (void *)NULL)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols)); - args.push_back(std::make_pair(sizeof(cl_int), (void 
*)&src_offset_x)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset_y)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_offset_x)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_offset_y)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&cols)); - char btype[30]; - switch (borderType) - { - case 0: - sprintf(btype, "BORDER_CONSTANT"); - break; - case 1: - sprintf(btype, "BORDER_REPLICATE"); - break; - case 2: - sprintf(btype, "BORDER_REFLECT"); - break; - case 3: - CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case 4: - sprintf(btype, "BORDER_REFLECT_101"); - break; - } - int type = src.depth(); - char build_options[150]; - sprintf(build_options, "-D %s -D IMG_C_%d_%d -D CN=%d -D FILTER_SIZE=%d", btype, cn, type, cn, ksize.width); - openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, -1, -1, build_options); + openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here + } while (false); } -Ptr cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize, +Ptr cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize, const Point &anchor, int borderType) { - static const GPUFilter2D_t GPUFilter2D_callers[] = {0, GPUFilter2D, 0, GPUFilter2D, GPUFilter2D}; - - CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType); - - oclMat gpu_krnl; Point norm_archor = anchor; - normalizeKernel(kernel, gpu_krnl, CV_32FC1); normalizeAnchor(norm_archor, ksize); - return makePtr(ksize, anchor, gpu_krnl, GPUFilter2D_callers[CV_MAT_CN(srcType)], - borderType); + return Ptr(new LinearFilter_GPU(ksize, 
norm_archor, kernel, GPUFilter2D, + borderType)); } Ptr cv::ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor, int borderType) { - Size ksize = kernel.size(); + Size ksize = kernel.size(); // TODO remove duplicated parameter Ptr linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor, borderType); return createFilter2D_GPU(linearFilter); } -void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, int borderType) +void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, double delta, int borderType) { + CV_Assert(delta == 0); + if (ddepth < 0) ddepth = src.depth(); @@ -714,276 +781,146 @@ Ptr cv::ocl::createSeparableFilter_GPU(const Ptr(rowFilter, columnFilter); } -/* -**data type supported: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4 -**support four border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101 -*/ - -static void GPUFilterBox_8u_C1R(const oclMat &src, oclMat &dst, - Size &ksize, const Point anchor, const int borderType) -{ - //Normalize the result by default - float alpha = ksize.height * ksize.width; - - CV_Assert(src.clCxt == dst.clCxt); - CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows)); - Context *clCxt = src.clCxt; - - String kernelName = "boxFilter_C1_D0"; - - char btype[30]; - - switch (borderType) - { - case 0: - sprintf(btype, "BORDER_CONSTANT"); - break; - case 1: - sprintf(btype, "BORDER_REPLICATE"); - break; - case 2: - sprintf(btype, "BORDER_REFLECT"); - break; - case 3: - CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case 4: - sprintf(btype, "BORDER_REFLECT_101"); - break; - } - - char build_options[150]; - sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype); - - size_t blockSizeX = 256, blockSizeY = 1; - size_t gSize = blockSizeX - (ksize.width - 1); - 
size_t threads = (dst.offset % dst.step % 4 + dst.cols + 3) / 4; - size_t globalSizeX = threads % gSize == 0 ? threads / gSize * blockSizeX : (threads / gSize + 1) * blockSizeX; - size_t globalSizeY = ((dst.rows + 1) / 2) % blockSizeY == 0 ? ((dst.rows + 1) / 2) : (((dst.rows + 1) / 2) / blockSizeY + 1) * blockSizeY; - - size_t globalThreads[3] = { globalSizeX, globalSizeY, 1 }; - size_t localThreads[3] = { blockSizeX, blockSizeY, 1 }; - - std::vector > args; - args.push_back(std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), &dst.data)); - args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step)); - - openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options); -} - -static void GPUFilterBox_8u_C4R(const oclMat &src, oclMat &dst, +static void GPUFilterBox(const oclMat &src, oclMat &dst, Size &ksize, const Point anchor, const int borderType) { //Normalize the result by default - float alpha = ksize.height * ksize.width; + float alpha = 1.0f / (ksize.height * ksize.width); CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && (src.rows == dst.rows)); - Context *clCxt = src.clCxt; - - String kernelName = "boxFilter_C4_D0"; - - char btype[30]; - - switch (borderType) - { - case 0: - sprintf(btype, "BORDER_CONSTANT"); - break; - case 1: - sprintf(btype, "BORDER_REPLICATE"); - break; - case 2: 
- sprintf(btype, "BORDER_REFLECT"); - break; - case 3: - CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case 4: - sprintf(btype, "BORDER_REFLECT_101"); - break; - } - - char build_options[150]; - sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype); - - size_t blockSizeX = 256, blockSizeY = 1; - size_t gSize = blockSizeX - ksize.width / 2 * 2; - size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX; - size_t rows_per_thread = 2; - size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY; - - size_t globalThreads[3] = { globalSizeX, globalSizeY, 1}; - size_t localThreads[3] = { blockSizeX, blockSizeY, 1}; - - std::vector > args; - args.push_back(std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), &dst.data)); - args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step)); - - openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options); -} - -static void GPUFilterBox_32F_C1R(const oclMat &src, oclMat &dst, - Size &ksize, const Point anchor, const int borderType) -{ - 
//Normalize the result by default - float alpha = ksize.height * ksize.width; - - CV_Assert(src.clCxt == dst.clCxt); - CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows)); - Context *clCxt = src.clCxt; - - String kernelName = "boxFilter_C1_D5"; - - char btype[30]; - - switch (borderType) - { - case 0: - sprintf(btype, "BORDER_CONSTANT"); - break; - case 1: - sprintf(btype, "BORDER_REPLICATE"); - break; - case 2: - sprintf(btype, "BORDER_REFLECT"); - break; - case 3: - CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case 4: - sprintf(btype, "BORDER_REFLECT_101"); - break; - } - - char build_options[150]; - sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype); - - size_t blockSizeX = 256, blockSizeY = 1; - size_t gSize = blockSizeX - ksize.width / 2 * 2; - size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX; - size_t rows_per_thread = 2; - size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? 
((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY; - - - size_t globalThreads[3] = { globalSizeX, globalSizeY, 1}; - size_t localThreads[3] = { blockSizeX, blockSizeY, 1}; - - std::vector > args; - args.push_back(std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), &dst.data)); - args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step)); - - openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options); -} - -static void GPUFilterBox_32F_C4R(const oclMat &src, oclMat &dst, - Size &ksize, const Point anchor, const int borderType) -{ - //Normalize the result by default - float alpha = ksize.height * ksize.width; - - CV_Assert(src.clCxt == dst.clCxt); - CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows)); - Context *clCxt = src.clCxt; - - String kernelName = "boxFilter_C4_D5"; - - char btype[30]; - - switch (borderType) - { - case 0: - sprintf(btype, "BORDER_CONSTANT"); - break; - case 1: - sprintf(btype, "BORDER_REPLICATE"); - break; - case 2: - sprintf(btype, "BORDER_REFLECT"); - break; - case 3: - CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case 4: - sprintf(btype, "BORDER_REFLECT_101"); - break; - } + CV_Assert(src.oclchannels() == dst.oclchannels()); - char 
build_options[150]; - sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype); + size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; + do { + size_t BLOCK_SIZE = tryWorkItems; + while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2) + BLOCK_SIZE /= 2; + size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices + while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) + BLOCK_SIZE_Y *= 2; + + CV_Assert((size_t)ksize.width <= BLOCK_SIZE); + + bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; + + std::vector > args; + + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data)); + cl_uint stepBytes = src.step; + args.push_back( std::make_pair( sizeof(cl_uint), (void *)&stepBytes)); + int offsetXBytes = src.offset % src.step; + int offsetX = offsetXBytes / src.elemSize(); + CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes); + int offsetY = src.offset / src.step; + int endX = (offsetX + src.cols); + int endY = (offsetY + src.rows); + cl_int rect[4] = {offsetX, offsetY, endX, endY}; + if (!isIsolatedBorder) + { + rect[2] = src.wholecols; + rect[3] = src.wholerows; + } + args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&rect[0])); + + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data)); + cl_uint _stepBytes = dst.step; + args.push_back( std::make_pair( sizeof(cl_uint), (void *)&_stepBytes)); + int _offsetXBytes = dst.offset % dst.step; + int _offsetX = _offsetXBytes / dst.elemSize(); + CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); + int _offsetY = dst.offset / dst.step; + int _endX = (_offsetX + dst.cols); + int _endY = (_offsetY + dst.rows); + cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; + args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); + + bool useDouble = 
src.depth() == CV_64F; + + float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) + { + if (useDouble) + args.push_back( std::make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); + else + args.push_back( std::make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); + } - size_t blockSizeX = 256, blockSizeY = 1; - size_t gSize = blockSizeX - ksize.width / 2 * 2; - size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX; - size_t rows_per_thread = 2; - size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY; + double alphaDouble = alpha; // DON'T move into 'if' body + if (useDouble) + args.push_back( std::make_pair( sizeof(double), (void *)&alphaDouble)); + else + args.push_back( std::make_pair( sizeof(float), (void *)&alpha)); + const char* btype = NULL; - size_t globalThreads[3] = { globalSizeX, globalSizeY, 1}; - size_t localThreads[3] = { blockSizeX, blockSizeY, 1}; + switch (borderType & ~BORDER_ISOLATED) + { + case BORDER_CONSTANT: + btype = "BORDER_CONSTANT"; + break; + case BORDER_REPLICATE: + btype = "BORDER_REPLICATE"; + break; + case BORDER_REFLECT: + btype = "BORDER_REFLECT"; + break; + case BORDER_WRAP: + CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); + return; + case BORDER_REFLECT101: + btype = "BORDER_REFLECT_101"; + break; + } - std::vector > args; - args.push_back(std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), &dst.data)); - args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset)); - 
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step)); + int requiredTop = anchor.y; + int requiredLeft = BLOCK_SIZE; // not this: anchor.x; + int requiredBottom = ksize.height - 1 - anchor.y; + int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; + int h = isIsolatedBorder ? src.rows : src.wholerows; + int w = isIsolatedBorder ? src.cols : src.wholecols; + bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; + + CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well + + char build_options[1024]; + sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s", + (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, + src.depth(), src.oclchannels(), useDouble ? 1 : 0, + anchor.x, anchor.y, ksize.width, ksize.height, + btype, + extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", + isIsolatedBorder ? 
"BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); + + size_t lt[3] = {BLOCK_SIZE, 1, 1}; + size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}; + + cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options); + + size_t kernelWorkGroupSize; + openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt), + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); + if (lt[0] > kernelWorkGroupSize) + { + clReleaseKernel(kernel); + CV_Assert(BLOCK_SIZE > kernelWorkGroupSize); + tryWorkItems = kernelWorkGroupSize; + continue; + } - openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options); + openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here + } while (false); } - -Ptr cv::ocl::getBoxFilter_GPU(int srcType, int dstType, +Ptr cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/, const Size &ksize, Point anchor, int borderType) { - static const FilterBox_t FilterBox_callers[2][5] = {{0, GPUFilterBox_8u_C1R, 0, GPUFilterBox_8u_C4R, GPUFilterBox_8u_C4R}, - {0, GPUFilterBox_32F_C1R, 0, GPUFilterBox_32F_C4R, GPUFilterBox_32F_C4R} - }; - //Remove this check if more data types need to be supported. 
- CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 || - srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType); - normalizeAnchor(anchor, ksize); - return makePtr(ksize, anchor, - borderType, FilterBox_callers[(CV_MAT_DEPTH(srcType) == CV_32F)][CV_MAT_CN(srcType)]); + return Ptr(new GPUBoxFilter(ksize, anchor, + borderType, GPUFilterBox)); } Ptr cv::ocl::createBoxFilter_GPU(int srcType, int dstType, @@ -1373,8 +1310,11 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, bordertype); } -void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale) +void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale, + double delta, int borderType) { + CV_Assert(delta == 0); + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F) { CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double"); @@ -1383,17 +1323,17 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d CV_Assert(ksize == 1 || ksize == 3); - int K[2][9] = + double K[2][9] = { {0, 1, 0, 1, -4, 1, 0, 1, 0}, {2, 0, 2, 0, -8, 0, 2, 0, 2} }; - Mat kernel(3, 3, CV_32S, (void *)K[ksize == 3]); + Mat kernel(3, 3, CV_64F, (void *)K[ksize == 3 ? 
1 : 0]); if (scale != 1) kernel *= scale; - filter2D(src, dst, ddepth, kernel, Point(-1, -1)); + filter2D(src, dst, ddepth, kernel, Point(-1, -1), 0, borderType); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1431,6 +1371,15 @@ Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype) { + if (bordertype != BORDER_CONSTANT) + { + if (src.rows == 1) + ksize.height = 1; + + if (src.cols == 1) + ksize.width = 1; + } + if (ksize.width == 1 && ksize.height == 1) { src.copyTo(dst); @@ -1453,15 +1402,6 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si dst.create(src.size(), src.type()); - if (bordertype != BORDER_CONSTANT) - { - if (src.rows == 1) - ksize.height = 1; - - if (src.cols == 1) - ksize.width = 1; - } - Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype); f->apply(src, dst); } diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 8116496362..fd67daf1e3 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -849,16 +849,138 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vectoris_stump_based ? 
"-D STUMP_BASED=1" : "-D STUMP_BASED=0"; + if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + //setup local group size + localThreads[0] = 8; + localThreads[1] = 16; + localThreads[2] = 1; + + //init maximal number of workgroups + int WGNumX = 1+(sizev[0].width /(localThreads[0])); + int WGNumY = 1+(sizev[0].height/(localThreads[1])); + int WGNumZ = loopcount; + int WGNum = 0; //accurate number of non -empty workgroups + oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U); + { + cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status); + openCLVerifyCall(status); + for(int z=0;z> 16)&0xFFFF; + int Height = (scaleinfo[z].width_height >> 0 )& 0xFFFF; + for(int y=0;y=(Height-cascade->orig_window_size.height)) + continue; // no data to process + for(int x=0;x=(Width-cascade->orig_window_size.width)) + continue; // no data to process + + // save no-empty workgroup info into array + pWGInfo[WGNum].s[0] = scaleinfo[z].width_height; + pWGInfo[WGNum].s[1] = (gx << 16) | gy; + pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff; + memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float)); + WGNum++; + } + } + } + openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,pWGInfo,0,0,0)); + pWGInfo = NULL; + } + + // setup global sizes to have linear array of workgroups with WGNum size + globalThreads[0] = localThreads[0]*WGNum; + globalThreads[1] = localThreads[1]; + globalThreads[2] = 1; + +#define NODE_SIZE 12 + // pack node info to have less memory loads + oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U); + { + cl_int status; + cl_int* pNodesPK = (cl_int*)clEnqueueMapBuffer(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,true,CL_MAP_WRITE, 0, oclNodesPK.step, 0,0,0,&status); + openCLVerifyCall(status); + //use known local data stride to 
precalculate indexes + int DATA_SIZE_X = (localThreads[0]+cascade->orig_window_size.width); + // check that maximal value is less than maximal unsigned short + assert(DATA_SIZE_X*cascade->orig_window_size.height+cascade->orig_window_size.width < USHRT_MAX); + for(int i = 0;islm_index[k][0] = (unsigned short)(p[1]*DATA_SIZE_X+p[0]); + pOut->slm_index[k][1] = (unsigned short)(p[1]*DATA_SIZE_X+p[2]); + pOut->slm_index[k][2] = (unsigned short)(p[3]*DATA_SIZE_X+p[0]); + pOut->slm_index[k][3] = (unsigned short)(p[3]*DATA_SIZE_X+p[2]); + } + //store used float point values for each node + pOut->weight[0] = node[i].weight[0]; + pOut->weight[1] = node[i].weight[1]; + pOut->weight[2] = node[i].weight[2]; + pOut->threshold = node[i].threshold; + pOut->alpha[0] = node[i].alpha[0]; + pOut->alpha[1] = node[i].alpha[1]; + } + openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,pNodesPK,0,0,0)); + pNodesPK = NULL; + } + // add 2 additional buffers (WGinfo and packed nodes) as 2 last args + args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&oclNodesPK.datastart )); + args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&oclWGInfo.datastart )); + + //form build options for kernel + String options = "-D PACKED_CLASSIFIER"; + options = options + format(" -D NODE_SIZE=%d",NODE_SIZE); + options = options + format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width); + options = options + format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height); + options = options + format(" -D STUMP_BASED=%d",gcascade->is_stump_based); + options = options + format(" -D LSx=%d",localThreads[0]); + options = options + format(" -D LSy=%d",localThreads[1]); + options = options + format(" -D SPLITNODE=%d",splitnode); + options = options + format(" -D SPLITSTAGE=%d",splitstage); + options = options + format(" -D OUTPUTSZ=%d",outputsz); + + // init candidate global count by 0 + int pattern = 0; + openCLSafeCall(clEnqueueWriteBuffer(qu, 
candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL)); + // execute face detector + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str()); + //read candidate buffer back and put it into host list + openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); + assert(candidate[0]is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0"; - openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options); + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options); - openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); + openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); - for(int i = 0; i < outputsz; i++) - if(candidate[4 * i + 2] != 0) - allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], - candidate[4 * i + 2], candidate[4 * i + 3])); + for(int i = 0; i < outputsz; i++) + if(candidate[4 * i + 2] != 0) + allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], + candidate[4 * i + 2], candidate[4 * i + 3])); + } free(scaleinfo); free(candidate); @@ -934,11 +1056,11 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vectorp0 + 0.5); - int equRect_y = (int)(factor * gcascade->p1 + 0.5); - int equRect_w = (int)(factor * gcascade->p3 + 0.5); - int equRect_h = (int)(factor * gcascade->p2 + 0.5); + double ystep = std::max(2., factor); + int equRect_x = cvRound(factor * gcascade->p0); + int equRect_y = cvRound(factor * gcascade->p1); + int equRect_w = cvRound(factor * gcascade->p3); + int equRect_h = cvRound(factor * gcascade->p2); p[i].s[0] = equRect_x; p[i].s[1] = equRect_y; p[i].s[2] = equRect_x + equRect_w; diff --git a/modules/ocl/src/imgproc.cpp 
b/modules/ocl/src/imgproc.cpp index ed39868c76..96bdb91327 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -99,79 +99,85 @@ namespace cv ///////////////////////////////////////////////////////////////////////////////////// // threshold - typedef void (*gpuThresh_t)(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type); - - static void threshold_8u(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type) + static std::vector scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn) { - uchar thresh_uchar = cvFloor(thresh); - uchar max_val = cvRound(maxVal); + CV_Assert(ocn == cn || (ocn == 4 && cn == 3)); - size_t cols = (dst.cols + (dst.offset % 16) + 15) / 16; - size_t bSizeX = 16, bSizeY = 16; - size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX; - size_t gSizeY = dst.rows; - size_t globalThreads[3] = {gSizeX, gSizeY, 1}; - size_t localThreads[3] = {bSizeX, bSizeY, 1}; + static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort), + sizeof(short), sizeof(int), sizeof(float), sizeof(double) }; - std::vector< std::pair > args; - args.push_back( std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back( std::make_pair(sizeof(cl_mem), &dst.data)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.step)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.step)); - args.push_back( std::make_pair(sizeof(cl_uchar), (void *)&thresh_uchar)); - args.push_back( std::make_pair(sizeof(cl_uchar), (void *)&max_val)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&type)); - openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", 
globalThreads, localThreads, args, src.oclchannels(), src.depth()); + int elemSize1 = sizeMap[depth]; + int bufSize = elemSize1 * ocn; + std::vector _buf(bufSize); + uchar * buf = &_buf[0]; + scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn)); + memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1); + + return _buf; } - static void threshold_32f(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type) + static void threshold_runner(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType) { - float thresh_f = thresh; - float max_val = maxVal; - int dst_offset = (dst.offset >> 2); - int dst_step = (dst.step >> 2); - int src_offset = (src.offset >> 2); - int src_step = (src.step >> 2); - - size_t cols = (dst.cols + (dst_offset & 3) + 3) / 4; - size_t bSizeX = 16, bSizeY = 16; - size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX; - size_t gSizeY = dst.rows; - size_t globalThreads[3] = {gSizeX, gSizeY, 1}; - size_t localThreads[3] = {bSizeX, bSizeY, 1}; + bool ival = src.depth() < CV_32F; + int cn = src.channels(), vecSize = 4, depth = src.depth(); + std::vector thresholdValue = scalarToVector(cv::Scalar::all(ival ? 
cvFloor(thresh) : thresh), dst.depth(), + dst.oclchannels(), dst.channels()); + std::vector maxValue = scalarToVector(cv::Scalar::all(maxVal), dst.depth(), dst.oclchannels(), dst.channels()); + + const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC", + "THRESH_TOZERO", "THRESH_TOZERO_INV" }; + const char * const channelMap[] = { "", "", "2", "4", "4" }; + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + std::string buildOptions = format("-D T=%s%s -D %s", typeMap[depth], channelMap[cn], thresholdMap[thresholdType]); + + int elemSize = src.elemSize(); + int src_step = src.step / elemSize, src_offset = src.offset / elemSize; + int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize; std::vector< std::pair > args; - args.push_back( std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back( std::make_pair(sizeof(cl_mem), &dst.data)); + args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_offset)); args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_step)); + args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_offset)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_step)); - args.push_back( std::make_pair(sizeof(cl_float), (void *)&thresh_f)); - args.push_back( std::make_pair(sizeof(cl_float), (void *)&max_val)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&type)); + args.push_back( std::make_pair(thresholdValue.size(), (void *)&thresholdValue[0])); + args.push_back( std::make_pair(maxValue.size(), (void *)&maxValue[0])); - openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, src.oclchannels(), 
src.depth()); + int max_index = dst.cols, cols = dst.cols; + if (cn == 1 && vecSize > 1) + { + CV_Assert(((vecSize - 1) & vecSize) == 0 && vecSize <= 16); + cols = divUp(cols, vecSize); + buildOptions += format(" -D VECTORIZED -D VT=%s%d -D VLOADN=vload%d -D VECSIZE=%d -D VSTOREN=vstore%d", + typeMap[depth], vecSize, vecSize, vecSize, vecSize); + + int vecSizeBytes = vecSize * dst.elemSize1(); + if ((dst.offset % dst.step) % vecSizeBytes == 0 && dst.step % vecSizeBytes == 0) + buildOptions += " -D DST_ALIGNED"; + if ((src.offset % src.step) % vecSizeBytes == 0 && src.step % vecSizeBytes == 0) + buildOptions += " -D SRC_ALIGNED"; + + args.push_back( std::make_pair(sizeof(cl_int), (void *)&max_index)); + } + + args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows)); + args.push_back( std::make_pair(sizeof(cl_int), (void *)&cols)); + + size_t localThreads[3] = { 16, 16, 1 }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; + + openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, + -1, -1, buildOptions.c_str()); } - // threshold: support 8UC1 and 32FC1 data type and five threshold type - double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type) + double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType) { - //TODO: These limitations shall be removed later. 
- CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1); - CV_Assert(type == THRESH_BINARY || type == THRESH_BINARY_INV || type == THRESH_TRUNC - || type == THRESH_TOZERO || type == THRESH_TOZERO_INV ); - - static const gpuThresh_t gpuThresh_callers[2] = {threshold_8u, threshold_32f}; + CV_Assert(thresholdType == THRESH_BINARY || thresholdType == THRESH_BINARY_INV || thresholdType == THRESH_TRUNC + || thresholdType == THRESH_TOZERO || thresholdType == THRESH_TOZERO_INV); - dst.create( src.size(), src.type() ); - gpuThresh_callers[(src.type() == CV_32FC1)](src, dst, thresh, maxVal, type); + dst.create(src.size(), src.type()); + threshold_runner(src, dst, thresh, maxVal, thresholdType); return thresh; } @@ -891,8 +897,60 @@ namespace cv if (ksize > 0) { - Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); - Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); + Context* clCxt = Context::getContext(); + if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 && + src.cols % 8 == 0 && src.rows % 8 == 0 && + ksize==3 && + (borderType ==cv::BORDER_REFLECT || + borderType == cv::BORDER_REPLICATE || + borderType ==cv::BORDER_REFLECT101 || + borderType ==cv::BORDER_WRAP)) + { + Dx.create(src.size(), CV_32FC1); + Dy.create(src.size(), CV_32FC1); + + const unsigned int block_x = 8; + const unsigned int block_y = 8; + + unsigned int src_pitch = src.step; + unsigned int dst_pitch = Dx.cols; + + float _scale = scale; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch 
)); + args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale )); + size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1}; + + String option = "-D BLK_X=8 -D BLK_Y=8"; + switch(borderType) + { + case cv::BORDER_REPLICATE: + option = option + " -D BORDER_REPLICATE"; + break; + case cv::BORDER_REFLECT: + option = option + " -D BORDER_REFLECT"; + break; + case cv::BORDER_REFLECT101: + option = option + " -D BORDER_REFLECT101"; + break; + case cv::BORDER_WRAP: + option = option + " -D BORDER_WRAP"; + break; + } + openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() ); + } + else + { + Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); + Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); + } } else { @@ -954,6 +1012,7 @@ namespace cv args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.step)); args.push_back( std::make_pair( sizeof(cl_float) , (void *)&k)); + openCLExecuteKernel(dst.clCxt, source, kernelName, gt, lt, args, -1, -1, buildOptions.c_str()); } @@ -969,15 +1028,15 @@ namespace cv { if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(Error::OpenCLDoubleNotSupported, "Select device doesn't support double"); + CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double"); return; } - CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2); CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT); + extractCovData(src, dx, dy, blockSize, ksize, borderType); - dst.create(src.size(), CV_32F); + dst.create(src.size(), CV_32FC1); corner_ocl(&imgproc_calcHarris, "calcHarris", blockSize, static_cast(k), dx, dy, dst, borderType); } @@ -991,12 +1050,13 @@ namespace cv { if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == 
CV_64F) { - CV_Error(Error::OpenCLDoubleNotSupported, "select device don't support double"); + CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double"); return; } - CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2); - CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT); + CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || + borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT); + extractCovData(src, dx, dy, blockSize, ksize, borderType); dst.create(src.size(), CV_32F); diff --git a/modules/ocl/src/kmeans.cpp b/modules/ocl/src/kmeans.cpp index 5486aa495a..52fe0eb6ff 100644 --- a/modules/ocl/src/kmeans.cpp +++ b/modules/ocl/src/kmeans.cpp @@ -160,32 +160,61 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers, } } -void cv::ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat ¢ers) +void cv::ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat ¢ers, int distType, const oclMat &indices) { - //if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) - //{ - // CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double"); - // return; - //} - - Context *clCxt = src.clCxt; - int labels_step = (int)(labels.step/labels.elemSize()); + CV_Assert(src.cols*src.oclchannels() == centers.cols*centers.oclchannels()); + CV_Assert(src.depth() == CV_32F && centers.depth() == CV_32F); + bool is_label_row_major = false; + ensureSizeIsEnough(1, src.rows, CV_32FC1, dists); + if(labels.empty() || (!labels.empty() && labels.rows == src.rows && labels.cols == 1)) + { + ensureSizeIsEnough(src.rows, 1, CV_32SC1, labels); + is_label_row_major = true; + } + CV_Assert(distType == NORM_L1 || distType == NORM_L2SQR); + + std::stringstream build_opt_ss; + build_opt_ss + << (distType == 
NORM_L1 ? "-D L1_DIST" : "-D L2SQR_DIST") + << (indices.empty() ? "" : " -D USE_INDEX"); + + String build_opt = build_opt_ss.str(); + + const int src_step = (int)(src.oclchannels() * src.step / src.elemSize()); + const int centers_step = (int)(centers.oclchannels() * centers.step / centers.elemSize()); + + const int colsNumb = centers.cols*centers.oclchannels(); + + const int label_step = is_label_row_major ? (int)(labels.step / labels.elemSize()) : 1; String kernelname = "distanceToCenters"; - int threadNum = src.rows > 256 ? 256 : src.rows; - size_t localThreads[3] = {1, threadNum, 1}; - size_t globalThreads[3] = {1, src.rows, 1}; + + const int number_of_input = indices.empty() ? src.rows : indices.size().area(); + + const int src_offset = (int)src.offset/src.elemSize(); + const int centers_offset = (int)centers.offset/centers.elemSize(); + + size_t globalThreads[3] = {number_of_input, 1, 1}; std::vector > args; - args.push_back(std::make_pair(sizeof(cl_int), (void *)&labels_step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)¢ers.rows)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), (void *)&labels.data)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)¢ers.cols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)¢ers.data)); - args.push_back(std::make_pair(sizeof(cl_mem), (void*)&dists.data)); + if(!indices.empty()) + { + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&indices.data)); + } + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&labels.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dists.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&colsNumb)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_step)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)¢ers_step)); + args.push_back(std::make_pair(sizeof(cl_int), (void 
*)&label_step)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&number_of_input)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)¢ers.rows)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)¢ers_offset)); - openCLExecuteKernel(clCxt, &kmeans_kernel, kernelname, globalThreads, localThreads, args, -1, -1, NULL); + openCLExecuteKernel(Context::getContext(), &kmeans_kernel, + kernelname, globalThreads, NULL, args, -1, -1, build_opt.c_str()); } ///////////////////////////////////k - means ///////////////////////////////////////////////////////// double cv::ocl::kmeans(const oclMat &_src, int K, oclMat &_bestLabels, @@ -404,17 +433,17 @@ double cv::ocl::kmeans(const oclMat &_src, int K, oclMat &_bestLabels, _bestLabels.upload(_labels); _centers.upload(centers); + distanceToCenters(_dists, _bestLabels, _src, _centers); Mat dists; _dists.download(dists); _bestLabels.download(_labels); - - double* dist = dists.ptr(0); + float* dist = dists.ptr(0); compactness = 0; for( i = 0; i < N; i++ ) { - compactness += dist[i]; + compactness += (double)dist[i]; } } diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp index 6372364dd4..0ba6e8ce05 100644 --- a/modules/ocl/src/moments.cpp +++ b/modules/ocl/src/moments.cpp @@ -10,12 +10,12 @@ // License Agreement // For Open Source Computer Vision Library // -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // @Authors +// Jin Ma, jin@multicorewareinc.com // Sen Liu, swjtuls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, @@ -26,7 +26,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. +// and/or other Materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -50,295 +50,342 @@ #include "opencl_kernels.hpp" +#if defined _MSC_VER +#define snprintf sprintf_s +#endif namespace cv { -namespace ocl -{ -// The function calculates center of gravity and the central second order moments -static void icvCompleteMomentState( CvMoments* moments ) -{ - double cx = 0, cy = 0; - double mu20, mu11, mu02; - - assert( moments != 0 ); - moments->inv_sqrt_m00 = 0; - - if( fabs(moments->m00) > DBL_EPSILON ) - { - double inv_m00 = 1. 
/ moments->m00; - cx = moments->m10 * inv_m00; - cy = moments->m01 * inv_m00; - moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) ); - } - - // mu20 = m20 - m10*cx - mu20 = moments->m20 - moments->m10 * cx; - // mu11 = m11 - m10*cy - mu11 = moments->m11 - moments->m10 * cy; - // mu02 = m02 - m01*cy - mu02 = moments->m02 - moments->m01 * cy; - - moments->mu20 = mu20; - moments->mu11 = mu11; - moments->mu02 = mu02; - - // mu30 = m30 - cx*(3*mu20 + cx*m10) - moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10); - mu11 += mu11; - // mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20 - moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20; - // mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02 - moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02; - // mu03 = m03 - cy*(3*mu02 + cy*m01) - moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01); -} - - -static void icvContourMoments( CvSeq* contour, CvMoments* mom ) -{ - if( contour->total ) + namespace ocl { - CvSeqReader reader; - int lpt = contour->total; - double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03; - - cvStartReadSeq( contour, &reader, 0 ); + // The function calculates center of gravity and the central second order moments + static void icvCompleteMomentState( CvMoments* moments ) + { + double cx = 0, cy = 0; + double mu20, mu11, mu02; - size_t reader_size = lpt << 1; - cv::Mat reader_mat(1,reader_size,CV_32FC1); + assert( moments != 0 ); + moments->inv_sqrt_m00 = 0; - bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; + if( fabs(moments->m00) > DBL_EPSILON ) + { + double inv_m00 = 1. 
/ moments->m00; + cx = moments->m10 * inv_m00; + cy = moments->m01 * inv_m00; + moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) ); + } - if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float) - { - CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); + // mu20 = m20 - m10*cx + mu20 = moments->m20 - moments->m10 * cx; + // mu11 = m11 - m10*cy + mu11 = moments->m11 - moments->m10 * cy; + // mu02 = m02 - m01*cy + mu02 = moments->m02 - moments->m01 * cy; + + moments->mu20 = mu20; + moments->mu11 = mu11; + moments->mu02 = mu02; + + // mu30 = m30 - cx*(3*mu20 + cx*m10) + moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10); + mu11 += mu11; + // mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20 + moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20; + // mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02 + moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02; + // mu03 = m03 - cy*(3*mu02 + cy*m01) + moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01); } - if( is_float ) + + static void icvContourMoments( CvSeq* contour, CvMoments* mom ) { - for(size_t i = 0; i < reader_size; ++i) + if( contour->total ) { - reader_mat.at(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x; - reader_mat.at(0, i) = ((CvPoint2D32f*)(reader.ptr))->y; - CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + CvSeqReader reader; + int lpt = contour->total; + double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03; + + cvStartReadSeq( contour, &reader, 0 ); + + size_t reader_size = lpt << 1; + cv::Mat reader_mat(1,reader_size,CV_32FC1); + + bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; + + if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float) + { + CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); + } + + if( is_float ) + { + for(size_t i = 0; i < reader_size; ++i) + { + reader_mat.at(0, i++) = 
((CvPoint2D32f*)(reader.ptr))->x; + reader_mat.at(0, i) = ((CvPoint2D32f*)(reader.ptr))->y; + CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + } + } + else + { + for(size_t i = 0; i < reader_size; ++i) + { + reader_mat.at(0, i++) = ((CvPoint*)(reader.ptr))->x; + reader_mat.at(0, i) = ((CvPoint*)(reader.ptr))->y; + CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + } + } + + cv::ocl::oclMat dst_a(10, lpt, CV_64FC1); + cv::ocl::oclMat reader_oclmat(reader_mat); + int llength = std::min(lpt,128); + size_t localThreads[3] = { llength, 1, 1}; + size_t globalThreads[3] = { lpt, 1, 1}; + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a.data )); + cl_int dst_step = (cl_int)dst_a.step; + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step )); + + char builOption[128]; + snprintf(builOption, 128, "-D CV_8UC1"); + + openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1, builOption); + + cv::Mat dst(dst_a); + a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; + if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) + { + for (int i = 0; i < contour->total; ++i) + { + a00 += dst.at(0, i); + a10 += dst.at(1, i); + a01 += dst.at(2, i); + a20 += dst.at(3, i); + a11 += dst.at(4, i); + a02 += dst.at(5, i); + a30 += dst.at(6, i); + a21 += dst.at(7, i); + a12 += dst.at(8, i); + a03 += dst.at(9, i); + } + } + else + { + a00 = cv::sum(dst.row(0))[0]; + a10 = cv::sum(dst.row(1))[0]; + a01 = cv::sum(dst.row(2))[0]; + a20 = cv::sum(dst.row(3))[0]; + a11 = cv::sum(dst.row(4))[0]; + a02 = cv::sum(dst.row(5))[0]; + a30 = cv::sum(dst.row(6))[0]; + a21 = cv::sum(dst.row(7))[0]; + a12 = cv::sum(dst.row(8))[0]; + a03 = cv::sum(dst.row(9))[0]; + } + + double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60; + 
if( fabs(a00) > FLT_EPSILON ) + { + if( a00 > 0 ) + { + db1_2 = 0.5; + db1_6 = 0.16666666666666666666666666666667; + db1_12 = 0.083333333333333333333333333333333; + db1_24 = 0.041666666666666666666666666666667; + db1_20 = 0.05; + db1_60 = 0.016666666666666666666666666666667; + } + else + { + db1_2 = -0.5; + db1_6 = -0.16666666666666666666666666666667; + db1_12 = -0.083333333333333333333333333333333; + db1_24 = -0.041666666666666666666666666666667; + db1_20 = -0.05; + db1_60 = -0.016666666666666666666666666666667; + } + + // spatial moments + mom->m00 = a00 * db1_2; + mom->m10 = a10 * db1_6; + mom->m01 = a01 * db1_6; + mom->m20 = a20 * db1_12; + mom->m11 = a11 * db1_24; + mom->m02 = a02 * db1_12; + mom->m30 = a30 * db1_20; + mom->m21 = a21 * db1_60; + mom->m12 = a12 * db1_60; + mom->m03 = a03 * db1_20; + + icvCompleteMomentState( mom ); + } } } - else + + Moments ocl_moments(oclMat& src, bool binary) //for image { - for(size_t i = 0; i < reader_size; ++i) + CV_Assert(src.oclchannels() == 1); + if(src.type() == CV_64FC1 && !Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) { - reader_mat.at(0, i++) = ((CvPoint*)(reader.ptr))->x; - reader_mat.at(0, i) = ((CvPoint*)(reader.ptr))->y; - CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); } - } - cv::ocl::oclMat dst_a(10, lpt, CV_64FC1); - cv::ocl::oclMat reader_oclmat(reader_mat); - int llength = std::min(lpt,128); - size_t localThreads[3] = { llength, 1, 1}; - size_t globalThreads[3] = { lpt, 1, 1}; - std::vector > args; - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a.data )); - cl_int dst_step = (cl_int)dst_a.step; - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step )); - - openCLExecuteKernel2(dst_a.clCxt, &moments, 
"icvContourMoments", globalThreads, localThreads, args, -1, -1); - - cv::Mat dst(dst_a); - a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; - if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) - { - for (int i = 0; i < contour->total; ++i) + if(binary) { - a00 += dst.at(0, i); - a10 += dst.at(1, i); - a01 += dst.at(2, i); - a20 += dst.at(3, i); - a11 += dst.at(4, i); - a02 += dst.at(5, i); - a30 += dst.at(6, i); - a21 += dst.at(7, i); - a12 += dst.at(8, i); - a03 += dst.at(9, i); + oclMat mask; + if(src.type() != CV_8UC1) + { + src.convertTo(mask, CV_8UC1); + } + oclMat src8u(src.size(), CV_8UC1); + src8u.setTo(Scalar(255), mask); + src = src8u; } - } - else - { - a00 = cv::sum(dst.row(0))[0]; - a10 = cv::sum(dst.row(1))[0]; - a01 = cv::sum(dst.row(2))[0]; - a20 = cv::sum(dst.row(3))[0]; - a11 = cv::sum(dst.row(4))[0]; - a02 = cv::sum(dst.row(5))[0]; - a30 = cv::sum(dst.row(6))[0]; - a21 = cv::sum(dst.row(7))[0]; - a12 = cv::sum(dst.row(8))[0]; - a03 = cv::sum(dst.row(9))[0]; - } + const int TILE_SIZE = 256; - double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60; - if( fabs(a00) > FLT_EPSILON ) - { - if( a00 > 0 ) + CvMoments mom; + memset(&mom, 0, sizeof(mom)); + + cv::Size size = src.size(); + int blockx, blocky; + blockx = (size.width + TILE_SIZE - 1)/TILE_SIZE; + blocky = (size.height + TILE_SIZE - 1)/TILE_SIZE; + + oclMat dst_m; + int tile_height = TILE_SIZE; + + size_t localThreads[3] = {1, tile_height, 1}; + size_t globalThreads[3] = {blockx, size.height, 1}; + + if(Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) { - db1_2 = 0.5; - db1_6 = 0.16666666666666666666666666666667; - db1_12 = 0.083333333333333333333333333333333; - db1_24 = 0.041666666666666666666666666666667; - db1_20 = 0.05; - db1_60 = 0.016666666666666666666666666666667; + dst_m.create(blocky * 10, blockx, CV_64FC1); + }else + { + dst_m.create(blocky * 10, blockx, CV_32FC1); } + + int src_step = (int)(src.step/src.elemSize()); + int dstm_step = 
(int)(dst_m.step/dst_m.elemSize()); + + std::vector > args,args_sum; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstm_step )); + + int binary_; + if(binary) + binary_ = 1; else + binary_ = 0; + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary_)); + + char builOption[128]; + if(binary || src.type() == CV_8UC1) + { + snprintf(builOption, 128, "-D CV_8UC1"); + }else if(src.type() == CV_16UC1) { - db1_2 = -0.5; - db1_6 = -0.16666666666666666666666666666667; - db1_12 = -0.083333333333333333333333333333333; - db1_24 = -0.041666666666666666666666666666667; - db1_20 = -0.05; - db1_60 = -0.016666666666666666666666666666667; + snprintf(builOption, 128, "-D CV_16UC1"); + }else if(src.type() == CV_16SC1) + { + snprintf(builOption, 128, "-D CV_16SC1"); + }else if(src.type() == CV_32FC1) + { + snprintf(builOption, 128, "-D CV_32FC1"); + }else if(src.type() == CV_64FC1) + { + snprintf(builOption, 128, "-D CV_64FC1"); + }else + { + CV_Error( CV_StsUnsupportedFormat, "" ); } - // spatial moments - mom->m00 = a00 * db1_2; - mom->m10 = a10 * db1_6; - mom->m01 = a01 * db1_6; - mom->m20 = a20 * db1_12; - mom->m11 = a11 * db1_24; - mom->m02 = a02 * db1_12; - mom->m30 = a30 * db1_20; - mom->m21 = a21 * db1_60; - mom->m12 = a12 * db1_60; - mom->m03 = a03 * db1_20; - - icvCompleteMomentState( mom ); - } - } -} - -static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) -{ - const int TILE_SIZE = 256; - int type, depth, cn, coi = 0; - CvMat stub, *mat = (CvMat*)array; - CvContour contourHeader; - CvSeq* 
contour = 0; - CvSeqBlock block; - if( CV_IS_SEQ( array )) - { - contour = (CvSeq*)array; - if( !CV_IS_SEQ_POINT_SET( contour )) - CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" ); - } + openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, -1, builOption); - if( !mom ) - CV_Error( CV_StsNullPtr, "" ); + Mat tmp(dst_m); + tmp.convertTo(tmp, CV_64FC1); - memset( mom, 0, sizeof(*mom)); + double tmp_m[10] = {0}; - if( !contour ) - { + for(int j = 0; j < tmp.rows; j += 10) + { + for(int i = 0; i < tmp.cols; i++) + { + tmp_m[0] += tmp.at(j, i); + tmp_m[1] += tmp.at(j + 1, i); + tmp_m[2] += tmp.at(j + 2, i); + tmp_m[3] += tmp.at(j + 3, i); + tmp_m[4] += tmp.at(j + 4, i); + tmp_m[5] += tmp.at(j + 5, i); + tmp_m[6] += tmp.at(j + 6, i); + tmp_m[7] += tmp.at(j + 7, i); + tmp_m[8] += tmp.at(j + 8, i); + tmp_m[9] += tmp.at(j + 9, i); + } + } - mat = cvGetMat( mat, &stub, &coi ); - type = CV_MAT_TYPE( mat->type ); + mom.m00 = tmp_m[0]; + mom.m10 = tmp_m[1]; + mom.m01 = tmp_m[2]; + mom.m20 = tmp_m[3]; + mom.m11 = tmp_m[4]; + mom.m02 = tmp_m[5]; + mom.m30 = tmp_m[6]; + mom.m21 = tmp_m[7]; + mom.m12 = tmp_m[8]; + mom.m03 = tmp_m[9]; + icvCompleteMomentState( &mom ); + return mom; + } - if( type == CV_32SC2 || type == CV_32FC2 ) + Moments ocl_moments(InputArray _contour) //for contour { - contour = cvPointSeqFromMat( - CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED, - mat, &contourHeader, &block ); - } - } - if( contour ) - { - icvContourMoments( contour, mom ); - return; - } + CvMoments mom; + memset(&mom, 0, sizeof(mom)); - type = CV_MAT_TYPE( mat->type ); - depth = CV_MAT_DEPTH( type ); - cn = CV_MAT_CN( type ); - - cv::Size size = cvGetMatSize( mat ); - if( cn > 1 && coi == 0 ) - CV_Error( CV_StsBadArg, "Invalid image type" ); - - if( size.width <= 0 || size.height <= 0 ) - return; - - cv::Mat src0 = cv::cvarrToMat(mat); - cv::ocl::oclMat src(src0); - cv::Size tileSize; - int blockx,blocky; - 
if(size.width%TILE_SIZE == 0) - blockx = size.width/TILE_SIZE; - else - blockx = size.width/TILE_SIZE + 1; - if(size.height%TILE_SIZE == 0) - blocky = size.height/TILE_SIZE; - else - blocky = size.height/TILE_SIZE + 1; - oclMat dst_m(blocky * 10, blockx, CV_64FC1); - oclMat sum(1, 10, CV_64FC1); - int tile_width = std::min(size.width,TILE_SIZE); - int tile_height = std::min(size.height,TILE_SIZE); - size_t localThreads[3] = { tile_height, 1, 1}; - size_t globalThreads[3] = { size.height, blockx, 1}; - std::vector > args,args_sum; - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.cols )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.step )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&blocky )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&depth )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cn )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&coi )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); - openCLExecuteKernel2(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth); - - size_t localThreadss[3] = { 128, 1, 1}; - size_t globalThreadss[3] = { 128, 1, 1}; - args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); - args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); - args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_height )); - args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_width )); - 
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); - args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum.data )); - args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); - args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.step )); - openCLExecuteKernel2(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1); - - Mat dstsum(sum); - mom->m00 = dstsum.at(0, 0); - mom->m10 = dstsum.at(0, 1); - mom->m01 = dstsum.at(0, 2); - mom->m20 = dstsum.at(0, 3); - mom->m11 = dstsum.at(0, 4); - mom->m02 = dstsum.at(0, 5); - mom->m30 = dstsum.at(0, 6); - mom->m21 = dstsum.at(0, 7); - mom->m12 = dstsum.at(0, 8); - mom->m03 = dstsum.at(0, 9); - - icvCompleteMomentState( mom ); -} + Mat arr = _contour.getMat(); + CvMat c_array = arr; + const void* array = &c_array; -Moments ocl_moments( InputArray _array, bool binaryImage ) -{ - CvMoments om; - Mat arr = _array.getMat(); - CvMat c_array = arr; - ocl_cvMoments(&c_array, &om, binaryImage); - return om; -} + CvSeq* contour = 0; + if( CV_IS_SEQ( array )) + { + contour = (CvSeq*)(array); + if( !CV_IS_SEQ_POINT_SET( contour )) + CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" ); + } -} + int type, coi = 0; + + CvMat stub, *mat = (CvMat*)(array); + CvContour contourHeader; + CvSeqBlock block; + + if( !contour ) + { + mat = cvGetMat( mat, &stub, &coi ); + type = CV_MAT_TYPE( mat->type ); + + if( type == CV_32SC2 || type == CV_32FC2 ) + { + contour = cvPointSeqFromMat( + CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED, + mat, &contourHeader, &block ); + } + } + + CV_Assert(contour); + icvContourMoments(contour, &mom); + return mom; + } + } } diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl index a1876b57d0..03f46ccc0b 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl +++ 
b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --git a/modules/ocl/src/opencl/arithm_bitwise_not.cl b/modules/ocl/src/opencl/arithm_bitwise_not.cl index e5b46c9368..5bc1839d6a 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_not.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_not.cl @@ -67,7 +67,6 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr x = x << 2; int src1_index = mad24(y, src1_step, x + src1_offset); - int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x); @@ -97,7 +96,6 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src x = x << 2; int src1_index = mad24(y, src1_step, x + src1_offset); - int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x); diff --git a/modules/ocl/src/opencl/arithm_cartToPolar.cl b/modules/ocl/src/opencl/arithm_cartToPolar.cl index 6c779ead90..e37818c40f 100644 --- a/modules/ocl/src/opencl/arithm_cartToPolar.cl +++ b/modules/ocl/src/opencl/arithm_cartToPolar.cl @@ -44,14 +44,18 @@ //M*/ #if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable + #pragma OPENCL EXTENSION cl_khr_fp64:enable + #define CV_PI 3.1415926535897932384626433832795 + #ifndef DBL_EPSILON + #define DBL_EPSILON 0x1.0p-52 + #endif +#else + #define CV_PI 3.1415926535897932384626433832795f + #ifndef DBL_EPSILON 
+ #define DBL_EPSILON 0x1.0p-52f + #endif #endif -#define CV_PI 3.1415926535897932384626433832795 - -#ifndef DBL_EPSILON -#define DBL_EPSILON 0x1.0p-52 -#endif __kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset, __global float *src2, int src2_step, int src2_offset, @@ -82,9 +86,9 @@ __kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int sr float tmp = y >= 0 ? 0 : CV_PI*2; tmp = x < 0 ? CV_PI : tmp; - float tmp1 = y >= 0 ? CV_PI*0.5 : CV_PI*1.5; - cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28f*y2 + (float)DBL_EPSILON) + tmp : - tmp1 - x*y/(y2 + 0.28f*x2 + (float)DBL_EPSILON); + float tmp1 = y >= 0 ? CV_PI*0.5f : CV_PI*1.5f; + cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28f*y2 + DBL_EPSILON) + tmp : + tmp1 - x*y/(y2 + 0.28f*x2 + DBL_EPSILON); cartToPolar = angInDegree == 0 ? cartToPolar : cartToPolar * (float)(180/CV_PI); diff --git a/modules/ocl/src/opencl/arithm_flip.cl b/modules/ocl/src/opencl/arithm_flip.cl index 7c2a04d74f..416240bd85 100644 --- a/modules/ocl/src/opencl/arithm_flip.cl +++ b/modules/ocl/src/opencl/arithm_flip.cl @@ -51,969 +51,75 @@ #endif #endif -////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////flip rows/////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src_index_0 = mad24(y, src_step, x + src_offset - dst_align); - int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align); - - int dst_start_0 
= mad24(y, dst_step, dst_offset); - int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset); - int dst_end_0 = mad24(y, dst_step, dst_offset + dst_step1); - int dst_end_1 = mad24(rows - y - 1, dst_step, dst_offset + dst_step1); - int dst_index_0 = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int dst_index_1 = mad24(rows - y - 1, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src_index_0 < 0 ? 0 : src_index_0; - int src2_index_fix = src_index_1 < 0 ? 0 : src_index_1; - uchar4 src_data_0 = vload4(0, src + src1_index_fix); - uchar4 src_data_1 = vload4(0, src + src2_index_fix); - if(src_index_0 < 0) - { - uchar4 tmp; - tmp.xyzw = (src_index_0 == -2) ? src_data_0.zwxy:src_data_0.yzwx; - src_data_0.xyzw = (src_index_0 == -1) ? src_data_0.wxyz:tmp.xyzw; - } - if(src_index_1 < 0) - { - uchar4 tmp; - tmp.xyzw = (src_index_1 == -2) ? src_data_1.zwxy:src_data_1.yzwx; - src_data_1.xyzw = (src_index_1 == -1) ? src_data_1.wxyz:tmp.xyzw; - } - - uchar4 dst_data_0 = *((__global uchar4 *)(dst + dst_index_0)); - uchar4 dst_data_1 = *((__global uchar4 *)(dst + dst_index_1)); - - dst_data_0.x = (dst_index_0 + 0 >= dst_start_0) ? src_data_1.x : dst_data_0.x; - dst_data_0.y = ((dst_index_0 + 1 >= dst_start_0) && (dst_index_0 + 1 < dst_end_0)) ? src_data_1.y : dst_data_0.y; - dst_data_0.z = ((dst_index_0 + 2 >= dst_start_0) && (dst_index_0 + 2 < dst_end_0)) ? src_data_1.z : dst_data_0.z; - dst_data_0.w = (dst_index_0 + 3 < dst_end_0) ? src_data_1.w : dst_data_0.w; - - dst_data_1.x = (dst_index_1 + 0 >= dst_start_1) ? src_data_0.x : dst_data_1.x; - dst_data_1.y = ((dst_index_1 + 1 >= dst_start_1) && (dst_index_1 + 1 < dst_end_1)) ? src_data_0.y : dst_data_1.y; - dst_data_1.z = ((dst_index_1 + 2 >= dst_start_1) && (dst_index_1 + 2 < dst_end_1)) ? src_data_0.z : dst_data_1.z; - dst_data_1.w = (dst_index_1 + 3 < dst_end_1) ? 
src_data_0.w : dst_data_1.w; - - *((__global uchar4 *)(dst + dst_index_0)) = dst_data_0; - *((__global uchar4 *)(dst + dst_index_1)) = dst_data_1; - } -} -__kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src_index_0 = mad24(y, src_step, x + src_offset - dst_align); - int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align); - - int dst_start_0 = mad24(y, dst_step, dst_offset); - int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset); - int dst_end_0 = mad24(y, dst_step, dst_offset + dst_step1); - int dst_end_1 = mad24(rows - y - 1, dst_step, dst_offset + dst_step1); - int dst_index_0 = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int dst_index_1 = mad24(rows - y - 1, dst_step, dst_offset + x & (int)0xfffffffc); - - char4 src_data_0 = vload4(0, src + src_index_0); - char4 src_data_1 = vload4(0, src + src_index_1); - - char4 dst_data_0 = *((__global char4 *)(dst + dst_index_0)); - char4 dst_data_1 = *((__global char4 *)(dst + dst_index_1)); - - dst_data_0.x = (dst_index_0 + 0 >= dst_start_0) ? src_data_1.x : dst_data_0.x; - dst_data_0.y = ((dst_index_0 + 1 >= dst_start_0) && (dst_index_0 + 1 < dst_end_0)) ? src_data_1.y : dst_data_0.y; - dst_data_0.z = ((dst_index_0 + 2 >= dst_start_0) && (dst_index_0 + 2 < dst_end_0)) ? src_data_1.z : dst_data_0.z; - dst_data_0.w = (dst_index_0 + 3 < dst_end_0) ? src_data_1.w : dst_data_0.w; - - dst_data_1.x = (dst_index_1 + 0 >= dst_start_1) ? src_data_0.x : dst_data_1.x; - dst_data_1.y = ((dst_index_1 + 1 >= dst_start_1) && (dst_index_1 + 1 < dst_end_1)) ? 
src_data_0.y : dst_data_1.y; - dst_data_1.z = ((dst_index_1 + 2 >= dst_start_1) && (dst_index_1 + 2 < dst_end_1)) ? src_data_0.z : dst_data_1.z; - dst_data_1.w = (dst_index_1 + 3 < dst_end_1) ? src_data_0.w : dst_data_1.w; - - *((__global char4 *)(dst + dst_index_0)) = dst_data_0; - *((__global char4 *)(dst + dst_index_1)) = dst_data_1; - } -} -__kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset >> 1) & 3) << 1) - int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align); - int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align); - - int dst_start_0 = mad24(y, dst_step, dst_offset); - int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset); - int dst_end_0 = mad24(y, dst_step, dst_offset + dst_step1); - int dst_end_1 = mad24(rows - y - 1, dst_step, dst_offset + dst_step1); - int dst_index_0 = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - int dst_index_1 = mad24(rows - y - 1, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)src + src_index_0)); - ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)src + src_index_1)); - - ushort4 dst_data_0 = *((__global ushort4 *)((__global char *)dst + dst_index_0)); - ushort4 dst_data_1 = *((__global ushort4 *)((__global char *)dst + dst_index_1)); - - dst_data_0.x = (dst_index_0 + 0 >= dst_start_0) ? src_data_1.x : dst_data_0.x; - dst_data_0.y = ((dst_index_0 + 2 >= dst_start_0) && (dst_index_0 + 2 < dst_end_0)) ? src_data_1.y : dst_data_0.y; - dst_data_0.z = ((dst_index_0 + 4 >= dst_start_0) && (dst_index_0 + 4 < dst_end_0)) ? 
src_data_1.z : dst_data_0.z; - dst_data_0.w = (dst_index_0 + 6 < dst_end_0) ? src_data_1.w : dst_data_0.w; - - dst_data_1.x = (dst_index_1 + 0 >= dst_start_1) ? src_data_0.x : dst_data_1.x; - dst_data_1.y = ((dst_index_1 + 2 >= dst_start_1) && (dst_index_1 + 2 < dst_end_1)) ? src_data_0.y : dst_data_1.y; - dst_data_1.z = ((dst_index_1 + 4 >= dst_start_1) && (dst_index_1 + 4 < dst_end_1)) ? src_data_0.z : dst_data_1.z; - dst_data_1.w = (dst_index_1 + 6 < dst_end_1) ? src_data_0.w : dst_data_1.w; +///////////////////////////////////////////// flip rows /////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////////// - *((__global ushort4 *)((__global char *)dst + dst_index_0)) = dst_data_0; - *((__global ushort4 *)((__global char *)dst + dst_index_1)) = dst_data_1; - } -} -__kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) +__kernel void arithm_flip_rows(__global T * src, int src_step, int src_offset, + __global T * dst, int dst_step, int dst_offset, + int rows, int cols, int thread_rows, int thread_cols) { int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < thread_rows) { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset >> 1) & 3) << 1) - int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align); - int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align); - - int dst_start_0 = mad24(y, dst_step, dst_offset); - int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset); - int dst_end_0 = mad24(y, dst_step, dst_offset + dst_step1); - int dst_end_1 = mad24(rows - y - 1, dst_step, dst_offset + dst_step1); - int dst_index_0 = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - int dst_index_1 = mad24(rows - y - 1, dst_step, 
dst_offset + (x << 1) & (int)0xfffffff8); + int src_index_0 = mad24(y, src_step, x + src_offset); + int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset); - short4 src_data_0 = vload4(0, (__global short *)((__global char *)src + src_index_0)); - short4 src_data_1 = vload4(0, (__global short *)((__global char *)src + src_index_1)); + int dst_index_0 = mad24(y, dst_step, x + dst_offset); + int dst_index_1 = mad24(rows - y - 1, dst_step, x + dst_offset); - short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index_0)); - short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index_1)); + T data0 = src[src_index_0], data1 = src[src_index_1]; - dst_data_0.x = (dst_index_0 + 0 >= dst_start_0) ? src_data_1.x : dst_data_0.x; - dst_data_0.y = ((dst_index_0 + 2 >= dst_start_0) && (dst_index_0 + 2 < dst_end_0)) ? src_data_1.y : dst_data_0.y; - dst_data_0.z = ((dst_index_0 + 4 >= dst_start_0) && (dst_index_0 + 4 < dst_end_0)) ? src_data_1.z : dst_data_0.z; - dst_data_0.w = (dst_index_0 + 6 < dst_end_0) ? src_data_1.w : dst_data_0.w; - - dst_data_1.x = (dst_index_1 + 0 >= dst_start_1) ? src_data_0.x : dst_data_1.x; - dst_data_1.y = ((dst_index_1 + 2 >= dst_start_1) && (dst_index_1 + 2 < dst_end_1)) ? src_data_0.y : dst_data_1.y; - dst_data_1.z = ((dst_index_1 + 4 >= dst_start_1) && (dst_index_1 + 4 < dst_end_1)) ? src_data_0.z : dst_data_1.z; - dst_data_1.w = (dst_index_1 + 6 < dst_end_1) ? 
src_data_0.w : dst_data_1.w; - - *((__global short4 *)((__global char *)dst + dst_index_0)) = dst_data_0; - *((__global short4 *)((__global char *)dst + dst_index_1)) = dst_data_1; + dst[dst_index_0] = data1; + dst[dst_index_1] = data0; } } -__kernel void arithm_flip_rows_D4 (__global int *src, int src_step, int src_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, (x << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 2) + dst_offset); - - int data0 = *((__global int *)((__global char *)src + src_index_0)); - int data1 = *((__global int *)((__global char *)src + src_index_1)); - - *((__global int *)((__global char *)dst + dst_index_0)) = data1; - *((__global int *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rows_D5 (__global float *src, int src_step, int src_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) +__kernel void arithm_flip_rows_cols(__global T * src, int src_step, int src_offset, + __global T * dst, int dst_step, int dst_offset, + int rows, int cols, int thread_rows, int thread_cols) { int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < thread_rows) { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, (x << 2) + src_offset); + int src_index_0 = mad24(y, src_step, x + src_offset); + int dst_index_0 = mad24(rows - y - 1, dst_step, cols - x - 1 + dst_offset); - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 2) + dst_offset); + int src_index_1 = 
mad24(rows - y - 1, src_step, cols - x - 1 + src_offset); + int dst_index_1 = mad24(y, dst_step, x + dst_offset); - float data0 = *((__global float *)((__global char *)src + src_index_0)); - float data1 = *((__global float *)((__global char *)src + src_index_1)); + T data0 = src[src_index_0], data1 = src[src_index_1]; - *((__global float *)((__global char *)dst + dst_index_0)) = data1; - *((__global float *)((__global char *)dst + dst_index_1)) = data0; + dst[dst_index_0] = data0; + dst[dst_index_1] = data1; } } -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_flip_rows_D6 (__global double *src, int src_step, int src_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 3) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, (x << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 3) + dst_offset); - - double data0 = *((__global double *)((__global char *)src + src_index_0)); - double data1 = *((__global double *)((__global char *)src + src_index_1)); - - *((__global double *)((__global char *)dst + dst_index_0)) = data1; - *((__global double *)((__global char *)dst + dst_index_1)) = data0; - } -} -#endif -////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////flip cols/////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); 
- - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x) + src_offset); - int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset); - uchar data0 = *(src + src_index_0); - *(dst + dst_index_1) = data0; - - int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset); - int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); - uchar data1 = *(src + src_index_1); - *(dst + dst_index_0) = data1; - } -} -__kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x) + src_offset); - int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); - int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset); - - char data0 = *(src + src_index_0); - char data1 = *(src + src_index_1); - - *(dst + dst_index_0) = data1; - *(dst + dst_index_1) = data0; - } -} -__kernel void arithm_flip_cols_C1_D2 (__global ushort *src, int src_step, int src_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 1) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset); - - ushort data0 = *((__global ushort *)((__global char *)src + src_index_0)); - ushort data1 = *((__global ushort *)((__global char *)src + src_index_1)); - - *((__global ushort *)((__global char *)dst + dst_index_0)) = data1; - *((__global ushort *)((__global char *)dst + 
dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C1_D3 (__global short *src, int src_step, int src_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 1) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset); - - short data0 = *((__global short *)((__global char *)src + src_index_0)); - short data1 = *((__global short *)((__global char *)src + src_index_1)); - - *((__global short *)((__global char *)dst + dst_index_0)) = data1; - *((__global short *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C1_D4 (__global int *src, int src_step, int src_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset); - - int data0 = *((__global int *)((__global char *)src + src_index_0)); - int data1 = *((__global int *)((__global char *)src + src_index_1)); - - *((__global int *)((__global char *)dst + dst_index_0)) = data1; - *((__global int *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C1_D5 (__global float *src, int src_step, int src_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = 
get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset); - - float data0 = *((__global float *)((__global char *)src + src_index_0)); - float data1 = *((__global float *)((__global char *)src + src_index_1)); - - *((__global float *)((__global char *)dst + dst_index_0)) = data1; - *((__global float *)((__global char *)dst + dst_index_1)) = data0; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_flip_cols_C1_D6 (__global double *src, int src_step, int src_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 3) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset); - - double data0 = *((__global double *)((__global char *)src + src_index_0)); - double data1 = *((__global double *)((__global char *)src + src_index_1)); - - *((__global double *)((__global char *)dst + dst_index_0)) = data1; - *((__global double *)((__global char *)dst + dst_index_1)) = data0; - } -} -#endif -__kernel void arithm_flip_cols_C2_D0 (__global uchar *src, int src_step, int src_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 1) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset); - - 
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset); - - uchar2 data0 = *((__global uchar2 *)((__global char *)src + src_index_0)); - uchar2 data1 = *((__global uchar2 *)((__global char *)src + src_index_1)); - - *((__global uchar2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global uchar2 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C2_D1 (__global char *src, int src_step, int src_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 1) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset); - - char2 data0 = *((__global char2 *)((__global char *)src + src_index_0)); - char2 data1 = *((__global char2 *)((__global char *)src + src_index_1)); - - *((__global char2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global char2 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C2_D2 (__global ushort *src, int src_step, int src_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset); - - ushort2 data0 = *((__global ushort2 *)((__global char *)src + src_index_0)); - ushort2 data1 = 
*((__global ushort2 *)((__global char *)src + src_index_1)); - - *((__global ushort2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global ushort2 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C2_D3 (__global short *src, int src_step, int src_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset); - - short2 data0 = *((__global short2 *)((__global char *)src + src_index_0)); - short2 data1 = *((__global short2 *)((__global char *)src + src_index_1)); - - *((__global short2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global short2 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C2_D4 (__global int *src, int src_step, int src_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 3) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset); - - int2 data0 = *((__global int2 *)((__global char *)src + src_index_0)); - int2 data1 = *((__global int2 *)((__global char *)src + src_index_1)); - - *((__global int2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global int2 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C2_D5 
(__global float *src, int src_step, int src_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 3) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset); - - float2 data0 = *((__global float2 *)((__global char *)src + src_index_0)); - float2 data1 = *((__global float2 *)((__global char *)src + src_index_1)); - - *((__global float2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global float2 *)((__global char *)dst + dst_index_1)) = data0; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_flip_cols_C2_D6 (__global double *src, int src_step, int src_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 4) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset); - - double2 data0 = *((__global double2 *)((__global char *)src + src_index_0)); - double2 data1 = *((__global double2 *)((__global char *)src + src_index_1)); - - *((__global double2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global double2 *)((__global char *)dst + dst_index_1)) = data0; - } -} -#endif - -__kernel void arithm_flip_cols_C3_D0 (__global uchar *src, int src_step, int src_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); 
- int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x) * 3 + src_offset); - int src_index_1 = mad24(y, src_step, (cols - x -1) * 3 + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x) * 3 + dst_offset); - int dst_index_1 = mad24(y, dst_step, (cols - x -1) * 3 + dst_offset); - - uchar data0_0 = *(src + src_index_0 + 0); - uchar data0_1 = *(src + src_index_0 + 1); - uchar data0_2 = *(src + src_index_0 + 2); - - uchar data1_0 = *(src + src_index_1 + 0); - uchar data1_1 = *(src + src_index_1 + 1); - uchar data1_2 = *(src + src_index_1 + 2); - - *(dst + dst_index_0 + 0 ) = data1_0; - *(dst + dst_index_0 + 1 ) = data1_1; - *(dst + dst_index_0 + 2 ) = data1_2; - - *(dst + dst_index_1 + 0) = data0_0; - *(dst + dst_index_1 + 1) = data0_1; - *(dst + dst_index_1 + 2) = data0_2; - } -} -__kernel void arithm_flip_cols_C3_D1 (__global char *src, int src_step, int src_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x) * 3 + src_offset); - int src_index_1 = mad24(y, src_step, (cols - x -1) * 3 + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x) * 3 + dst_offset); - int dst_index_1 = mad24(y, dst_step, (cols - x -1) * 3 + dst_offset); - - char data0_0 = *(src + src_index_0 + 0); - char data0_1 = *(src + src_index_0 + 1); - char data0_2 = *(src + src_index_0 + 2); - - char data1_0 = *(src + src_index_1 + 0); - char data1_1 = *(src + src_index_1 + 1); - char data1_2 = *(src + src_index_1 + 2); - - *(dst + dst_index_0 + 0 ) = data1_0; - *(dst + dst_index_0 + 1 ) = data1_1; - *(dst + dst_index_0 + 2 ) = data1_2; - - *(dst + dst_index_1 + 0) = data0_0; - *(dst + dst_index_1 + 1) = data0_1; - *(dst + dst_index_1 + 2) = data0_2; - } -} -__kernel void arithm_flip_cols_C3_D2 (__global ushort *src, int src_step, int 
src_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 1) + dst_offset); - - ushort data0_0 = *((__global ushort *)((__global char *)src + src_index_0 + 0)); - ushort data0_1 = *((__global ushort *)((__global char *)src + src_index_0 + 2)); - ushort data0_2 = *((__global ushort *)((__global char *)src + src_index_0 + 4)); - - ushort data1_0 = *((__global ushort *)((__global char *)src + src_index_1 + 0)); - ushort data1_1 = *((__global ushort *)((__global char *)src + src_index_1 + 2)); - ushort data1_2 = *((__global ushort *)((__global char *)src + src_index_1 + 4)); - - *((__global ushort *)((__global char *)dst + dst_index_0 + 0)) = data1_0; - *((__global ushort *)((__global char *)dst + dst_index_0 + 2)) = data1_1; - *((__global ushort *)((__global char *)dst + dst_index_0 + 4)) = data1_2; - - *((__global ushort *)((__global char *)dst + dst_index_1 + 0)) = data0_0; - *((__global ushort *)((__global char *)dst + dst_index_1 + 2)) = data0_1; - *((__global ushort *)((__global char *)dst + dst_index_1 + 4)) = data0_2; - } -} -__kernel void arithm_flip_cols_C3_D3 (__global short *src, int src_step, int src_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset); - 
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 1) + dst_offset); - - short data0_0 = *((__global short *)((__global char *)src + src_index_0 + 0)); - short data0_1 = *((__global short *)((__global char *)src + src_index_0 + 2)); - short data0_2 = *((__global short *)((__global char *)src + src_index_0 + 4)); - - short data1_0 = *((__global short *)((__global char *)src + src_index_1 + 0)); - short data1_1 = *((__global short *)((__global char *)src + src_index_1 + 2)); - short data1_2 = *((__global short *)((__global char *)src + src_index_1 + 4)); - - *((__global short *)((__global char *)dst + dst_index_0 + 0)) = data1_0; - *((__global short *)((__global char *)dst + dst_index_0 + 2)) = data1_1; - *((__global short *)((__global char *)dst + dst_index_0 + 4)) = data1_2; - - *((__global short *)((__global char *)dst + dst_index_1 + 0)) = data0_0; - *((__global short *)((__global char *)dst + dst_index_1 + 2)) = data0_1; - *((__global short *)((__global char *)dst + dst_index_1 + 4)) = data0_2; - } -} -__kernel void arithm_flip_cols_C3_D4 (__global int *src, int src_step, int src_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 2) + dst_offset); - - int data0_0 = *((__global int *)((__global char *)src + src_index_0 + 0)); - int data0_1 = *((__global int *)((__global char *)src + src_index_0 + 4)); - int data0_2 = *((__global int *)((__global char *)src + src_index_0 + 8)); - - int data1_0 = *((__global int *)((__global char *)src + src_index_1 + 0)); - int data1_1 = *((__global int *)((__global char *)src + src_index_1 + 
4)); - int data1_2 = *((__global int *)((__global char *)src + src_index_1 + 8)); - - *((__global int *)((__global char *)dst + dst_index_0 + 0)) = data1_0; - *((__global int *)((__global char *)dst + dst_index_0 + 4)) = data1_1; - *((__global int *)((__global char *)dst + dst_index_0 + 8)) = data1_2; - - *((__global int *)((__global char *)dst + dst_index_1 + 0)) = data0_0; - *((__global int *)((__global char *)dst + dst_index_1 + 4)) = data0_1; - *((__global int *)((__global char *)dst + dst_index_1 + 8)) = data0_2; - } -} -__kernel void arithm_flip_cols_C3_D5 (__global float *src, int src_step, int src_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 2) + dst_offset); - - float data0_0 = *((__global float *)((__global char *)src + src_index_0 + 0)); - float data0_1 = *((__global float *)((__global char *)src + src_index_0 + 4)); - float data0_2 = *((__global float *)((__global char *)src + src_index_0 + 8)); - - float data1_0 = *((__global float *)((__global char *)src + src_index_1 + 0)); - float data1_1 = *((__global float *)((__global char *)src + src_index_1 + 4)); - float data1_2 = *((__global float *)((__global char *)src + src_index_1 + 8)); - - *((__global float *)((__global char *)dst + dst_index_0 + 0)) = data1_0; - *((__global float *)((__global char *)dst + dst_index_0 + 4)) = data1_1; - *((__global float *)((__global char *)dst + dst_index_0 + 8)) = data1_2; - - *((__global float *)((__global char *)dst + dst_index_1 + 0)) = data0_0; - *((__global float *)((__global char *)dst + dst_index_1 + 4)) = data0_1; - 
*((__global float *)((__global char *)dst + dst_index_1 + 8)) = data0_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_flip_cols_C3_D6 (__global double *src, int src_step, int src_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x * 3 << 3) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x * 3 << 3) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 3) + dst_offset); - - double data0_0 = *((__global double *)((__global char *)src + src_index_0 + 0)); - double data0_1 = *((__global double *)((__global char *)src + src_index_0 + 8)); - double data0_2 = *((__global double *)((__global char *)src + src_index_0 + 16)); - - double data1_0 = *((__global double *)((__global char *)src + src_index_1 + 0)); - double data1_1 = *((__global double *)((__global char *)src + src_index_1 + 8)); - double data1_2 = *((__global double *)((__global char *)src + src_index_1 + 16)); - - *((__global double *)((__global char *)dst + dst_index_0 + 0 )) = data1_0; - *((__global double *)((__global char *)dst + dst_index_0 + 8 )) = data1_1; - *((__global double *)((__global char *)dst + dst_index_0 + 16)) = data1_2; - - *((__global double *)((__global char *)dst + dst_index_1 + 0 )) = data0_0; - *((__global double *)((__global char *)dst + dst_index_1 + 8 )) = data0_1; - *((__global double *)((__global char *)dst + dst_index_1 + 16)) = data0_2; - } -} -#endif -__kernel void arithm_flip_cols_C4_D0 (__global uchar *src, int src_step, int src_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int 
src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset); - - uchar4 data0 = *((__global uchar4 *)(src + src_index_0)); - uchar4 data1 = *((__global uchar4 *)(src + src_index_1)); - - *((__global uchar4 *)(dst + dst_index_0)) = data1; - *((__global uchar4 *)(dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C4_D1 (__global char *src, int src_step, int src_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset); - - char4 data0 = *((__global char4 *)(src + src_index_0)); - char4 data1 = *((__global char4 *)(src + src_index_1)); - - *((__global char4 *)(dst + dst_index_0)) = data1; - *((__global char4 *)(dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C4_D2 (__global ushort *src, int src_step, int src_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 3) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset); - - ushort4 data0 = *((__global ushort4 *)((__global char *)src + src_index_0)); - ushort4 data1 = 
*((__global ushort4 *)((__global char *)src + src_index_1)); - - *((__global ushort4 *)((__global char *)dst + dst_index_0)) = data1; - *((__global ushort4 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C4_D3 (__global short *src, int src_step, int src_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 3) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset); - - short4 data0 = *((__global short4 *)((__global char *)src + src_index_0)); - short4 data1 = *((__global short4 *)((__global char *)src + src_index_1)); - - *((__global short4 *)((__global char *)dst + dst_index_0)) = data1; - *((__global short4 *)((__global char *)dst + dst_index_1)) = data0; - } -} - -__kernel void arithm_flip_cols_C4_D4 (__global int *src, int src_step, int src_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 4) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset); - - int4 data0 = *((__global int4 *)((__global char *)src + src_index_0)); - int4 data1 = *((__global int4 *)((__global char *)src + src_index_1)); - - *((__global int4 *)((__global char *)dst + dst_index_0)) = data1; - *((__global int4 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_cols_C4_D5 
(__global float *src, int src_step, int src_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < thread_cols && y < rows) - { - int src_index_0 = mad24(y, src_step, (x << 4) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset); - - float4 data0 = *((__global float4 *)((__global char *)src + src_index_0)); - float4 data1 = *((__global float4 *)((__global char *)src + src_index_1)); +///////////////////////////////////////////// flip cols /////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////////// - *((__global float4 *)((__global char *)dst + dst_index_0)) = data1; - *((__global float4 *)((__global char *)dst + dst_index_1)) = data0; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_flip_cols_C4_D6 (__global double *src, int src_step, int src_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_cols, int dst_step1) +__kernel void arithm_flip_cols(__global T * src, int src_step, int src_offset, + __global T * dst, int dst_step, int dst_offset, + int rows, int cols, int thread_rows, int thread_cols) { int x = get_global_id(0); int y = get_global_id(1); if (x < thread_cols && y < rows) { - int src_index_0 = mad24(y, src_step, (x << 5) + src_offset); - int src_index_1 = mad24(y, src_step, ((cols - x -1) << 5) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 5) + dst_offset); - int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 5) + dst_offset); + int src_index_0 = mad24(y, src_step, x + src_offset); + int dst_index_0 = mad24(y, dst_step, cols - x - 1 + dst_offset); - double4 data0 = *((__global double4 
*)((__global char *)src + src_index_0)); - double4 data1 = *((__global double4 *)((__global char *)src + src_index_1)); + int src_index_1 = mad24(y, src_step, cols - x - 1 + src_offset); + int dst_index_1 = mad24(y, dst_step, x + dst_offset); - *((__global double4 *)((__global char *)dst + dst_index_0)) = data1; - *((__global double4 *)((__global char *)dst + dst_index_1)) = data0; + T data0 = src[src_index_0], data1 = src[src_index_1]; + dst[dst_index_1] = data1; + dst[dst_index_0] = data0; } } -#endif diff --git a/modules/ocl/src/opencl/arithm_flip_rc.cl b/modules/ocl/src/opencl/arithm_flip_rc.cl deleted file mode 100644 index 4a20382755..0000000000 --- a/modules/ocl/src/opencl/arithm_flip_rc.cl +++ /dev/null @@ -1,753 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
-// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif - -////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////flip rows and cols/////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void arithm_flip_rc_C1_D0 (__global uchar *src, int src_step, int src_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset); - - uchar data0 = *(src + src_index_0); - uchar data1 = 
*(src + src_index_1); - - *(dst + dst_index_0) = data1; - *(dst + dst_index_1) = data0; - } -} -__kernel void arithm_flip_rc_C1_D1 (__global char *src, int src_step, int src_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset); - - char data0 = *(src + src_index_0); - char data1 = *(src + src_index_1); - - *(dst + dst_index_0) = data1; - *(dst + dst_index_1) = data0; - } -} -__kernel void arithm_flip_rc_C1_D2 (__global ushort *src, int src_step, int src_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 1) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset); - - ushort data0 = *((__global ushort *)((__global char *)src + src_index_0)); - ushort data1 = *((__global ushort *)((__global char *)src + src_index_1)); - - *((__global ushort *)((__global char *)dst + dst_index_0)) = data1; - *((__global ushort *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C1_D3 (__global short *src, int src_step, int src_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < 
thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 1) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset); - - short data0 = *((__global short *)((__global char *)src + src_index_0)); - short data1 = *((__global short *)((__global char *)src + src_index_1)); - - *((__global short *)((__global char *)dst + dst_index_0)) = data1; - *((__global short *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C1_D4 (__global int *src, int src_step, int src_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset); - - int data0 = *((__global int *)((__global char *)src + src_index_0)); - int data1 = *((__global int *)((__global char *)src + src_index_1)); - - *((__global int *)((__global char *)dst + dst_index_0)) = data1; - *((__global int *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C1_D5 (__global float *src, int src_step, int src_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + 
dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset); - - float data0 = *((__global float *)((__global char *)src + src_index_0)); - float data1 = *((__global float *)((__global char *)src + src_index_1)); - - *((__global float *)((__global char *)dst + dst_index_0)) = data1; - *((__global float *)((__global char *)dst + dst_index_1)) = data0; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_flip_rc_C1_D6 (__global double *src, int src_step, int src_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 3) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset); - - double data0 = *((__global double *)((__global char *)src + src_index_0)); - double data1 = *((__global double *)((__global char *)src + src_index_1)); - - *((__global double *)((__global char *)dst + dst_index_0)) = data1; - *((__global double *)((__global char *)dst + dst_index_1)) = data0; - } -} -#endif -__kernel void arithm_flip_rc_C2_D0 (__global uchar *src, int src_step, int src_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 1) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset); - - uchar2 data0 = *((__global uchar2 *)(src + src_index_0)); - 
uchar2 data1 = *((__global uchar2 *)(src + src_index_1)); - - *((__global uchar2 *)(dst + dst_index_0)) = data1; - *((__global uchar2 *)(dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C2_D1 (__global char *src, int src_step, int src_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 1) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset); - - char2 data0 = *((__global char2 *)(src + src_index_0)); - char2 data1 = *((__global char2 *)(src + src_index_1)); - - *((__global char2 *)(dst + dst_index_0)) = data1; - *((__global char2 *)(dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C2_D2 (__global ushort *src, int src_step, int src_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset); - - ushort2 data0 = *((__global ushort2 *)((__global char *)src + src_index_0)); - ushort2 data1 = *((__global ushort2 *)((__global char *)src + src_index_1)); - - *((__global ushort2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global ushort2 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C2_D3 (__global short *src, int src_step, int src_offset, - 
__global short *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset); - - short2 data0 = *((__global short2 *)((__global char *)src + src_index_0)); - short2 data1 = *((__global short2 *)((__global char *)src + src_index_1)); - - *((__global short2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global short2 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C2_D4 (__global int *src, int src_step, int src_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 3) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset); - - int2 data0 = *((__global int2 *)((__global char *)src + src_index_0)); - int2 data1 = *((__global int2 *)((__global char *)src + src_index_1)); - - *((__global int2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global int2 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C2_D5 (__global float *src, int src_step, int src_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int 
src_index_0 = mad24(y, src_step, (x << 3) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset); - - float2 data0 = *((__global float2 *)((__global char *)src + src_index_0)); - float2 data1 = *((__global float2 *)((__global char *)src + src_index_1)); - - *((__global float2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global float2 *)((__global char *)dst + dst_index_1)) = data0; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_flip_rc_C2_D6 (__global double *src, int src_step, int src_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 4) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset); - - double2 data0 = *((__global double2 *)((__global char *)src + src_index_0)); - double2 data1 = *((__global double2 *)((__global char *)src + src_index_1)); - - *((__global double2 *)((__global char *)dst + dst_index_0)) = data1; - *((__global double2 *)((__global char *)dst + dst_index_1)) = data0; - } -} -#endif - -__kernel void arithm_flip_rc_C3_D0 (__global uchar *src, int src_step, int src_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x * 3) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) * 3 + src_offset); - - int 
dst_index_0 = mad24(y, dst_step, (x * 3) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) * 3 + dst_offset); - - - uchar data0_0 = *(src + src_index_0 + 0); - uchar data0_1 = *(src + src_index_0 + 1); - uchar data0_2 = *(src + src_index_0 + 2); - - uchar data1_0 = *(src + src_index_1 + 0); - uchar data1_1 = *(src + src_index_1 + 1); - uchar data1_2 = *(src + src_index_1 + 2); - - *(dst + dst_index_0 + 0 ) = data1_0; - *(dst + dst_index_0 + 1 ) = data1_1; - *(dst + dst_index_0 + 2 ) = data1_2; - - *(dst + dst_index_1 + 0) = data0_0; - *(dst + dst_index_1 + 1) = data0_1; - *(dst + dst_index_1 + 2) = data0_2; - } -} -__kernel void arithm_flip_rc_C3_D1 (__global char *src, int src_step, int src_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x * 3) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) * 3 + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x * 3) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) * 3 + dst_offset); - - - char data0_0 = *(src + src_index_0 + 0); - char data0_1 = *(src + src_index_0 + 1); - char data0_2 = *(src + src_index_0 + 2); - - char data1_0 = *(src + src_index_1 + 0); - char data1_1 = *(src + src_index_1 + 1); - char data1_2 = *(src + src_index_1 + 2); - - *(dst + dst_index_0 + 0 ) = data1_0; - *(dst + dst_index_0 + 1 ) = data1_1; - *(dst + dst_index_0 + 2 ) = data1_2; - - *(dst + dst_index_1 + 0) = data0_0; - *(dst + dst_index_1 + 1) = data0_1; - *(dst + dst_index_1 + 2) = data0_2; - } -} -__kernel void arithm_flip_rc_C3_D2 (__global ushort *src, int src_step, int src_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - 
- if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 1) + dst_offset); - - ushort data0_0 = *((__global ushort *)((__global char *)src + src_index_0 + 0)); - ushort data0_1 = *((__global ushort *)((__global char *)src + src_index_0 + 2)); - ushort data0_2 = *((__global ushort *)((__global char *)src + src_index_0 + 4)); - - ushort data1_0 = *((__global ushort *)((__global char *)src + src_index_1 + 0)); - ushort data1_1 = *((__global ushort *)((__global char *)src + src_index_1 + 2)); - ushort data1_2 = *((__global ushort *)((__global char *)src + src_index_1 + 4)); - - *((__global ushort *)((__global char *)dst + dst_index_0 + 0)) = data1_0; - *((__global ushort *)((__global char *)dst + dst_index_0 + 2)) = data1_1; - *((__global ushort *)((__global char *)dst + dst_index_0 + 4)) = data1_2; - - *((__global ushort *)((__global char *)dst + dst_index_1 + 0)) = data0_0; - *((__global ushort *)((__global char *)dst + dst_index_1 + 2)) = data0_1; - *((__global ushort *)((__global char *)dst + dst_index_1 + 4)) = data0_2; - } -} -__kernel void arithm_flip_rc_C3_D3 (__global short *src, int src_step, int src_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 1) + dst_offset); - - short data0_0 = *((__global short *)((__global char *)src + 
src_index_0 + 0)); - short data0_1 = *((__global short *)((__global char *)src + src_index_0 + 2)); - short data0_2 = *((__global short *)((__global char *)src + src_index_0 + 4)); - - short data1_0 = *((__global short *)((__global char *)src + src_index_1 + 0)); - short data1_1 = *((__global short *)((__global char *)src + src_index_1 + 2)); - short data1_2 = *((__global short *)((__global char *)src + src_index_1 + 4)); - - *((__global short *)((__global char *)dst + dst_index_0 + 0)) = data1_0; - *((__global short *)((__global char *)dst + dst_index_0 + 2)) = data1_1; - *((__global short *)((__global char *)dst + dst_index_0 + 4)) = data1_2; - - *((__global short *)((__global char *)dst + dst_index_1 + 0)) = data0_0; - *((__global short *)((__global char *)dst + dst_index_1 + 2)) = data0_1; - *((__global short *)((__global char *)dst + dst_index_1 + 4)) = data0_2; - } -} - -__kernel void arithm_flip_rc_C3_D4 (__global int *src, int src_step, int src_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 2) + dst_offset); - - int data0_0 = *((__global int *)((__global char *)src + src_index_0 + 0)); - int data0_1 = *((__global int *)((__global char *)src + src_index_0 + 4)); - int data0_2 = *((__global int *)((__global char *)src + src_index_0 + 8)); - - int data1_0 = *((__global int *)((__global char *)src + src_index_1 + 0)); - int data1_1 = *((__global int *)((__global char *)src + src_index_1 + 4)); - int data1_2 = *((__global int *)((__global char *)src + src_index_1 + 8)); - - *((__global int *)((__global char 
*)dst + dst_index_0 + 0)) = data1_0; - *((__global int *)((__global char *)dst + dst_index_0 + 4)) = data1_1; - *((__global int *)((__global char *)dst + dst_index_0 + 8)) = data1_2; - - *((__global int *)((__global char *)dst + dst_index_1 + 0)) = data0_0; - *((__global int *)((__global char *)dst + dst_index_1 + 4)) = data0_1; - *((__global int *)((__global char *)dst + dst_index_1 + 8)) = data0_2; - } -} -__kernel void arithm_flip_rc_C3_D5 (__global float *src, int src_step, int src_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 2) + dst_offset); - - float data0_0 = *((__global float *)((__global char *)src + src_index_0 + 0)); - float data0_1 = *((__global float *)((__global char *)src + src_index_0 + 4)); - float data0_2 = *((__global float *)((__global char *)src + src_index_0 + 8)); - - float data1_0 = *((__global float *)((__global char *)src + src_index_1 + 0)); - float data1_1 = *((__global float *)((__global char *)src + src_index_1 + 4)); - float data1_2 = *((__global float *)((__global char *)src + src_index_1 + 8)); - - *((__global float *)((__global char *)dst + dst_index_0 + 0)) = data1_0; - *((__global float *)((__global char *)dst + dst_index_0 + 4)) = data1_1; - *((__global float *)((__global char *)dst + dst_index_0 + 8)) = data1_2; - - *((__global float *)((__global char *)dst + dst_index_1 + 0)) = data0_0; - *((__global float *)((__global char *)dst + dst_index_1 + 4)) = data0_1; - *((__global float *)((__global char *)dst + dst_index_1 + 8)) = data0_2; - } -} - -#if defined 
(DOUBLE_SUPPORT) -__kernel void arithm_flip_rc_C3_D6 (__global double *src, int src_step, int src_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x * 3 << 3) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x * 3 << 3) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 3) + dst_offset); - - double data0_0 = *((__global double *)((__global char *)src + src_index_0 + 0 )); - double data0_1 = *((__global double *)((__global char *)src + src_index_0 + 8 )); - double data0_2 = *((__global double *)((__global char *)src + src_index_0 + 16)); - - double data1_0 = *((__global double *)((__global char *)src + src_index_1 + 0 )); - double data1_1 = *((__global double *)((__global char *)src + src_index_1 + 8 )); - double data1_2 = *((__global double *)((__global char *)src + src_index_1 + 16)); - - *((__global double *)((__global char *)dst + dst_index_0 + 0 )) = data1_0; - *((__global double *)((__global char *)dst + dst_index_0 + 8 )) = data1_1; - *((__global double *)((__global char *)dst + dst_index_0 + 16)) = data1_2; - - *((__global double *)((__global char *)dst + dst_index_1 + 0 )) = data0_0; - *((__global double *)((__global char *)dst + dst_index_1 + 8 )) = data0_1; - *((__global double *)((__global char *)dst + dst_index_1 + 16)) = data0_2; - } -} -#endif -__kernel void arithm_flip_rc_C4_D0 (__global uchar *src, int src_step, int src_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = 
mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset); - - uchar4 data0 = *((__global uchar4 *)(src + src_index_0)); - uchar4 data1 = *((__global uchar4 *)(src + src_index_1)); - - *((__global uchar4 *)(dst + dst_index_0)) = data1; - *((__global uchar4 *)(dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C4_D1 (__global char *src, int src_step, int src_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset); - - char4 data0 = *((__global char4 *)(src + src_index_0)); - char4 data1 = *((__global char4 *)(src + src_index_1)); - - *((__global char4 *)(dst + dst_index_0)) = data1; - *((__global char4 *)(dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C4_D2 (__global ushort *src, int src_step, int src_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 3) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset); - - ushort4 data0 = *((__global ushort4 *)((__global char *)src + src_index_0)); - ushort4 data1 = *((__global ushort4 
*)((__global char *)src + src_index_1)); - - *((__global ushort4 *)((__global char *)dst + dst_index_0)) = data1; - *((__global ushort4 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C4_D3 (__global short *src, int src_step, int src_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 3) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset); - - short4 data0 = *((__global short4 *)((__global char *)src + src_index_0)); - short4 data1 = *((__global short4 *)((__global char *)src + src_index_1)); - - *((__global short4 *)((__global char *)dst + dst_index_0)) = data1; - *((__global short4 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void arithm_flip_rc_C4_D4 (__global int *src, int src_step, int src_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 4) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset); - - int4 data0 = *((__global int4 *)((__global char *)src + src_index_0)); - int4 data1 = *((__global int4 *)((__global char *)src + src_index_1)); - - *((__global int4 *)((__global char *)dst + dst_index_0)) = data1; - *((__global int4 *)((__global char *)dst + dst_index_1)) = data0; - } -} -__kernel void 
arithm_flip_rc_C4_D5 (__global float *src, int src_step, int src_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 4) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset); - - float4 data0 = *((__global float4 *)((__global char *)src + src_index_0)); - float4 data1 = *((__global float4 *)((__global char *)src + src_index_1)); - - *((__global float4 *)((__global char *)dst + dst_index_0)) = data1; - *((__global float4 *)((__global char *)dst + dst_index_1)) = data0; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_flip_rc_C4_D6 (__global double *src, int src_step, int src_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int thread_rows, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < thread_rows) - { - int src_index_0 = mad24(y, src_step, (x << 5) + src_offset); - int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 5) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x << 5) + dst_offset); - int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 5) + dst_offset); - - double4 data0 = *((__global double4 *)((__global char *)src + src_index_0)); - double4 data1 = *((__global double4 *)((__global char *)src + src_index_1)); - - *((__global double4 *)((__global char *)dst + dst_index_0)) = data1; - *((__global double4 *)((__global char *)dst + dst_index_1)) = data0; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_magnitude.cl b/modules/ocl/src/opencl/arithm_magnitude.cl index 3403f5caf9..7c8cc187e3 100644 --- 
a/modules/ocl/src/opencl/arithm_magnitude.cl +++ b/modules/ocl/src/opencl/arithm_magnitude.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --git a/modules/ocl/src/opencl/arithm_minMax.cl b/modules/ocl/src/opencl/arithm_minMax.cl index c5d3ec2abd..33a39d83f3 100644 --- a/modules/ocl/src/opencl/arithm_minMax.cl +++ b/modules/ocl/src/opencl/arithm_minMax.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. 
@@ -53,61 +53,66 @@ #endif #endif -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable -#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable +#ifdef DEPTH_5 +#define MIN_VAL (-FLT_MAX) +#define MAX_VAL FLT_MAX +#elif defined DEPTH_6 +#define MIN_VAL (-DBL_MAX) +#define MAX_VAL DBL_MAX +#endif /**************************************Array minMax**************************************/ __kernel void arithm_op_minMax(__global const T * src, __global T * dst, int cols, int invalid_cols, int offset, int elemnum, int groupnum) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - - unsigned int idx = offset + id + (id / cols) * invalid_cols; - - __local T localmem_max[128], localmem_min[128]; - T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; - - for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) - { - idx = offset + id + (id / cols) * invalid_cols; - temp = src[idx]; - minval = min(minval, temp); - maxval = max(maxval, temp); - } - - if(lid > 127) - { - localmem_min[lid - 128] = minval; - localmem_max[lid - 128] = maxval; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(lid < 128) - { - localmem_min[lid] = min(minval, localmem_min[lid]); - localmem_max[lid] = max(maxval, localmem_max[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int lsize = 64; lsize > 0; lsize >>= 1) - { - if (lid < lsize) - { - int lid2 = lsize + lid; - localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); - localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (lid == 0) - { - dst[gid] = localmem_min[0]; - dst[gid + groupnum] = localmem_max[0]; - } + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); + + int idx = offset + id + (id / cols) * invalid_cols; + + __local T localmem_max[128], localmem_min[128]; + T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; + + for (int grainSize = 
groupnum << 8; id < elemnum; id += grainSize) + { + idx = offset + id + (id / cols) * invalid_cols; + temp = src[idx]; + minval = min(minval, temp); + maxval = max(maxval, temp); + } + + if (lid > 127) + { + localmem_min[lid - 128] = minval; + localmem_max[lid - 128] = maxval; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid < 128) + { + localmem_min[lid] = min(minval, localmem_min[lid]); + localmem_max[lid] = max(maxval, localmem_max[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = 64; lsize > 0; lsize >>= 1) + { + if (lid < lsize) + { + int lid2 = lsize + lid; + localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); + localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (lid == 0) + { + dst[gid] = localmem_min[0]; + dst[gid + groupnum] = localmem_max[0]; + } } __kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst, @@ -115,57 +120,57 @@ __kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst, int elemnum, int groupnum, const __global uchar * mask, int minvalid_cols, int moffset) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - - unsigned int idx = offset + id + (id / cols) * invalid_cols; - unsigned int midx = moffset + id + (id / cols) * minvalid_cols; - - __local T localmem_max[128], localmem_min[128]; - T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; - - for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) - { - idx = offset + id + (id / cols) * invalid_cols; - midx = moffset + id + (id / cols) * minvalid_cols; - - if (mask[midx]) - { - temp = src[idx]; - minval = min(minval, temp); - maxval = max(maxval, temp); - } - } - - if(lid > 127) - { - localmem_min[lid - 128] = minval; - localmem_max[lid - 128] = maxval; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(lid < 128) - { - localmem_min[lid] = min(minval, localmem_min[lid]); - localmem_max[lid] = 
max(maxval, localmem_max[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int lsize = 64; lsize > 0; lsize >>= 1) - { - if (lid < lsize) - { - int lid2 = lsize + lid; - localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); - localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (lid == 0) - { - dst[gid] = localmem_min[0]; - dst[gid + groupnum] = localmem_max[0]; - } + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); + + int idx = offset + id + (id / cols) * invalid_cols; + int midx = moffset + id + (id / cols) * minvalid_cols; + + __local T localmem_max[128], localmem_min[128]; + T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; + + for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) + { + idx = offset + id + (id / cols) * invalid_cols; + midx = moffset + id + (id / cols) * minvalid_cols; + + if (mask[midx]) + { + temp = src[idx]; + minval = min(minval, temp); + maxval = max(maxval, temp); + } + } + + if (lid > 127) + { + localmem_min[lid - 128] = minval; + localmem_max[lid - 128] = maxval; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid < 128) + { + localmem_min[lid] = min(minval, localmem_min[lid]); + localmem_max[lid] = max(maxval, localmem_max[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = 64; lsize > 0; lsize >>= 1) + { + if (lid < lsize) + { + int lid2 = lsize + lid; + localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); + localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (lid == 0) + { + dst[gid] = localmem_min[0]; + dst[gid + groupnum] = localmem_max[0]; + } } diff --git a/modules/ocl/src/opencl/arithm_minMaxLoc.cl b/modules/ocl/src/opencl/arithm_minMaxLoc.cl index 848aac3197..076fb06001 100644 --- a/modules/ocl/src/opencl/arithm_minMaxLoc.cl +++ b/modules/ocl/src/opencl/arithm_minMaxLoc.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form 
must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -137,118 +137,114 @@ #define repeat_e(a) a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0; #endif - -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable -#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable - /**************************************Array minMax**************************************/ __kernel void arithm_op_minMaxLoc(int cols, int invalid_cols, int offset, int elemnum, int groupnum, __global VEC_TYPE *src, __global RES_TYPE *dst) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - unsigned int idx = offset + id + (id / cols) * invalid_cols; - - __local VEC_TYPE localmem_max[128], localmem_min[128]; - VEC_TYPE minval, maxval, temp; - - __local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128]; - VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1; - - int idx_c; - - if (id < elemnum) - { - temp = src[idx]; - idx_c = idx << 2; - temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3); - - if (id % cols == 0 ) - { - repeat_s(temp); - repeat_s(temploc); - } - if (id % cols == cols - 1) - { - repeat_e(temp); - repeat_e(temploc); - } - minval = temp; - maxval = temp; - minloc = temploc; - maxloc = temploc; - } - else - { - minval = MAX_VAL; - maxval = MIN_VAL; - minloc = negative; - maxloc = negative; - } - - int grainSize = (groupnum << 8); - for (id = id + grainSize; id < elemnum; id = id + grainSize) - { - idx = offset + id + (id / cols) * invalid_cols; - temp = src[idx]; - idx_c = idx << 2; - temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3); - - if 
(id % cols == 0 ) - { - repeat_s(temp); - repeat_s(temploc); - } - if (id % cols == cols - 1) - { - repeat_e(temp); - repeat_e(temploc); - } - - minval = min(minval, temp); - maxval = max(maxval, temp); - minloc = CONDITION_FUNC(minval == temp, temploc, minloc); - maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc); - } - - if (lid > 127) - { - localmem_min[lid - 128] = minval; - localmem_max[lid - 128] = maxval; - localmem_minloc[lid - 128] = minloc; - localmem_maxloc[lid - 128] = maxloc; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid < 128) - { - localmem_min[lid] = min(minval,localmem_min[lid]); - localmem_max[lid] = max(maxval,localmem_max[lid]); - localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]); - localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int lsize = 64; lsize > 0; lsize >>= 1) - { + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); + int idx = offset + id + (id / cols) * invalid_cols; + + __local VEC_TYPE localmem_max[128], localmem_min[128]; + VEC_TYPE minval, maxval, temp; + + __local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128]; + VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1; + + int idx_c; + + if (id < elemnum) + { + temp = src[idx]; + idx_c = idx << 2; + temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3); + + if (id % cols == 0 ) + { + repeat_s(temp); + repeat_s(temploc); + } + if (id % cols == cols - 1) + { + repeat_e(temp); + repeat_e(temploc); + } + minval = temp; + maxval = temp; + minloc = temploc; + maxloc = temploc; + } + else + { + minval = MAX_VAL; + maxval = MIN_VAL; + minloc = negative; + maxloc = negative; + } + + int grainSize = (groupnum << 8); + for (id = id + grainSize; id < elemnum; id = id + grainSize) + { + idx = offset + id + (id / cols) * invalid_cols; + temp = src[idx]; + idx_c = idx << 2; + temploc = 
(VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3); + + if (id % cols == 0 ) + { + repeat_s(temp); + repeat_s(temploc); + } + if (id % cols == cols - 1) + { + repeat_e(temp); + repeat_e(temploc); + } + + minval = min(minval, temp); + maxval = max(maxval, temp); + minloc = CONDITION_FUNC(minval == temp, temploc, minloc); + maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc); + } + + if (lid > 127) + { + localmem_min[lid - 128] = minval; + localmem_max[lid - 128] = maxval; + localmem_minloc[lid - 128] = minloc; + localmem_maxloc[lid - 128] = maxloc; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid < 128) + { + localmem_min[lid] = min(minval,localmem_min[lid]); + localmem_max[lid] = max(maxval,localmem_max[lid]); + localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]); + localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = 64; lsize > 0; lsize >>= 1) + { if (lid < lsize) { - int lid2 = lsize + lid; - localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); - localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); - localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]); - localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]); + int lid2 = lsize + lid; + localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); + localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); + localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]); + localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]); } barrier(CLK_LOCAL_MEM_FENCE); - } - - if ( lid == 0) - { - dst[gid] = CONVERT_RES_TYPE(localmem_min[0]); - dst[gid + groupnum] = 
CONVERT_RES_TYPE(localmem_max[0]); - dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]); - dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]); - } + } + + if ( lid == 0) + { + dst[gid] = CONVERT_RES_TYPE(localmem_min[0]); + dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]); + dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]); + dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]); + } } diff --git a/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl b/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl index 0af4f7ba03..4d73be9541 100644 --- a/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl +++ b/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. 
@@ -147,96 +147,96 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global TYPE *src, int minvalid_cols,int moffset,__global uchar *mask,__global RES_TYPE *dst) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - unsigned int idx = id + (id / cols) * invalid_cols; - unsigned int midx = id + (id / cols) * minvalid_cols; - __local VEC_TYPE lm_max[128],lm_min[128]; - VEC_TYPE minval,maxval,temp,m_temp; - __local VEC_TYPE_LOC lm_maxloc[128],lm_minloc[128]; - VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1,one = 1,zero = 0; - if(id < elemnum) - { - temp = vload4(idx, &src[offset]); - m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); - int idx_c = (idx << 2) + offset; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); - if(id % cols == cols - 1) - { - repeat_me(m_temp); - repeat_e(temploc); - } - minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL; - maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL; - minloc = CONDITION_FUNC(m_temp != (VEC_TYPE)0, temploc , negative); - maxloc = minloc; - } - else - { - minval = MAX_VAL; - maxval = MIN_VAL; - minloc = negative; - maxloc = negative; - } - for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) - { - idx = id + (id / cols) * invalid_cols; - midx = id + (id / cols) * minvalid_cols; - temp = vload4(idx, &src[offset]); - m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); - int idx_c = (idx << 2) + offset; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); - if(id % cols == cols - 1) - { - repeat_me(m_temp); - repeat_e(temploc); - } - minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval); - maxval = max(maxval,m_temp != (VEC_TYPE)0 ? 
temp : maxval); + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); + int idx = id + (id / cols) * invalid_cols; + int midx = id + (id / cols) * minvalid_cols; + __local VEC_TYPE lm_max[128],lm_min[128]; + VEC_TYPE minval,maxval,temp,m_temp; + __local VEC_TYPE_LOC lm_maxloc[128],lm_minloc[128]; + VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1,one = 1,zero = 0; + if(id < elemnum) + { + temp = vload4(idx, &src[offset]); + m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); + int idx_c = (idx << 2) + offset; + temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); + if(id % cols == cols - 1) + { + repeat_me(m_temp); + repeat_e(temploc); + } + minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL; + maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL; + minloc = CONDITION_FUNC(m_temp != (VEC_TYPE)0, temploc , negative); + maxloc = minloc; + } + else + { + minval = MAX_VAL; + maxval = MIN_VAL; + minloc = negative; + maxloc = negative; + } + for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) + { + idx = id + (id / cols) * invalid_cols; + midx = id + (id / cols) * minvalid_cols; + temp = vload4(idx, &src[offset]); + m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); + int idx_c = (idx << 2) + offset; + temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); + if(id % cols == cols - 1) + { + repeat_me(m_temp); + repeat_e(temploc); + } + minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval); + maxval = max(maxval,m_temp != (VEC_TYPE)0 ? 
temp : maxval); - minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc); - maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc); - } - if(lid > 127) - { - lm_min[lid - 128] = minval; - lm_max[lid - 128] = maxval; - lm_minloc[lid - 128] = minloc; - lm_maxloc[lid - 128] = maxloc; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 128) - { - lm_min[lid] = min(minval,lm_min[lid]); - lm_max[lid] = max(maxval,lm_max[lid]); - VEC_TYPE con_min = CONVERT_TYPE(minloc != negative ? one : zero); - VEC_TYPE con_max = CONVERT_TYPE(maxloc != negative ? one : zero); - lm_minloc[lid] = CONDITION_FUNC((lm_min[lid] == minval) && (con_min != (VEC_TYPE)0), minloc , lm_minloc[lid]); - lm_maxloc[lid] = CONDITION_FUNC((lm_max[lid] == maxval) && (con_max != (VEC_TYPE)0), maxloc , lm_maxloc[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - for(int lsize = 64; lsize > 0; lsize >>= 1) - { - if(lid < lsize) - { - int lid2 = lsize + lid; - lm_min[lid] = min(lm_min[lid] , lm_min[lid2]); - lm_max[lid] = max(lm_max[lid] , lm_max[lid2]); - VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero); - VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? 
one : zero); - lm_minloc[lid] = - CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]); - lm_maxloc[lid] = - CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if( lid == 0) - { - dst[gid] = CONVERT_RES_TYPE(lm_min[0]); - dst[gid + groupnum] = CONVERT_RES_TYPE(lm_max[0]); - dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(lm_minloc[0]); - dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]); - } + minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc); + maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc); + } + if(lid > 127) + { + lm_min[lid - 128] = minval; + lm_max[lid - 128] = maxval; + lm_minloc[lid - 128] = minloc; + lm_maxloc[lid - 128] = maxloc; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 128) + { + lm_min[lid] = min(minval,lm_min[lid]); + lm_max[lid] = max(maxval,lm_max[lid]); + VEC_TYPE con_min = CONVERT_TYPE(minloc != negative ? one : zero); + VEC_TYPE con_max = CONVERT_TYPE(maxloc != negative ? one : zero); + lm_minloc[lid] = CONDITION_FUNC((lm_min[lid] == minval) && (con_min != (VEC_TYPE)0), minloc , lm_minloc[lid]); + lm_maxloc[lid] = CONDITION_FUNC((lm_max[lid] == maxval) && (con_max != (VEC_TYPE)0), maxloc , lm_maxloc[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(int lsize = 64; lsize > 0; lsize >>= 1) + { + if(lid < lsize) + { + int lid2 = lsize + lid; + lm_min[lid] = min(lm_min[lid] , lm_min[lid2]); + lm_max[lid] = max(lm_max[lid] , lm_max[lid2]); + VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero); + VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? 
one : zero); + lm_minloc[lid] = + CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]); + lm_maxloc[lid] = + CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if( lid == 0) + { + dst[gid] = CONVERT_RES_TYPE(lm_min[0]); + dst[gid + groupnum] = CONVERT_RES_TYPE(lm_max[0]); + dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(lm_minloc[0]); + dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]); + } } diff --git a/modules/ocl/src/opencl/arithm_minMax_mask.cl b/modules/ocl/src/opencl/arithm_minMax_mask.cl deleted file mode 100644 index 734ccab750..0000000000 --- a/modules/ocl/src/opencl/arithm_minMax_mask.cl +++ /dev/null @@ -1,196 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Shengen Yan,yanshengen@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. 
-// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -/**************************************PUBLICFUNC*************************************/ -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif - -#if defined (DEPTH_0) -#define VEC_TYPE uchar8 -#define TYPE uchar -#define CONVERT_TYPE convert_uchar8 -#define MIN_VAL 0 -#define MAX_VAL 255 -#endif -#if defined (DEPTH_1) -#define VEC_TYPE char8 -#define TYPE char -#define CONVERT_TYPE convert_char8 -#define MIN_VAL -128 -#define MAX_VAL 127 -#endif -#if defined (DEPTH_2) -#define VEC_TYPE ushort8 -#define TYPE ushort -#define CONVERT_TYPE convert_ushort8 -#define MIN_VAL 0 -#define MAX_VAL 65535 -#endif -#if defined (DEPTH_3) -#define VEC_TYPE short8 -#define TYPE short -#define CONVERT_TYPE convert_short8 -#define MIN_VAL -32768 -#define MAX_VAL 32767 -#endif -#if defined (DEPTH_4) -#define VEC_TYPE int8 -#define TYPE int -#define CONVERT_TYPE convert_int8 -#define MIN_VAL INT_MIN -#define MAX_VAL INT_MAX -#endif -#if defined (DEPTH_5) -#define VEC_TYPE float8 -#define TYPE float -#define CONVERT_TYPE convert_float8 -#define MIN_VAL (-FLT_MAX) -#define MAX_VAL FLT_MAX -#endif -#if defined (DEPTH_6) -#define VEC_TYPE double8 -#define TYPE double -#define CONVERT_TYPE convert_double8 -#define MIN_VAL (-DBL_MAX) -#define MAX_VAL DBL_MAX -#endif - -#if defined (REPEAT_E0) -#define repeat_me(a) a = a; -#endif -#if defined (REPEAT_E1) -#define repeat_me(a) a.s7 = 0; -#endif -#if defined (REPEAT_E2) -#define repeat_me(a) a.s7 = 0;a.s6 = 0; -#endif -#if defined (REPEAT_E3) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0; -#endif -#if defined (REPEAT_E4) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0; -#endif -#if defined (REPEAT_E5) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0; -#endif -#if defined (REPEAT_E6) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0; -#endif -#if defined (REPEAT_E7) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 
0; -#endif - -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable -#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable - -/**************************************Array minMax mask**************************************/ -__kernel void arithm_op_minMax_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum, __global TYPE *src, - int minvalid_cols,int moffset, __global uchar *mask,__global VEC_TYPE *dst) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - unsigned int idx = id + (id / cols) * invalid_cols; - unsigned int midx = id + (id / cols) * minvalid_cols; - __local VEC_TYPE localmem_max[128],localmem_min[128]; - VEC_TYPE minval,maxval,temp,m_temp; - if(id < elemnum) - { - temp = vload8(idx, &src[offset]); - m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset])); - if(id % cols == cols - 1) - { - repeat_me(m_temp); - } - minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL; - maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL; - } - else - { - minval = MAX_VAL; - maxval = MIN_VAL; - } - for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) - { - idx = id + (id / cols) * invalid_cols; - midx = id + (id / cols) * minvalid_cols; - temp = vload8(idx, &src[offset]); - m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset])); - if(id % cols == cols - 1) - { - repeat_me(m_temp); - } - minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval); - maxval = max(maxval,m_temp != (VEC_TYPE)0 ? 
temp : maxval); - } - if(lid > 127) - { - localmem_min[lid - 128] = minval; - localmem_max[lid - 128] = maxval; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 128) - { - localmem_min[lid] = min(minval,localmem_min[lid]); - localmem_max[lid] = max(maxval,localmem_max[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - for(int lsize = 64; lsize > 0; lsize >>= 1) - { - if(lid < lsize) - { - int lid2 = lsize + lid; - localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]); - localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if( lid == 0) - { - dst[gid] = localmem_min[0]; - dst[gid + groupnum] = localmem_max[0]; - } -} diff --git a/modules/ocl/src/opencl/arithm_nonzero.cl b/modules/ocl/src/opencl/arithm_nonzero.cl index 085386f5c3..fc98257962 100644 --- a/modules/ocl/src/opencl/arithm_nonzero.cl +++ b/modules/ocl/src/opencl/arithm_nonzero.cl @@ -55,11 +55,11 @@ __kernel void arithm_op_nonzero(int cols, int invalid_cols, int offset, int elemnum, int groupnum, __global srcT *src, __global dstT *dst) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); - unsigned int idx = offset + id + (id / cols) * invalid_cols; + int idx = offset + id + (id / cols) * invalid_cols; __local dstT localmem_nonzero[128]; dstT nonzero = (dstT)(0); srcT zero = (srcT)(0), one = (srcT)(1); diff --git a/modules/ocl/src/opencl/arithm_phase.cl b/modules/ocl/src/opencl/arithm_phase.cl index b6bc7b42b4..f9835948c4 100644 --- a/modules/ocl/src/opencl/arithm_phase.cl +++ b/modules/ocl/src/opencl/arithm_phase.cl @@ -45,15 +45,17 @@ // #if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable + #ifdef cl_khr_fp64 + #pragma OPENCL EXTENSION cl_khr_fp64:enable + #elif defined (cl_amd_fp64) + 
#pragma OPENCL EXTENSION cl_amd_fp64:enable + #endif + #define CV_PI 3.1415926535897932384626433832795 + #define CV_2PI 2*CV_PI +#else + #define CV_PI 3.1415926535897932384626433832795f + #define CV_2PI 2*CV_PI #endif -#endif - -#define CV_PI 3.1415926535898 -#define CV_2PI 2*3.1415926535898 /**************************************phase inradians**************************************/ diff --git a/modules/ocl/src/opencl/arithm_polarToCart.cl b/modules/ocl/src/opencl/arithm_polarToCart.cl index 180ea6de3b..8469cdb097 100644 --- a/modules/ocl/src/opencl/arithm_polarToCart.cl +++ b/modules/ocl/src/opencl/arithm_polarToCart.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. 
@@ -43,12 +43,13 @@ // //M*/ -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable +#ifdef DOUBLE_SUPPORT + #pragma OPENCL EXTENSION cl_khr_fp64:enable + #define CV_PI 3.1415926535897932384626433832795 +#else + #define CV_PI 3.1415926535897932384626433832795f #endif -#define CV_PI 3.1415926535897932384626433832795 - ///////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////polarToCart with magnitude////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -72,7 +73,7 @@ __kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, in float x = *((__global float *)((__global char *)src1 + src1_index)); float y = *((__global float *)((__global char *)src2 + src2_index)); - float ascale = CV_PI/180.0; + float ascale = CV_PI/180.0f; float alpha = angInDegree == 1 ? y * ascale : y; float a = cos(alpha) * x; float b = sin(alpha) * x; @@ -134,7 +135,7 @@ __kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int sr float y = *((__global float *)((__global char *)src + src_index)); - float ascale = CV_PI/180.0; + float ascale = CV_PI/180.0f; float alpha = angInDegree == 1 ? 
y * ascale : y; float a = cos(alpha); float b = sin(alpha); diff --git a/modules/ocl/src/opencl/arithm_sum.cl b/modules/ocl/src/opencl/arithm_sum.cl index 6eb6e48323..7ada5be4c1 100644 --- a/modules/ocl/src/opencl/arithm_sum.cl +++ b/modules/ocl/src/opencl/arithm_sum.cl @@ -66,39 +66,39 @@ __kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum, __global srcT *src, __global dstT *dst) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - unsigned int idx = offset + id + (id / cols) * invalid_cols; + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0); + int idx = offset + id + (id / cols) * invalid_cols; - __local dstT localmem_sum[128]; - dstT sum = (dstT)(0), temp; + __local dstT localmem_sum[128]; + dstT sum = (dstT)(0), temp; - for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) - { - idx = offset + id + (id / cols) * invalid_cols; - temp = convertToDstT(src[idx]); - FUNC(temp, sum); - } + for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) + { + idx = offset + id + (id / cols) * invalid_cols; + temp = convertToDstT(src[idx]); + FUNC(temp, sum); + } - if (lid > 127) - localmem_sum[lid - 128] = sum; - barrier(CLK_LOCAL_MEM_FENCE); + if (lid > 127) + localmem_sum[lid - 128] = sum; + barrier(CLK_LOCAL_MEM_FENCE); - if (lid < 128) - localmem_sum[lid] = sum + localmem_sum[lid]; - barrier(CLK_LOCAL_MEM_FENCE); + if (lid < 128) + localmem_sum[lid] = sum + localmem_sum[lid]; + barrier(CLK_LOCAL_MEM_FENCE); - for (int lsize = 64; lsize > 0; lsize >>= 1) - { - if (lid < lsize) - { - int lid2 = lsize + lid; - localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } + for (int lsize = 64; lsize > 0; lsize >>= 1) + { + if (lid < lsize) + { + int lid2 = lsize + lid; + localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } - if (lid 
== 0) - dst[gid] = localmem_sum[0]; + if (lid == 0) + dst[gid] = localmem_sum[0]; } diff --git a/modules/ocl/src/opencl/blend_linear.cl b/modules/ocl/src/opencl/blend_linear.cl index 50c5c39c5f..06a51f25cf 100644 --- a/modules/ocl/src/opencl/blend_linear.cl +++ b/modules/ocl/src/opencl/blend_linear.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -42,99 +42,37 @@ // the use of this software, even if advised of the possibility of such damage. // //M*/ -__kernel void BlendLinear_C1_D0( - __global uchar4 *dst, - __global uchar4 *img1, - __global uchar4 *img2, - __global float4 *weight1, - __global float4 *weight2, - int rows, - int cols, - int istep, - int wstep - ) -{ - int idx = get_global_id(0); - int idy = get_global_id(1); - if (idx << 2 < cols && idy < rows) - { - int pos = mad24(idy,istep >> 2,idx); - int wpos = mad24(idy,wstep >> 2,idx); - float4 w1 = weight1[wpos], w2 = weight2[wpos]; - dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 + - convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f)); - } -} -__kernel void BlendLinear_C4_D0( - __global uchar4 *dst, - __global uchar4 *img1, - __global uchar4 *img2, - __global float *weight1, - __global float *weight2, - int rows, - int cols, - int istep, - int wstep - ) -{ - int idx = get_global_id(0); - int idy = get_global_id(1); - if (idx < cols && idy < rows) - { - int pos = mad24(idy,istep >> 2,idx); - int wpos = mad24(idy,wstep, idx); - float w1 = weight1[wpos]; - float w2 = weight2[wpos]; - dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 + - convert_float4(img2[pos]) * w2) / (w1 + w2 
+ 1e-5f)); - } -} +#if defined (DOUBLE_SUPPORT) +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (cl_khr_fp64) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif +#endif - -__kernel void BlendLinear_C1_D5( - __global float4 *dst, - __global float4 *img1, - __global float4 *img2, - __global float4 *weight1, - __global float4 *weight2, - int rows, - int cols, - int istep, - int wstep - ) +__kernel void blendLinear(__global const T * src1, int src1_offset, int src1_step, + __global const T * src2, int src2_offset, int src2_step, + __global const float * weight1, int weight1_offset, int weight1_step, + __global const float * weight2, int weight2_offset, int weight2_step, + __global T * dst, int dst_offset, int dst_step, + int rows, int cols) { - int idx = get_global_id(0); - int idy = get_global_id(1); - if (idx << 2 < cols && idy < rows) - { - int pos = mad24(idy,istep >> 2,idx); - int wpos = mad24(idy,wstep >> 2,idx); - float4 w1 = weight1[wpos], w2 = weight2[wpos]; - dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f); - } -} + int x = get_global_id(0); + int y = get_global_id(1); -__kernel void BlendLinear_C4_D5( - __global float4 *dst, - __global float4 *img1, - __global float4 *img2, - __global float *weight1, - __global float *weight2, - int rows, - int cols, - int istep, - int wstep - ) -{ - int idx = get_global_id(0); - int idy = get_global_id(1); - if (idx < cols && idy < rows) + if (x < cols && y < rows) { - int pos = mad24(idy,istep >> 2,idx); - int wpos = mad24(idy,wstep, idx); - float w1 = weight1[wpos]; - float w2 = weight2[wpos]; - dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f); + int src1_index = mad24(y, src1_step, src1_offset + x); + int src2_index = mad24(y, src2_step, src2_offset + x); + int weight1_index = mad24(y, weight1_step, weight1_offset + x); + int weight2_index = mad24(y, weight2_step, weight2_offset + x); + int dst_index = mad24(y, dst_step, dst_offset + x); + + FT w1 = 
(FT)(weight1[weight1_index]), w2 = (FT)(weight2[weight2_index]); + FT den = w1 + w2 + (FT)(1e-5f); + FT num = w1 * convertToFT(src1[src1_index]) + w2 * convertToFT(src2[src2_index]); + + dst[dst_index] = convertToT(num / den); } } diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl index d6a89f2057..ce0d86e8a4 100644 --- a/modules/ocl/src/opencl/brute_force_match.cl +++ b/modules/ocl/src/opencl/brute_force_match.cl @@ -17,6 +17,7 @@ // @Authors // Nathan, liujun@multicorewareinc.com // Peng Xiao, pengxiao@outlook.com +// Baichuan Su, baichuan@multicorewareinc.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -63,7 +64,7 @@ #endif //http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel -int bit1Count(int v) +static int bit1Count(int v) { v = v - ((v >> 1) & 0x55555555); // reuse input as temporary v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp @@ -94,7 +95,7 @@ typedef int result_type; #define DIST_RES(x) (x) #endif -result_type reduce_block( +static result_type reduce_block( __local value_type *s_query, __local value_type *s_train, int lidx, @@ -112,7 +113,25 @@ result_type reduce_block( return DIST_RES(result); } -result_type reduce_multi_block( +static result_type reduce_block_match( + __local value_type *s_query, + __local value_type *s_train, + int lidx, + int lidy + ) +{ + result_type result = 0; + #pragma unroll + for (int j = 0 ; j < BLOCK_SIZE ; j++) + { + result += DIST( + s_query[lidy * BLOCK_SIZE + j], + s_train[j * BLOCK_SIZE + lidx]); + } + return (result); +} + +static result_type reduce_multi_block( __local value_type *s_query, __local value_type *s_train, int block_index, @@ -128,7 +147,7 @@ result_type reduce_multi_block( s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]); } - return DIST_RES(result); + return result; } /* 
2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE @@ -168,7 +187,6 @@ __kernel void BruteForceMatch_UnrollMatch( int myBestTrainIdx = -1; // loopUnrolledCached to find the best trainIdx and best distance. - volatile int imgIdx = 0; for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++) { result_type result = 0; @@ -187,11 +205,12 @@ __kernel void BruteForceMatch_UnrollMatch( barrier(CLK_LOCAL_MEM_FENCE); } + result = DIST_RES(result); + int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) { - //bestImgIdx = imgIdx; myBestDistance = result; myBestTrainIdx = trainIdx; } @@ -272,16 +291,17 @@ __kernel void BruteForceMatch_Match( barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, lidx, lidy); + result += reduce_block_match(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } + result = DIST_RES(result); + const int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) { - //myBestImgidx = imgIdx; myBestDistance = result; myBestTrainIdx = trainIdx; } @@ -367,11 +387,10 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/) { - unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); + int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); if(ind < bestTrainIdx_cols) { - //bestImgIdx = imgIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; } @@ -428,11 +447,10 @@ __kernel void BruteForceMatch_RadiusMatch( if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/) { - unsigned int ind 
= atom_inc(nMatches + queryIdx); + int ind = atom_inc(nMatches + queryIdx); if(ind < bestTrainIdx_cols) { - //bestImgIdx = imgIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; } @@ -475,7 +493,6 @@ __kernel void BruteForceMatch_knnUnrollMatch( int myBestTrainIdx2 = -1; //loopUnrolledCached - volatile int imgIdx = 0; for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++) { result_type result = 0; @@ -493,6 +510,8 @@ __kernel void BruteForceMatch_knnUnrollMatch( barrier(CLK_LOCAL_MEM_FENCE); } + result = DIST_RES(result); + const int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows) @@ -631,11 +650,13 @@ __kernel void BruteForceMatch_knnMatch( barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, lidx, lidy); + result += reduce_block_match(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } + result = DIST_RES(result); + const int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/) diff --git a/modules/ocl/src/opencl/cvt_color.cl b/modules/ocl/src/opencl/cvt_color.cl index fcbf67ca7a..01286f7ad7 100644 --- a/modules/ocl/src/opencl/cvt_color.cl +++ b/modules/ocl/src/opencl/cvt_color.cl @@ -50,8 +50,6 @@ #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif -#define DATA_TYPE UNDEFINED - #if defined (DEPTH_0) #define DATA_TYPE uchar #define MAX_NUM 255 @@ -73,6 +71,10 @@ #define SAT_CAST(num) (num) #endif +#ifndef DATA_TYPE + #define DATA_TYPE UNDEFINED +#endif + #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) enum diff --git a/modules/ocl/src/opencl/filter_sep_col.cl b/modules/ocl/src/opencl/filter_sep_col.cl index 8dd77d5a97..0d1998ce9d 100644 --- a/modules/ocl/src/opencl/filter_sep_col.cl +++ b/modules/ocl/src/opencl/filter_sep_col.cl @@ -16,7 +16,7 @@ // // * Redistribution's in binary form must reproduce the above copyright 
notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --git a/modules/ocl/src/opencl/filtering_boxFilter.cl b/modules/ocl/src/opencl/filtering_boxFilter.cl index 030c13cc57..7f7fd018d7 100644 --- a/modules/ocl/src/opencl/filtering_boxFilter.cl +++ b/modules/ocl/src/opencl/filtering_boxFilter.cl @@ -10,13 +10,9 @@ // License Agreement // For Open Source Computer Vision Library // -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. // -// @Authors -// Zhang Ying, zhangying913@gmail.com -// // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // @@ -79,400 +75,298 @@ #define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) #endif -#define THREADS 256 -#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? 
(elem1) : (elem2) - -inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp, - int dst_rows, int dst_cols, - int dst_startX, int dst_x_off, - float alpha) -{ - if(get_local_id(0) < anX || get_local_id(0) >= (THREADS-ksX+anX+1)) - { - return; +#ifdef EXTRA_EXTRAPOLATION // border > src image size +#ifdef BORDER_CONSTANT +// None +#elif defined BORDER_REPLICATE +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \ + { \ + x = max(min(x, maxX - 1), minX); \ + y = max(min(y, maxY - 1), minY); \ } - - uint4 tmp_sum = 0; - int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4; - int posY = (get_group_id(1) << 1); - - for(int i=-anX; i<=anX; i++) - { - tmp_sum += vload4(get_local_id(0), temp+i); +#elif defined BORDER_WRAP +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \ + { \ + if (x < minX) \ + x -= ((x - maxX + 1) / maxX) * maxX; \ + if (x >= maxX) \ + x %= maxX; \ + if (y < minY) \ + y -= ((y - maxY + 1) / maxY) * maxY; \ + if (y >= maxY) \ + y %= maxY; \ } - - if(posY < dst_rows && posX < dst_cols) - { - tmp_sum /= (uint4) alpha; - if(posX >= 0 && posX < dst_cols) - *(dst) = tmp_sum.x; - if(posX+1 >= 0 && posX+1 < dst_cols) - *(dst + 1) = tmp_sum.y; - if(posX+2 >= 0 && posX+2 < dst_cols) - *(dst + 2) = tmp_sum.z; - if(posX+3 >= 0 && posX+3 < dst_cols) - *(dst + 3) = tmp_sum.w; +#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) +#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \ + { \ + if (maxX - minX == 1) \ + x = minX; \ + else \ + do \ + { \ + if (x < minX) \ + x = -(x - minX) - 1 + delta; \ + else \ + x = maxX - 1 - (x - maxX) - delta; \ + } \ + while (x >= maxX || x < minX); \ + \ + if (maxY - minY == 1) \ + y = minY; \ + else \ + do \ + { \ + if (y < minY) \ + y = -(y - minY) - 1 + delta; \ + else \ + y = maxY - 1 - (y - maxY) - delta; \ + } \ + while (y >= maxY || y < minY); \ } -} +#ifdef BORDER_REFLECT +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0) +#elif 
defined(BORDER_REFLECT_101) +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1) +#endif +#else +#error No extrapolation method +#endif +#else +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \ + { \ + int _row = y - minY, _col = x - minX; \ + _row = ADDR_H(_row, 0, maxY - minY); \ + _row = ADDR_B(_row, maxY - minY, _row); \ + y = _row + minY; \ + \ + _col = ADDR_L(_col, 0, maxX - minX); \ + _col = ADDR_R(_col, maxX - minX, _col); \ + x = _col + minX; \ + } +#endif +#if USE_DOUBLE +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#define FPTYPE double +#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE) +#else +#define FPTYPE float +#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE) +#endif -inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp, - int dst_rows, int dst_cols, - int dst_startX, int dst_x_off, - float alpha) -{ - if(get_local_id(0) >= (THREADS-ksX+1)) - { - return; - } +#if DATA_DEPTH == 0 +#define BASE_TYPE uchar +#elif DATA_DEPTH == 1 +#define BASE_TYPE char +#elif DATA_DEPTH == 2 +#define BASE_TYPE ushort +#elif DATA_DEPTH == 3 +#define BASE_TYPE short +#elif DATA_DEPTH == 4 +#define BASE_TYPE int +#elif DATA_DEPTH == 5 +#define BASE_TYPE float +#elif DATA_DEPTH == 6 +#define BASE_TYPE double +#else +#error data_depth +#endif - int posX = dst_startX - dst_x_off + get_local_id(0); - int posY = (get_group_id(1) << 1); +#define __CAT(x, y) x##y +#define CAT(x, y) __CAT(x, y) + +#define uchar1 uchar +#define char1 char +#define ushort1 ushort +#define short1 short +#define int1 int +#define float1 float +#define double1 double + +#define convert_uchar1_sat_rte convert_uchar_sat_rte +#define convert_char1_sat_rte convert_char_sat_rte +#define convert_ushort1_sat_rte convert_ushort_sat_rte +#define convert_short1_sat_rte convert_short_sat_rte +#define convert_int1_sat_rte convert_int_sat_rte +#define convert_float1 +#define convert_double1 + +#if DATA_DEPTH == 5 || DATA_DEPTH == 6 
+#define CONVERT_TO_TYPE CAT(CAT(convert_, BASE_TYPE), VEC_SIZE) +#else +#define CONVERT_TO_TYPE CAT(CAT(CAT(convert_, BASE_TYPE), VEC_SIZE), _sat_rte) +#endif - uint4 temp_sum = 0; - for(int i=-anX; i<=anX; i++) - { - temp_sum += temp[get_local_id(0) + anX + i]; - } +#define VEC_SIZE DATA_CHAN - if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows) - *dst = convert_uchar4(convert_float4(temp_sum)/alpha); -} +#define VEC_TYPE CAT(BASE_TYPE, VEC_SIZE) +#define TYPE VEC_TYPE -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) -{ +#define SCALAR_TYPE CAT(FPTYPE, VEC_SIZE) - int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - int src_x_off = src_offset % src_step; - int src_y_off = src_offset / src_step; - int dst_x_off = dst_offset % dst_step; - int dst_y_off = dst_offset / dst_step; +#define INTERMEDIATE_TYPE CAT(FPTYPE, VEC_SIZE) + +struct RectCoords +{ + int x1, y1, x2, y2; +}; - int head_off = dst_x_off%4; - int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off; - int startY = (gY << 1) - anY + src_y_off; - int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off; - int dst_startY = (gY << 1) + dst_y_off; +//#define DEBUG +#ifdef DEBUG +#define DEBUG_ONLY(x) x +#define ASSERT(condition) do { if (!(condition)) { printf("BUG in boxFilter kernel (global=%d,%d): " #condition "\n", get_global_id(0), get_global_id(1)); } } while (0) +#else +#define DEBUG_ONLY(x) +#define ASSERT(condition) +#endif - uint4 
data[ksY+1]; - __local uint4 temp[2][THREADS]; +inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global TYPE *src, const unsigned int srcStepBytes, const struct RectCoords srcCoords #ifdef BORDER_CONSTANT - - for(int i=0; i < ksY+1; i++) + , SCALAR_TYPE borderValue +#endif + ) +{ +#ifdef BORDER_ISOLATED + if(pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2) +#else + if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2) +#endif { - if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3src_whole_cols-1) - | (startY+i<0) | (startY+i>src_whole_rows-1); - if(not_all_in_range) - { - int selected_row; - int4 selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - - selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols); - selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x); - - selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols); - selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y); +#ifdef BORDER_CONSTANT + return borderValue; +#else + int selected_col = pos.x; + int selected_row = pos.y; - selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols); - selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z); + EXTRAPOLATE(selected_col, selected_row, +#ifdef BORDER_ISOLATED + srcCoords.x1, srcCoords.y1, +#else + 0, 0, +#endif + srcCoords.x2, srcCoords.y2 + ); - selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols); - selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w); + // debug border mapping + //printf("pos=%d,%d --> %d, %d\n", pos.x, 
pos.y, selected_col, selected_row); - data[i].x = *(src + selected_row * src_step + selected_col.x); - data[i].y = *(src + selected_row * src_step + selected_col.y); - data[i].z = *(src + selected_row * src_step + selected_col.z); - data[i].w = *(src + selected_row * src_step + selected_col.w); + pos = (int2)(selected_col, selected_row); + if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2) + { + __global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes); + return CONVERT_TO_FPTYPE(*ptr); } else { - data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX))); + // for debug only + DEBUG_ONLY(printf("BUG in boxFilter kernel\n")); + return (FPTYPE)(0.0f); } - } #endif - uint4 tmp_sum = 0; - for(int i=1; i < ksY; i++) - { - tmp_sum += (data[i]); } - - int index = dst_startY * dst_step + dst_startX + (col-anX)*4; - - temp[0][col] = tmp_sum + (data[0]); - temp[1][col] = tmp_sum + (data[ksY]); - barrier(CLK_LOCAL_MEM_FENCE); - update_dst_C1_D0(dst+index, (__local uint *)(temp[0]), - dst_rows, dst_cols, dst_startX, dst_x_off, alpha); - update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]), - dst_rows, dst_cols, dst_startX, dst_x_off, alpha); - } -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////8uC4//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) -{ - int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - - int src_x_off = (src_offset % src_step) >> 2; - int src_y_off = src_offset / 
src_step; - int dst_x_off = (dst_offset % dst_step) >> 2; - int dst_y_off = dst_offset / dst_step; - - int startX = gX * (THREADS-ksX+1) - anX + src_x_off; - int startY = (gY << 1) - anY + src_y_off; - int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; - int dst_startY = (gY << 1) + dst_y_off; - - uint4 data[ksY+1]; - __local uint4 temp[2][THREADS]; +// INPUT PARAMETER: BLOCK_SIZE_Y (via defines) +__kernel +__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) +void boxFilter(__global TYPE *src, const unsigned int srcStepBytes, const int4 srcRC, + __global TYPE *dst, const unsigned int dstStepBytes, const int4 dstRC, #ifdef BORDER_CONSTANT - bool con; - for(int i=0; i < ksY+1; i++) - { - con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - int cur_col = clamp(startX + col, 0, src_whole_cols); + SCALAR_TYPE borderValue, +#endif + FPTYPE alpha + ) +{ + const struct RectCoords srcCoords = {srcRC.s0, srcRC.s1, srcRC.s2, srcRC.s3}; // for non-isolated border: offsetX, offsetY, wholeX, wholeY + const struct RectCoords dstCoords = {dstRC.s0, dstRC.s1, dstRC.s2, dstRC.s3}; - data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0; - data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0; - data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0; - data[i].w = con ? 
src[(startY+i)*(src_step>>2) + cur_col].w : 0; - } -#else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + const int x = get_local_id(0) + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X; + const int y = get_global_id(1) * BLOCK_SIZE_Y; - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); + const int local_id = get_local_id(0); + INTERMEDIATE_TYPE data[KERNEL_SIZE_Y]; + __local INTERMEDIATE_TYPE sumOfCols[LOCAL_SIZE]; - data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]); + int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y); + for(int sy = 0; sy < KERNEL_SIZE_Y; sy++, srcPos.y++) + { + data[sy] = readSrcPixel(srcPos, src, srcStepBytes, srcCoords +#ifdef BORDER_CONSTANT + , borderValue +#endif + ); } -#endif - uint4 tmp_sum = 0; - for(int i=1; i < ksY; i++) + INTERMEDIATE_TYPE tmp_sum = 0; + for(int sy = 0; sy < KERNEL_SIZE_Y; sy++) { - tmp_sum += (data[i]); + tmp_sum += (data[sy]); } - int index = dst_startY * (dst_step>>2)+ dst_startX + col; - - temp[0][col] = tmp_sum + (data[0]); - temp[1][col] = tmp_sum + (data[ksY]); + sumOfCols[local_id] = tmp_sum; barrier(CLK_LOCAL_MEM_FENCE); - update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]), - dst_rows, dst_cols, dst_startX, dst_x_off, alpha); - update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]), - dst_rows, dst_cols, dst_startX, dst_x_off, alpha); -} + int2 pos = (int2)(dstCoords.x1 + x, dstCoords.y1 + y); + __global TYPE* dstPtr = (__global TYPE*)((__global char*)dst + pos.x * sizeof(TYPE) + pos.y * dstStepBytes); // Pointer can be out of bounds! 
-/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////32fC1//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) -{ - int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - - int src_x_off = (src_offset % src_step) >> 2; - int src_y_off = src_offset / src_step; - int dst_x_off = (dst_offset % dst_step) >> 2; - int dst_y_off = dst_offset / dst_step; - - int startX = gX * (THREADS-ksX+1) - anX + src_x_off; - int startY = (gY << 1) - anY + src_y_off; - int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; - int dst_startY = (gY << 1) + dst_y_off; - float data[ksY+1]; - __local float temp[2][THREADS]; -#ifdef BORDER_CONSTANT - bool con; - float ss; - for(int i=0; i < ksY+1; i++) + int sy_index = 0; // current index in data[] array + int stepsY = min(dstCoords.y2 - pos.y, BLOCK_SIZE_Y); + ASSERT(stepsY > 0); + for (; ;) { - con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - - int cur_col = clamp(startX + col, 0, src_whole_cols); - ss = (startY+i)=0&&cur_col>=0&&cur_col>2) + cur_col]:(float)0; - - data[i] = con ? 
ss : 0.f; - } -#else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); - - data[i] = src[selected_row * (src_step>>2) + selected_col]; - } + ASSERT(pos.y < dstCoords.y2); -#endif - float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; - for(int i=1; i < ksY; i++) - { - sum0 += (data[i]); - } - sum1 = sum0 + (data[0]); - sum2 = sum0 + (data[ksY]); - temp[0][col] = sum1; - temp[1][col] = sum2; - barrier(CLK_LOCAL_MEM_FENCE); - if(col < (THREADS-(ksX-1))) - { - col += anX; - int posX = dst_startX - dst_x_off + col - anX; - int posY = (gY << 1); + if(local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) && + pos.x >= dstCoords.x1 && pos.x < dstCoords.x2) + { + ASSERT(pos.y >= dstCoords.y1 && pos.y < dstCoords.y2); - float tmp_sum[2]= {0.0, 0.0}; - for(int k=0; k<2; k++) - for(int i=-anX; i<=anX; i++) + INTERMEDIATE_TYPE total_sum = 0; +#pragma unroll + for (int sx = 0; sx < KERNEL_SIZE_X; sx++) { - tmp_sum[k] += temp[k][col+i]; + total_sum += sumOfCols[local_id + sx - ANCHOR_X]; } - for(int i=0; i<2; i++) - { - if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows) - dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha; + *dstPtr = CONVERT_TO_TYPE(((INTERMEDIATE_TYPE)alpha) * total_sum); } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////32fC4//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha, - int src_offset, int src_whole_rows, int 
src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) -{ - int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - - int src_x_off = (src_offset % src_step) >> 4; - int src_y_off = src_offset / src_step; - int dst_x_off = (dst_offset % dst_step) >> 4; - int dst_y_off = dst_offset / dst_step; - - int startX = gX * (THREADS-ksX+1) - anX + src_x_off; - int startY = (gY << 1) - anY + src_y_off; - int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; - int dst_startY = (gY << 1) + dst_y_off; - float4 data[ksY+1]; - __local float4 temp[2][THREADS]; -#ifdef BORDER_CONSTANT - bool con; - float4 ss; - for(int i=0; i < ksY+1; i++) - { - con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - - int cur_col = clamp(startX + col, 0, src_whole_cols); - ss = (startY+i)=0&&cur_col>=0&&cur_col>4) + cur_col]:(float4)0; - - data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0); - } +#if BLOCK_SIZE_Y == 1 + break; #else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + if (--stepsY == 0) + break; - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); + barrier(CLK_LOCAL_MEM_FENCE); - data[i] = src[selected_row * (src_step>>4) + selected_col]; - } + tmp_sum = sumOfCols[local_id]; // TODO FIX IT: workaround for BUG in OpenCL compiler + // only works with scalars: ASSERT(fabs(tmp_sum - sumOfCols[local_id]) < (INTERMEDIATE_TYPE)1e-6); + tmp_sum -= data[sy_index]; + data[sy_index] = readSrcPixel(srcPos, src, srcStepBytes, srcCoords +#ifdef BORDER_CONSTANT + , borderValue #endif - float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; - for(int i=1; i < ksY; i++) - { - sum0 += (data[i]); - } - sum1 = sum0 + (data[0]); - sum2 = sum0 + (data[ksY]); - temp[0][col] = sum1; - 
temp[1][col] = sum2; - barrier(CLK_LOCAL_MEM_FENCE); - if(col < (THREADS-(ksX-1))) - { - col += anX; - int posX = dst_startX - dst_x_off + col - anX; - int posY = (gY << 1); + ); + srcPos.y++; - float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)}; - for(int k=0; k<2; k++) - for(int i=-anX; i<=anX; i++) - { - tmp_sum[k] += temp[k][col+i]; - } - for(int i=0; i<2; i++) - { - if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows) - dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha; - } + tmp_sum += data[sy_index]; + sumOfCols[local_id] = tmp_sum; + + sy_index = (sy_index + 1 < KERNEL_SIZE_Y) ? sy_index + 1 : 0; + + barrier(CLK_LOCAL_MEM_FENCE); + // next line + DEBUG_ONLY(pos.y++); + dstPtr = (__global TYPE*)((__global char*)dstPtr + dstStepBytes); // Pointer can be out of bounds! +#endif // BLOCK_SIZE_Y == 1 } } diff --git a/modules/ocl/src/opencl/filtering_filter2D.cl b/modules/ocl/src/opencl/filtering_filter2D.cl new file mode 100644 index 0000000000..f966766895 --- /dev/null +++ b/modules/ocl/src/opencl/filtering_filter2D.cl @@ -0,0 +1,370 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifdef BORDER_REPLICATE +//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) +#endif + +#ifdef BORDER_REFLECT +//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? 
-(i)-1 : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) +#endif + +#ifdef BORDER_REFLECT_101 +//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) +#endif + +//blur function does not support BORDER_WRAP +#ifdef BORDER_WRAP +//BORDER_WRAP: cdefgh|abcdefgh|abcdefg +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? 
(i)-(b_edge) : (addr)) +#endif + +#ifdef EXTRA_EXTRAPOLATION // border > src image size +#ifdef BORDER_CONSTANT +// None +#elif defined BORDER_REPLICATE +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \ + { \ + x = max(min(x, maxX - 1), minX); \ + y = max(min(y, maxY - 1), minY); \ + } +#elif defined BORDER_WRAP +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \ + { \ + if (x < minX) \ + x -= ((x - maxX + 1) / maxX) * maxX; \ + if (x >= maxX) \ + x %= maxX; \ + if (y < minY) \ + y -= ((y - maxY + 1) / maxY) * maxY; \ + if (y >= maxY) \ + y %= maxY; \ + } +#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) +#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \ + { \ + if (maxX - minX == 1) \ + x = minX; \ + else \ + do \ + { \ + if (x < minX) \ + x = -(x - minX) - 1 + delta; \ + else \ + x = maxX - 1 - (x - maxX) - delta; \ + } \ + while (x >= maxX || x < minX); \ + \ + if (maxY - minY == 1) \ + y = minY; \ + else \ + do \ + { \ + if (y < minY) \ + y = -(y - minY) - 1 + delta; \ + else \ + y = maxY - 1 - (y - maxY) - delta; \ + } \ + while (y >= maxY || y < minY); \ + } +#ifdef BORDER_REFLECT +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0) +#elif defined(BORDER_REFLECT_101) +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1) +#endif +#else +#error No extrapolation method +#endif +#else +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \ + { \ + int _row = y - minY, _col = x - minX; \ + _row = ADDR_H(_row, 0, maxY - minY); \ + _row = ADDR_B(_row, maxY - minY, _row); \ + y = _row + minY; \ + \ + _col = ADDR_L(_col, 0, maxX - minX); \ + _col = ADDR_R(_col, maxX - minX, _col); \ + x = _col + minX; \ + } +#endif + +#if USE_DOUBLE +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#define FPTYPE double +#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE) +#else +#define FPTYPE float +#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE) +#endif + +#if 
DATA_DEPTH == 0 +#define BASE_TYPE uchar +#elif DATA_DEPTH == 1 +#define BASE_TYPE char +#elif DATA_DEPTH == 2 +#define BASE_TYPE ushort +#elif DATA_DEPTH == 3 +#define BASE_TYPE short +#elif DATA_DEPTH == 4 +#define BASE_TYPE int +#elif DATA_DEPTH == 5 +#define BASE_TYPE float +#elif DATA_DEPTH == 6 +#define BASE_TYPE double +#else +#error data_depth +#endif + +#define __CAT(x, y) x##y +#define CAT(x, y) __CAT(x, y) + +#define uchar1 uchar +#define char1 char +#define ushort1 ushort +#define short1 short +#define int1 int +#define float1 float +#define double1 double + +#define convert_uchar1_sat_rte convert_uchar_sat_rte +#define convert_char1_sat_rte convert_char_sat_rte +#define convert_ushort1_sat_rte convert_ushort_sat_rte +#define convert_short1_sat_rte convert_short_sat_rte +#define convert_int1_sat_rte convert_int_sat_rte +#define convert_float1 +#define convert_double1 + +#if DATA_DEPTH == 5 || DATA_DEPTH == 6 +#define CONVERT_TO_TYPE CAT(CAT(convert_, BASE_TYPE), VEC_SIZE) +#else +#define CONVERT_TO_TYPE CAT(CAT(CAT(convert_, BASE_TYPE), VEC_SIZE), _sat_rte) +#endif + +#define VEC_SIZE DATA_CHAN + +#define VEC_TYPE CAT(BASE_TYPE, VEC_SIZE) +#define TYPE VEC_TYPE + +#define SCALAR_TYPE CAT(FPTYPE, VEC_SIZE) + +#define INTERMEDIATE_TYPE CAT(FPTYPE, VEC_SIZE) + +struct RectCoords +{ + int x1, y1, x2, y2; +}; + +//#define DEBUG +#ifdef DEBUG +#define DEBUG_ONLY(x) x +#define ASSERT(condition) do { if (!(condition)) { printf("BUG in boxFilter kernel (global=%d,%d): " #condition "\n", get_global_id(0), get_global_id(1)); } } while (0) +#else +#define DEBUG_ONLY(x) (void)0 +#define ASSERT(condition) (void)0 +#endif + + +inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global TYPE *src, const unsigned int srcStepBytes, const struct RectCoords srcCoords +#ifdef BORDER_CONSTANT + , SCALAR_TYPE borderValue +#endif + ) +{ +#ifdef BORDER_ISOLATED + if(pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2) +#else + if(pos.x >= 
0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2) +#endif + { + __global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes); + return CONVERT_TO_FPTYPE(*ptr); + } + else + { +#ifdef BORDER_CONSTANT + return borderValue; +#else + int selected_col = pos.x; + int selected_row = pos.y; + + EXTRAPOLATE(selected_col, selected_row, +#ifdef BORDER_ISOLATED + srcCoords.x1, srcCoords.y1, +#else + 0, 0, +#endif + srcCoords.x2, srcCoords.y2 + ); + + // debug border mapping + //printf("pos=%d,%d --> %d, %d\n", pos.x, pos.y, selected_col, selected_row); + + pos = (int2)(selected_col, selected_row); + if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2) + { + __global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes); + return CONVERT_TO_FPTYPE(*ptr); + } + else + { + // for debug only + DEBUG_ONLY(printf("BUG in boxFilter kernel\n")); + return (FPTYPE)(0.0f); + } +#endif + } +} + +// INPUT PARAMETER: BLOCK_SIZE_Y (via defines) + +__kernel +__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) +void filter2D(__global TYPE *src, const unsigned int srcStepBytes, const int4 srcRC, + __global TYPE *dst, const unsigned int dstStepBytes, const int4 dstRC, +#ifdef BORDER_CONSTANT + SCALAR_TYPE borderValue, +#endif + __constant FPTYPE* kernelData // transposed: [KERNEL_SIZE_X][KERNEL_SIZE_Y2_ALIGNED] + ) +{ + const struct RectCoords srcCoords = {srcRC.s0, srcRC.s1, srcRC.s2, srcRC.s3}; // for non-isolated border: offsetX, offsetY, wholeX, wholeY + struct RectCoords dstCoords = {dstRC.s0, dstRC.s1, dstRC.s2, dstRC.s3}; + + const int local_id = get_local_id(0); + const int x = local_id + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X; + const int y = get_global_id(1) * BLOCK_SIZE_Y; + + INTERMEDIATE_TYPE data[KERNEL_SIZE_Y]; + __local INTERMEDIATE_TYPE sumOfCols[LOCAL_SIZE]; + + int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - 
ANCHOR_Y); + + int2 pos = (int2)(dstCoords.x1 + x, dstCoords.y1 + y); + __global TYPE* dstPtr = (__global TYPE*)((__global char*)dst + pos.x * sizeof(TYPE) + pos.y * dstStepBytes); // Pointer can be out of bounds! + bool writeResult = (local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) && + pos.x >= dstCoords.x1 && pos.x < dstCoords.x2); + +#if BLOCK_SIZE_Y > 1 + bool readAllpixels = true; + int sy_index = 0; // current index in data[] array + + dstCoords.y2 = min(dstCoords.y2, pos.y + BLOCK_SIZE_Y); + for (; + pos.y < dstCoords.y2; + pos.y++, + dstPtr = (__global TYPE*)((__global char*)dstPtr + dstStepBytes)) +#endif + { + ASSERT(pos.y < dstCoords.y2); + + for ( +#if BLOCK_SIZE_Y > 1 + int sy = readAllpixels ? 0 : -1; sy < (readAllpixels ? KERNEL_SIZE_Y : 0); +#else + int sy = 0, sy_index = 0; sy < KERNEL_SIZE_Y; +#endif + sy++, srcPos.y++) + { + data[sy + sy_index] = readSrcPixel(srcPos, src, srcStepBytes, srcCoords +#ifdef BORDER_CONSTANT + , borderValue +#endif + ); + } + + INTERMEDIATE_TYPE total_sum = 0; + for (int sx = 0; sx < KERNEL_SIZE_X; sx++) + { + { + __constant FPTYPE* k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * sx +#if BLOCK_SIZE_Y > 1 + + KERNEL_SIZE_Y - sy_index +#endif + ]; + INTERMEDIATE_TYPE tmp_sum = 0; + for (int sy = 0; sy < KERNEL_SIZE_Y; sy++) + { + tmp_sum += data[sy] * k[sy]; + } + + sumOfCols[local_id] = tmp_sum; + barrier(CLK_LOCAL_MEM_FENCE); + } + + int id = local_id + sx - ANCHOR_X; + if (id >= 0 && id < LOCAL_SIZE) + total_sum += sumOfCols[id]; + + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (writeResult) + { + ASSERT(pos.y >= dstCoords.y1 && pos.y < dstCoords.y2); + *dstPtr = CONVERT_TO_TYPE(total_sum); + } + +#if BLOCK_SIZE_Y > 1 + readAllpixels = false; +#if BLOCK_SIZE_Y > KERNEL_SIZE_Y + sy_index = (sy_index + 1 <= KERNEL_SIZE_Y) ? 
sy_index + 1 : 1; +#else + sy_index++; +#endif +#endif // BLOCK_SIZE_Y == 1 + } +} diff --git a/modules/ocl/src/opencl/filtering_laplacian.cl b/modules/ocl/src/opencl/filtering_laplacian.cl deleted file mode 100644 index ea22967dff..0000000000 --- a/modules/ocl/src/opencl/filtering_laplacian.cl +++ /dev/null @@ -1,381 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Pang Erping, erping@multicorewareinc.com -// Jia Haipeng, jiahaipeng95@gmail.com -// Peng Xiao, pengxiao@outlook.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. 
-// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////Macro for border type//////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////// -#ifdef BORDER_REPLICATE - -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) -#endif - -#ifdef BORDER_REFLECT -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) -#endif - -#ifdef BORDER_REFLECT_101 -//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? 
((l_edge)<<1)-(i) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) -#endif - -#ifdef IMG_C_1_0 -#define T_IMG uchar -#define T_IMGx4 uchar4 -#define T_IMG_C1 uchar -#define CONVERT_TYPE convert_uchar_sat -#define CONVERT_TYPEx4 convert_uchar4_sat -#endif -#ifdef IMG_C_4_0 -#define T_IMG uchar4 -#define T_IMGx4 uchar16 -#define T_IMG_C1 uchar -#define CONVERT_TYPE convert_uchar4_sat -#define CONVERT_TYPEx4 convert_uchar16_sat -#endif -#ifdef IMG_C_1_5 -#define T_IMG float -#define T_IMGx4 float4 -#define T_IMG_C1 float -#define CONVERT_TYPE convert_float -#define CONVERT_TYPEx4 convert_float4 -#endif -#ifdef IMG_C_4_5 -#define T_IMG float4 -#define T_IMGx4 float16 -#define T_IMG_C1 float -#define CONVERT_TYPE convert_float4 -#define CONVERT_TYPEx4 convert_float16 -#endif - -#ifndef CN -#define CN 1 -#endif - -#if CN == 1 -#define T_SUM float -#define T_SUMx4 float4 -#define CONVERT_TYPE_SUM convert_float -#define CONVERT_TYPE_SUMx4 convert_float4 -#define SUM_ZERO (0.0f) -#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f) -#define VLOAD4 vload4 -#define SX x -#define SY y -#define SZ z -#define SW w -#elif CN == 4 -#define T_SUM float4 -#define T_SUMx4 float16 -#define CONVERT_TYPE_SUM convert_float4 -#define CONVERT_TYPE_SUMx4 convert_float16 -#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f) -#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f) -#define VLOAD4 vload16 -#define SX s0123 -#define SY s4567 -#define SZ s89ab -#define SW scdef -#endif - -#ifndef FILTER_SIZE -#define FILTER_SIZE 3 -#endif - -#define LOCAL_GROUP_SIZE 16 - -#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE) -#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE) - -#define FILTER_RADIUS (FILTER_SIZE >> 1) - -__kernel void 
filter2D( - __global T_IMG *src, - __global T_IMG *dst, - int src_step, - int dst_step, - __constant float *mat_kernel, - __local T_IMG *local_data, - int wholerows, - int wholecols, - int src_offset_x, - int src_offset_y, - int dst_offset_x, - int dst_offset_y, - int cols, - int rows, - int operate_cols -) -{ - int groupStartCol = get_group_id(0) * get_local_size(0); - int groupStartRow = get_group_id(1) * get_local_size(1); - - int localCol = get_local_id(0); - int localRow = get_local_id(1); - int globalCol = groupStartCol + localCol; - int globalRow = groupStartRow + localRow; - const int src_offset = mad24(src_offset_y, src_step, src_offset_x); - const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x); - -#ifdef BORDER_CONSTANT - for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) - { - int curRow = groupStartRow + i; - for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) - { - int curCol = groupStartCol + j; - if(curRow < FILTER_RADIUS - src_offset_y || (curRow - FILTER_RADIUS) >= wholerows - src_offset_y|| - curCol < FILTER_RADIUS - src_offset_x || (curCol - FILTER_RADIUS) >= wholecols - src_offset_x) - { - local_data[(i) * LOCAL_WIDTH + j] = 0; - } - else - { - local_data[(i) * LOCAL_WIDTH + j] = src[(curRow - FILTER_RADIUS) * src_step + curCol - FILTER_RADIUS + src_offset]; - } - } - } -#else - for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) - { - int curRow = groupStartRow + i; - - curRow = ADDR_H(curRow, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y); - - curRow = ADDR_B(curRow - FILTER_RADIUS, wholerows - src_offset_y, curRow - FILTER_RADIUS); - - for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) - { - int curCol = groupStartCol + j; - curCol = ADDR_L(curCol, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x); - curCol = ADDR_R(curCol - FILTER_RADIUS, wholecols - src_offset_x, curCol - FILTER_RADIUS); - if(curRow < wholerows && curCol < wholecols) - { - local_data[(i) * 
LOCAL_WIDTH + j] = src[(curRow) * src_step + curCol + src_offset]; - } - } - } -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - if(globalRow < rows && globalCol < cols) - { - T_SUM sum = (T_SUM)(SUM_ZERO); - int filterIdx = 0; - for(int i = 0; i < FILTER_SIZE; i++) - { - int offset = (i + localRow) * LOCAL_WIDTH; - - for(int j = 0; j < FILTER_SIZE; j++) - { - sum += CONVERT_TYPE_SUM(local_data[offset + j + localCol]) * mat_kernel[filterIdx++]; - } - } - dst[(globalRow)*dst_step + (globalCol) + dst_offset] = CONVERT_TYPE(sum); - } -} - -/// following is specific for 3x3 kernels - -////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////Macro for define elements number per thread///////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// - -#define ANX 1 -#define ANY 1 - -#define ROWS_PER_GROUP 4 -#define ROWS_PER_GROUP_BITS 2 -#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2) - -#define THREADS_PER_ROW 64 -#define THREADS_PER_ROW_BIT 6 - -#define ELEMENTS_PER_THREAD 4 -#define ELEMENTS_PER_THREAD_BIT 2 - -#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4 - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// - -__kernel void filter2D_3x3( - __global T_IMG *src, - __global T_IMG *dst, - int src_step, - int dst_step, - __constant float *mat_kernel, - __local T_IMG *local_data, - int wholerows, - int wholecols, - int src_offset_x, - int src_offset_y, - int dst_offset_x, - int dst_offset_y, - int cols, - int rows, - int operate_cols -) -{ - int gX = get_global_id(0); - int gY = get_global_id(1); - - int lX = get_local_id(0); - - 
int groupX_size = get_local_size(0); - int groupX_id = get_group_id(0); - -#define dst_align (dst_offset_x & 3) - int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; - int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; - - if((gY << 2) < rows) - { - for(int i = 0; i < ROWS_FETCH; ++i) - { - if((rows_start_index - src_offset_y) + i < rows + ANY) - { -#ifdef BORDER_CONSTANT - int selected_row = rows_start_index + i; - int selected_cols = cols_start_index_group + lX; - - T_IMG data = src[mad24(selected_row, src_step, selected_cols)]; - int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols; - data = con ? data : (T_IMG)(0); - local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data; - - if(lX < (ANX << 1)) - { - selected_cols = cols_start_index_group + lX + groupX_size; - - data = src[mad24(selected_row, src_step, selected_cols)]; - con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols; - data = con ? 
data : (T_IMG)(0); - local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data; - } -#else - int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); - selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); - - int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols); - selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols); - - T_IMG data = src[mad24(selected_row, src_step, selected_cols)]; - - local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data; - - if(lX < (ANX << 1)) - { - selected_cols = cols_start_index_group + lX + groupX_size; - selected_cols = ADDR_R(selected_cols, wholecols, selected_cols); - - data = src[mad24(selected_row, src_step, selected_cols)]; - local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data; - } -#endif - } - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2); - if(((gY << 2) < rows) && (process_col < operate_cols)) - { - int dst_cols_start = dst_offset_x; - int dst_cols_end = dst_offset_x + cols; - int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc; - - int dst_rows_end = dst_offset_y + rows; - int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT); - dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index); - - T_IMGx4 dst_data = *(__global T_IMGx4 *)dst; - - T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4; - T_IMGx4 data; - - for(int i = 0; i < FILTER_SIZE; i++) - { -#pragma unroll - for(int j = 0; j < FILTER_SIZE; j++) - { - if(dst_rows_index < dst_rows_end) - { - int local_row = (lX >> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - - data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols)); - sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data)); - } - } - } - - if(dst_rows_index < dst_rows_end) - { - T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum); - tmp_dst.SX = 
((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? - tmp_dst.SX : dst_data.SX; - tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? - tmp_dst.SY : dst_data.SY; - tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? - tmp_dst.SZ : dst_data.SZ; - tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? - tmp_dst.SW : dst_data.SW; - *(__global T_IMGx4 *)dst = tmp_dst; - } - } -} diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index 5fa3533054..58ebb4c014 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -11,6 +11,7 @@ // Jia Haipeng, jiahaipeng95@gmail.com // Nathan, liujun@multicorewareinc.com // Peng Xiao, pengxiao@outlook.com +// Erping Pang, erping@multicorewareinc.com // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // @@ -37,7 +38,6 @@ // // -#pragma OPENCL EXTENSION cl_amd_printf : enable #define CV_HAAR_FEATURE_MAX 3 #define calc_sum(rect,offset) (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset]) @@ -101,6 +101,144 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade float inv_window_area __attribute__((aligned (4))); } GpuHidHaarClassifierCascade; + +#ifdef PACKED_CLASSIFIER +// this code is scalar, one pixel -> one workitem +__kernel void gpuRunHaarClassifierCascadePacked( + global const GpuHidHaarStageClassifier * stagecascadeptr, + global const int4 * info, + global const GpuHidHaarTreeNode * nodeptr, + global const int * restrict sum, + global const float * restrict sqsum, + volatile global int4 * candidate, + const int pixelstep, + const int loopcount, + const int start_stage, + const int split_stage, + const int end_stage, + 
const int startnode, + const int splitnode, + const int4 p, + const int4 pq, + const float correction, + global const int* pNodesPK, + global const int4* pWGInfo + ) + +{ +// this version used information provided for each workgroup +// no empty WG + int gid = (int)get_group_id(0); + int lid_x = (int)get_local_id(0); + int lid_y = (int)get_local_id(1); + int lid = lid_y*LSx+lid_x; + int4 WGInfo = pWGInfo[gid]; + int GroupX = (WGInfo.y >> 16)&0xFFFF; + int GroupY = (WGInfo.y >> 0 )& 0xFFFF; + int Width = (WGInfo.x >> 16)&0xFFFF; + int Height = (WGInfo.x >> 0 )& 0xFFFF; + int ImgOffset = WGInfo.z; + float ScaleFactor = as_float(WGInfo.w); + +#define DATA_SIZE_X (LSx+WND_SIZE_X) +#define DATA_SIZE_Y (LSy+WND_SIZE_Y) +#define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y) + + local int SumL[DATA_SIZE]; + + // read input data window into local mem + for(int i = 0; i=0.f) ? sqrt(variance_norm_factor) : 1.f; + }// end calc variance_norm_factor for all stages + + int result = (1.0f>0.0f); + for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ ) + {// iterate until candidate is exist + float stage_sum = 0.0f; + int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + float stagethreshold = as_float(stageinfo.y); + int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x); + for(int nodeloop = 0; nodeloop < stageinfo.x; nodecounter++,nodeloop++ ) + { + // simple macro to extract shorts from int +#define M0(_t) ((_t)&0xFFFF) +#define M1(_t) (((_t)>>16)&0xFFFF) + // load packed node data from global memory (L3) into registers + global const int4* pN = (__global int4*)(pNodesPK+nodecounter*NODE_SIZE); + int4 n0 = pN[0]; + int4 n1 = pN[1]; + int4 n2 = pN[2]; + float nodethreshold = as_float(n2.y) * variance_norm_factor; + // calc sum of intensity pixels according to node information + float classsum = + (SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) + + (SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - 
SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) + + (SumL[M0(n1.x)+lcl_off] - SumL[M1(n1.x)+lcl_off] - SumL[M0(n1.y)+lcl_off] + SumL[M1(n1.y)+lcl_off]) * as_float(n2.x); + //accumulate stage responce + stage_sum += (classsum >= nodethreshold) ? as_float(n2.w) : as_float(n2.z); + } + result = (stage_sum >= stagethreshold); + }// next stage if needed + + if(result) + {// all stages will be passed and there is a detected face on the tested position + int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info + if(index= stagethreshold); } - - if(result && (x < width) && (y < height)) + if(factor < 2) { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; - lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); + if(result && lclidx %2 ==0 && lclidy %2 ==0 ) + { + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; + lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor); + } + } + else + { + if(result) + { + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; + lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor); + } } barrier(CLK_LOCAL_MEM_FENCE); int queuecount = lclcount[0]; @@ -411,13 +560,30 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16)); temp = glboutindex[0]; int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(factor*20.f); - candidate_result.x = convert_int_rtn(x*factor); - candidate_result.y = convert_int_rtn(y*factor); + candidate_result.zw = (int2)convert_int_rte(factor*20.f); + candidate_result.x = convert_int_rte(x*factor); + candidate_result.y = convert_int_rte(y*factor); atomic_inc(glboutindex); - candidate[outputoff+temp+lcl_id] = candidate_result; + + int i = outputoff+temp+lcl_id; + if(candidate[i].z == 0) 
+ { + candidate[i] = candidate_result; + } + else + { + for(i=i+1;;i++) + { + if(candidate[i].z == 0) + { + candidate[i] = candidate_result; + break; + } + } + } } barrier(CLK_LOCAL_MEM_FENCE); }//end for(int grploop=grpidx;grploop> 16; temp = atomic_inc(glboutindex); int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(factor * 20.f); + candidate_result.zw = (int2)convert_int_rte(factor * 20.f); candidate_result.x = x; candidate_result.y = y; - candidate[outputoff + temp + lcl_id] = candidate_result; + + int i = outputoff+temp+lcl_id; + if(candidate[i].z == 0) + { + candidate[i] = candidate_result; + } + else + { + for(i=i+1;;i++) + { + if(candidate[i].z == 0) + { + candidate[i] = candidate_result; + break; + } + } + } } barrier(CLK_LOCAL_MEM_FENCE); @@ -284,7 +300,7 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f); } - t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]); + t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]); counter += nodenum; #pragma unroll diff --git a/modules/ocl/src/opencl/imgproc_bilateral.cl b/modules/ocl/src/opencl/imgproc_bilateral.cl index f13e9670e9..cb317a0057 100644 --- a/modules/ocl/src/opencl/imgproc_bilateral.cl +++ b/modules/ocl/src/opencl/imgproc_bilateral.cl @@ -16,7 +16,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. 
diff --git a/modules/ocl/src/opencl/imgproc_calcHarris.cl b/modules/ocl/src/opencl/imgproc_calcHarris.cl index 1911a72016..3f53ddf9a5 100644 --- a/modules/ocl/src/opencl/imgproc_calcHarris.cl +++ b/modules/ocl/src/opencl/imgproc_calcHarris.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -43,60 +43,63 @@ // //M*/ -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////Macro for border type//////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////// -#ifdef BORDER_REPLICATE -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? 
(b_edge)-1 :(addr)) -#endif +#ifdef BORDER_CONSTANT +#elif defined BORDER_REPLICATE +#define EXTRAPOLATE(x, maxV) \ + { \ + x = max(min(x, maxV - 1), 0); \ + } +#elif defined BORDER_WRAP +#define EXTRAPOLATE(x, maxV) \ + { \ + if (x < 0) \ + x -= ((x - maxV + 1) / maxV) * maxV; \ + if (x >= maxV) \ + x %= maxV; \ + } +#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT101) +#define EXTRAPOLATE_(x, maxV, delta) \ + { \ + if (maxV == 1) \ + x = 0; \ + else \ + do \ + { \ + if ( x < 0 ) \ + x = -x - 1 + delta; \ + else \ + x = maxV - 1 - (x - maxV) - delta; \ + } \ + while (x >= maxV || x < 0); \ + } #ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) -#endif - -#ifdef BORDER_REFLECT101 -//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) +#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0) +#else +#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1) #endif - -#ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) +#else +#error No extrapolation method #endif #define THREADS 256 -#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? 
(elem1) : (elem2) + /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////calcHarris//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void calcHarris(__global const float *Dx,__global const float *Dy, __global float *dst, - int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step, - int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step, - float k) + +__kernel void calcHarris(__global const float *Dx, __global const float *Dy, __global float *dst, + int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step, + int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step, float k) { int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - const int glx = get_global_id(0); - const int gly = get_global_id(1); + int gX = get_group_id(0); + int gY = get_group_id(1); + int gly = get_global_id(1); int dx_x_off = (dx_offset % dx_step) >> 2; int dx_y_off = dx_offset / dx_step; @@ -112,41 +115,36 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; int dst_startY = (gY << 1) + dst_y_off; - float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1]; + float dx_data[ksY+1],dy_data[ksY+1], data[3][ksY+1]; __local float temp[6][THREADS]; + #ifdef BORDER_CONSTANT - bool dx_con,dy_con; - float dx_s,dy_s; - for(int i=0; i < ksY+1; i++) + for (int i=0; i < ksY+1; i++) { - dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; - dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; - dx_data[i] = dx_con ? 
dx_s : 0.0; - dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; - dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; - dy_data[i] = dy_con ? dy_s : 0.0; + bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; + int indexDx = (dx_startY+i)*(dx_step>>2)+(dx_startX+col); + float dx_s = dx_con ? Dx[indexDx] : 0.0f; + dx_data[i] = dx_s; + bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; + int indexDy = (dy_startY+i)*(dy_step>>2)+(dy_startX+col); + float dy_s = dx_con ? Dy[indexDy] : 0.0f; + dy_data[i] = dy_s; data[0][i] = dx_data[i] * dx_data[i]; data[1][i] = dx_data[i] * dy_data[i]; data[2][i] = dy_data[i] * dy_data[i]; } #else int clamped_col = min(dst_cols, col); - for(int i=0; i < ksY+1; i++) + for (int i=0; i < ksY+1; i++) { - int dx_selected_row; - int dx_selected_col; - dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows); - dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row); - dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols); - dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col); + int dx_selected_row = dx_startY+i, dx_selected_col = dx_startX+clamped_col; + EXTRAPOLATE(dx_selected_row, dx_whole_rows) + EXTRAPOLATE(dx_selected_col, dx_whole_cols) dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col]; - int dy_selected_row; - int dy_selected_col; - dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows); - dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row); - dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols); - dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col); + int dy_selected_row = dy_startY+i, dy_selected_col = dy_startX+clamped_col; + EXTRAPOLATE(dy_selected_row, dy_whole_rows) + EXTRAPOLATE(dy_selected_col, 
dy_whole_cols) dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col]; data[0][i] = dx_data[i] * dx_data[i]; @@ -154,46 +152,45 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl data[2][i] = dy_data[i] * dy_data[i]; } #endif - float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; - for(int i=1; i < ksY; i++) + float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f; + for (int i=1; i < ksY; i++) { - sum0 += (data[0][i]); - sum1 += (data[1][i]); - sum2 += (data[2][i]); + sum0 += data[0][i]; + sum1 += data[1][i]; + sum2 += data[2][i]; } - float sum01,sum02,sum11,sum12,sum21,sum22; - sum01 = sum0 + (data[0][0]); - sum02 = sum0 + (data[0][ksY]); + + float sum01 = sum0 + data[0][0]; + float sum02 = sum0 + data[0][ksY]; temp[0][col] = sum01; temp[1][col] = sum02; - sum11 = sum1 + (data[1][0]); - sum12 = sum1 + (data[1][ksY]); + float sum11 = sum1 + data[1][0]; + float sum12 = sum1 + data[1][ksY]; temp[2][col] = sum11; temp[3][col] = sum12; - sum21 = sum2 + (data[2][0]); - sum22 = sum2 + (data[2][ksY]); + float sum21 = sum2 + data[2][0]; + float sum22 = sum2 + data[2][ksY]; temp[4][col] = sum21; temp[5][col] = sum22; barrier(CLK_LOCAL_MEM_FENCE); - if(col < (THREADS-(ksX-1))) + + if (col < (THREADS- (ksX - 1))) { col += anX; int posX = dst_startX - dst_x_off + col - anX; int posY = (gly << 1); int till = (ksX + 1)%2; - float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; - for(int k=0; k<6; k++) - for(int i=-anX; i<=anX - till; i++) - { + float tmp_sum[6] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; + for (int k=0; k<6; k++) + for (int i=-anX; i<=anX - till; i++) tmp_sum[k] += temp[k][col+i]; - } - if(posX < dst_cols && (posY) < dst_rows) + if (posX < dst_cols && (posY) < dst_rows) { dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]); } - if(posX < dst_cols && (posY + 1) < dst_rows) + if (posX < dst_cols && (posY + 1) < dst_rows) { 
dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]); diff --git a/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl b/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl index 462ec77925..c598246aec 100644 --- a/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl +++ b/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -43,60 +43,62 @@ // //M*/ -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////Macro for border type//////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////// -#ifdef BORDER_REPLICATE -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? 
(b_edge)-1 :(addr)) -#endif +#ifdef BORDER_CONSTANT +#elif defined BORDER_REPLICATE +#define EXTRAPOLATE(x, maxV) \ + { \ + x = max(min(x, maxV - 1), 0); \ + } +#elif defined BORDER_WRAP +#define EXTRAPOLATE(x, maxV) \ + { \ + if (x < 0) \ + x -= ((x - maxV + 1) / maxV) * maxV; \ + if (x >= maxV) \ + x %= maxV; \ + } +#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT101) +#define EXTRAPOLATE_(x, maxV, delta) \ + { \ + if (maxV == 1) \ + x = 0; \ + else \ + do \ + { \ + if ( x < 0 ) \ + x = -x - 1 + delta; \ + else \ + x = maxV - 1 - (x - maxV) - delta; \ + } \ + while (x >= maxV || x < 0); \ + } #ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) -#endif - -#ifdef BORDER_REFLECT101 -//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) +#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0) +#else +#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1) #endif - -#ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) +#else +#error No extrapolation method #endif #define THREADS 256 -#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? 
(elem1) : (elem2) + /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////calcHarris//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, __global float *dst, int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step, int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step, - float k) + int dst_offset, int dst_rows, int dst_cols, int dst_step, float k) { int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - const int glx = get_global_id(0); - const int gly = get_global_id(1); + int gX = get_group_id(0); + int gY = get_group_id(1); + int gly = get_global_id(1); int dx_x_off = (dx_offset % dx_step) >> 2; int dx_y_off = dx_offset / dx_step; @@ -112,42 +114,36 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; int dst_startY = (gY << 1) + dst_y_off; - float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1]; + float dx_data[ksY+1], dy_data[ksY+1], data[3][ksY+1]; __local float temp[6][THREADS]; + #ifdef BORDER_CONSTANT - bool dx_con,dy_con; - float dx_s,dy_s; - for(int i=0; i < ksY+1; i++) + for (int i=0; i < ksY+1; i++) { - dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; - dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; - dx_data[i] = dx_con ? dx_s : 0.0; - dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; - dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; - dy_data[i] = dy_con ? 
dy_s : 0.0; + bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; + int indexDx = (dx_startY+i)*(dx_step>>2)+(dx_startX+col); + float dx_s = dx_con ? Dx[indexDx] : 0.0f; + dx_data[i] = dx_s; + bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; + int indexDy = (dy_startY+i)*(dy_step>>2)+(dy_startX+col); + float dy_s = dx_con ? Dy[indexDy] : 0.0f; + dy_data[i] = dy_s; data[0][i] = dx_data[i] * dx_data[i]; data[1][i] = dx_data[i] * dy_data[i]; data[2][i] = dy_data[i] * dy_data[i]; } #else int clamped_col = min(dst_cols, col); - - for(int i=0; i < ksY+1; i++) + for (int i=0; i < ksY+1; i++) { - int dx_selected_row; - int dx_selected_col; - dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows); - dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row); - dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols); - dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col); + int dx_selected_row = dx_startY+i, dx_selected_col = dx_startX+clamped_col; + EXTRAPOLATE(dx_selected_row, dx_whole_rows) + EXTRAPOLATE(dx_selected_col, dx_whole_cols) dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col]; - int dy_selected_row; - int dy_selected_col; - dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows); - dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row); - dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols); - dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col); + int dy_selected_row = dy_startY+i, dy_selected_col = dy_startX+clamped_col; + EXTRAPOLATE(dy_selected_row, dy_whole_rows) + EXTRAPOLATE(dy_selected_col, dy_whole_cols) dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col]; data[0][i] = dx_data[i] * dx_data[i]; @@ -155,39 +151,38 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float 
*Dy, data[2][i] = dy_data[i] * dy_data[i]; } #endif - float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; - for(int i=1; i < ksY; i++) + float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f; + for (int i=1; i < ksY; i++) { sum0 += (data[0][i]); sum1 += (data[1][i]); sum2 += (data[2][i]); } - float sum01,sum02,sum11,sum12,sum21,sum22; - sum01 = sum0 + (data[0][0]); - sum02 = sum0 + (data[0][ksY]); + + float sum01 = sum0 + (data[0][0]); + float sum02 = sum0 + (data[0][ksY]); temp[0][col] = sum01; temp[1][col] = sum02; - sum11 = sum1 + (data[1][0]); - sum12 = sum1 + (data[1][ksY]); + float sum11 = sum1 + (data[1][0]); + float sum12 = sum1 + (data[1][ksY]); temp[2][col] = sum11; temp[3][col] = sum12; - sum21 = sum2 + (data[2][0]); - sum22 = sum2 + (data[2][ksY]); + float sum21 = sum2 + (data[2][0]); + float sum22 = sum2 + (data[2][ksY]); temp[4][col] = sum21; temp[5][col] = sum22; barrier(CLK_LOCAL_MEM_FENCE); + if(col < (THREADS-(ksX-1))) { col += anX; int posX = dst_startX - dst_x_off + col - anX; int posY = (gly << 1); int till = (ksX + 1)%2; - float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; - for(int k=0; k<6; k++) - for(int i=-anX; i<=anX - till; i++) - { + float tmp_sum[6] = { 0.0f, 0.0f , 0.0f, 0.0f, 0.0f, 0.0f }; + for (int k=0; k<6; k++) + for (int i=-anX; i<=anX - till; i++) tmp_sum[k] += temp[k][col+i]; - } if(posX < dst_cols && (posY) < dst_rows) { @@ -196,7 +191,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, float c = tmp_sum[4] * 0.5f; dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b)); } - if(posX < dst_cols && (posY + 1) < dst_rows) + if (posX < dst_cols && (posY + 1) < dst_rows) { float a = tmp_sum[1] * 0.5f; float b = tmp_sum[3]; diff --git a/modules/ocl/src/opencl/imgproc_canny.cl b/modules/ocl/src/opencl/imgproc_canny.cl index ca670b6db7..0a54f1468c 100644 --- a/modules/ocl/src/opencl/imgproc_canny.cl +++ b/modules/ocl/src/opencl/imgproc_canny.cl @@ -43,9 +43,6 @@ // //M*/ 
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable -#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable - #ifdef L2GRAD inline float calc(int x, int y) { @@ -248,7 +245,12 @@ void calcMagnitude ////////////////////////////////////////////////////////////////////////////////////////// // 0.4142135623730950488016887242097 is tan(22.5) #define CANNY_SHIFT 15 -#define TG22 (int)(0.4142135623730950488016887242097*(1< 0 ? get(map, y - 1, x) : 0; + if (threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? get(map, y + 1, x) : 0; + if (threadIdx.x == 0) + smem[threadIdx.y + 1][0] = x > 0 ? get(map, y, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1) + smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? get(map, y, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == 0) + smem[0][0] = y > 0 && x > 0 ? get(map, y - 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0) + smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? get(map, y - 1, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? get(map, y + 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? 
get(map, y + 1, x + 1) : 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x >= map.cols || y >= map.rows) + return; + + int n; + + #pragma unroll + for (int k = 0; k < 16; ++k) + { + n = 0; + + if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) + { + n += smem[threadIdx.y ][threadIdx.x ] == 2; + n += smem[threadIdx.y ][threadIdx.x + 1] == 2; + n += smem[threadIdx.y ][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 2; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; + } + + if (n > 0) + smem[threadIdx.y + 1][threadIdx.x + 1] = 2; + } + + const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; + + set(map, y, x, e); + + n = 0; + + if (e == 2) + { + n += smem[threadIdx.y ][threadIdx.x ] == 1; + n += smem[threadIdx.y ][threadIdx.x + 1] == 1; + n += smem[threadIdx.y ][threadIdx.x + 2] == 1; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 1; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; + } + + if (n > 0) + { + const int ind = atomic_inc(counter); + st[ind] = (ushort2)(x, y); + } +#endif } __constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; @@ -505,17 +602,12 @@ edgesHysteresisGlobal int map_offset ) { - map_step /= sizeof(*map); map_offset /= sizeof(*map); map += map_offset; - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int lidx = get_local_id(0); - int lidy = get_local_id(1); int grp_idx = get_group_id(0); int grp_idy = get_group_id(1); @@ -536,71 +628,64 @@ edgesHysteresisGlobal if(ind < count) { ushort2 pos = st1[ind]; - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + if (lidx < 8) { - if (lidx < 8) + pos.x += c_dx[lidx]; + pos.y += c_dy[lidx]; + if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= 
rows && map[pos.x + pos.y * map_step] == 1) { - pos.x += c_dx[lidx]; - pos.y += c_dy[lidx]; + map[pos.x + pos.y * map_step] = 2; - if (map[pos.x + pos.y * map_step] == 1) - { - map[pos.x + pos.y * map_step] = 2; + ind = atomic_inc(&s_counter); - ind = atomic_inc(&s_counter); - - s_st[ind] = pos; - } + s_st[ind] = pos; } - barrier(CLK_LOCAL_MEM_FENCE); - - while (s_counter > 0 && s_counter <= stack_size - get_local_size(0)) - { - const int subTaskIdx = lidx >> 3; - const int portion = min(s_counter, (uint)(get_local_size(0)>> 3)); + } + barrier(CLK_LOCAL_MEM_FENCE); - pos.x = pos.y = 0; + while (s_counter > 0 && s_counter <= stack_size - get_local_size(0)) + { + const int subTaskIdx = lidx >> 3; + const int portion = min(s_counter, (uint)(get_local_size(0)>> 3)); - if (subTaskIdx < portion) - pos = s_st[s_counter - 1 - subTaskIdx]; - barrier(CLK_LOCAL_MEM_FENCE); + if (subTaskIdx < portion) + pos = s_st[s_counter - 1 - subTaskIdx]; + barrier(CLK_LOCAL_MEM_FENCE); - if (lidx == 0) - s_counter -= portion; - barrier(CLK_LOCAL_MEM_FENCE); + if (lidx == 0) + s_counter -= portion; + barrier(CLK_LOCAL_MEM_FENCE); - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + if (subTaskIdx < portion) + { + pos.x += c_dx[lidx & 7]; + pos.y += c_dy[lidx & 7]; + if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && map[pos.x + pos.y * map_step] == 1) { - pos.x += c_dx[lidx & 7]; - pos.y += c_dy[lidx & 7]; - - if (map[pos.x + pos.y * map_step] == 1) - { - map[pos.x + pos.y * map_step] = 2; + map[pos.x + pos.y * map_step] = 2; - ind = atomic_inc(&s_counter); + ind = atomic_inc(&s_counter); - s_st[ind] = pos; - } + s_st[ind] = pos; } - barrier(CLK_LOCAL_MEM_FENCE); } + barrier(CLK_LOCAL_MEM_FENCE); + } - if (s_counter > 0) + if (s_counter > 0) + { + if (lidx == 0) { - if (lidx == 0) - { - ind = atomic_add(counter, s_counter); - s_ind = ind - s_counter; - } - barrier(CLK_LOCAL_MEM_FENCE); + ind = atomic_add(counter, s_counter); + s_ind = ind - s_counter; + } + 
barrier(CLK_LOCAL_MEM_FENCE); - ind = s_ind; + ind = s_ind; - for (int i = lidx; i < s_counter; i += get_local_size(0)) - { - st2[ind + i] = s_st[i]; - } + for (int i = lidx; i < (int)s_counter; i += get_local_size(0)) + { + st2[ind + i] = s_st[i]; } } } diff --git a/modules/ocl/src/opencl/imgproc_clahe.cl b/modules/ocl/src/opencl/imgproc_clahe.cl index 16c68fd474..57d945e21c 100644 --- a/modules/ocl/src/opencl/imgproc_clahe.cl +++ b/modules/ocl/src/opencl/imgproc_clahe.cl @@ -47,7 +47,7 @@ #define WAVE_SIZE 1 #endif -int calc_lut(__local int* smem, int val, int tid) +static int calc_lut(__local int* smem, int val, int tid) { smem[tid] = val; barrier(CLK_LOCAL_MEM_FENCE); @@ -61,7 +61,7 @@ int calc_lut(__local int* smem, int val, int tid) } #ifdef CPU -void reduce(volatile __local int* smem, int val, int tid) +static void reduce(volatile __local int* smem, int val, int tid) { smem[tid] = val; barrier(CLK_LOCAL_MEM_FENCE); @@ -101,7 +101,7 @@ void reduce(volatile __local int* smem, int val, int tid) #else -void reduce(__local volatile int* smem, int val, int tid) +static void reduce(__local volatile int* smem, int val, int tid) { smem[tid] = val; barrier(CLK_LOCAL_MEM_FENCE); @@ -147,9 +147,9 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut, { __local int smem[512]; - const int tx = get_group_id(0); - const int ty = get_group_id(1); - const unsigned int tid = get_local_id(1) * get_local_size(0) + int tx = get_group_id(0); + int ty = get_group_id(1); + int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); smem[tid] = 0; diff --git a/modules/ocl/src/opencl/imgproc_histogram.cl b/modules/ocl/src/opencl/imgproc_histogram.cl index 6df81c7ba7..bac9a6b899 100644 --- a/modules/ocl/src/opencl/imgproc_histogram.cl +++ b/modules/ocl/src/opencl/imgproc_histogram.cl @@ -19,7 +19,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation 
-// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --git a/modules/ocl/src/opencl/imgproc_integral.cl b/modules/ocl/src/opencl/imgproc_integral.cl index 9ced01d020..05e76f9647 100644 --- a/modules/ocl/src/opencl/imgproc_integral.cl +++ b/modules/ocl/src/opencl/imgproc_integral.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -63,8 +63,8 @@ kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum, int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); + int lid = get_local_id(0); + int gid = get_group_id(0); int4 src_t[2], sum_t[2]; float4 sqsum_t[2]; __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; @@ -75,8 +75,8 @@ kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global fl gid = gid << 1; for(int i = 0; i < rows; i =i + LSIZE_1) { - src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0); - src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0); + src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : 0); + src_t[1] = (i + lid < rows ? 
convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : 0); sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); @@ -163,8 +163,8 @@ kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__ __global float *sqsum,int rows,int cols,int src_step,int sum_step, int sqsum_step,int sum_offset,int sqsum_offset) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); + int lid = get_local_id(0); + int gid = get_group_id(0); int4 src_t[2], sum_t[2]; float4 sqsrc_t[2],sqsum_t[2]; __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; @@ -279,8 +279,8 @@ kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__ kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum, int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); + int lid = get_local_id(0); + int gid = get_group_id(0); float4 src_t[2], sum_t[2]; float4 sqsum_t[2]; __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; @@ -291,8 +291,8 @@ kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global gid = gid << 1; for(int i = 0; i < rows; i =i + LSIZE_1) { - src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0); - src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0); + src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : (float4)0); + src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : (float4)0); sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); sqsum_t[0] = (i == 0 ? 
(float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); @@ -379,8 +379,8 @@ kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum, __global float *sqsum,int rows,int cols,int src_step,int sum_step, int sqsum_step,int sum_offset,int sqsum_offset) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); + int lid = get_local_id(0); + int gid = get_group_id(0); float4 src_t[2], sum_t[2]; float4 sqsrc_t[2],sqsum_t[2]; __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; diff --git a/modules/ocl/src/opencl/imgproc_integral_sum.cl b/modules/ocl/src/opencl/imgproc_integral_sum.cl index 70f0c63df2..a6f73c748d 100644 --- a/modules/ocl/src/opencl/imgproc_integral_sum.cl +++ b/modules/ocl/src/opencl/imgproc_integral_sum.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. 
@@ -64,8 +64,8 @@ kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum , int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); + int lid = get_local_id(0); + int gid = get_group_id(0); int4 src_t[2], sum_t[2]; __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; __local int* sum_p; @@ -146,8 +146,8 @@ kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum , int rows,int cols,int src_step,int sum_step, int sum_offset) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); + int lid = get_local_id(0); + int gid = get_group_id(0); int4 src_t[2], sum_t[2]; __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; __local int *sum_p; @@ -239,8 +239,8 @@ kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum , kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum , int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); + int lid = get_local_id(0); + int gid = get_group_id(0); float4 src_t[2], sum_t[2]; __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; __local float* sum_p; @@ -321,8 +321,8 @@ kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum , int rows,int cols,int src_step,int sum_step, int sum_offset) { - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); + int lid = get_local_id(0); + int gid = get_group_id(0); float4 src_t[2], sum_t[2]; __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; __local float *sum_p; diff --git a/modules/ocl/src/opencl/imgproc_median.cl b/modules/ocl/src/opencl/imgproc_median.cl index b87af96891..5fa7a17b8e 100644 --- a/modules/ocl/src/opencl/imgproc_median.cl +++ b/modules/ocl/src/opencl/imgproc_median.cl @@ -16,7 +16,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and 
the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -106,10 +106,10 @@ __kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst, op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7); op(p4, p2); op(p6, p4); op(p4, p2); - if(get_global_id(1)=src_cols ? x=src_cols-1,u=0 : x,u; - y<0 ? y=0,v=0 : y,v; - y>=src_rows ? y=src_rows-1,v=0 : y,v; + if ( x<0 ) x=0,u=0; + if ( x>=src_cols ) x=src_cols-1,u=0; + if ( y<0 ) y=0,v=0; + if (y>=src_rows ) y=src_rows-1,v=0; u = u * INTER_RESIZE_COEF_SCALE; v = v * INTER_RESIZE_COEF_SCALE; @@ -225,10 +225,10 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src, int x = floor(sx), y = floor(sy); float u = sx - x, v = sy - y; - x<0 ? x=0,u=0 : x,u; - x>=src_cols ? x=src_cols-1,u=0 : x,u; - y<0 ? y=0,v=0 : y,v; - y>=src_rows ? y=src_rows-1,v=0 : y,v; + if ( x<0 ) x=0,u=0; + if ( x>=src_cols ) x=src_cols-1,u=0; + if ( y<0 ) y=0,v=0; + if (y>=src_rows ) y=src_rows-1,v=0; int y_ = INC(y,src_rows); int x_ = INC(x,src_cols); @@ -264,10 +264,10 @@ __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src, int x = floor(sx), y = floor(sy); float u = sx - x, v = sy - y; - x<0 ? x=0,u=0 : x; - x>=src_cols ? x=src_cols-1,u=0 : x; - y<0 ? y=0,v=0 : y; - y>=src_rows ? 
y=src_rows-1,v=0 : y; + if ( x<0 ) x=0,u=0; + if ( x>=src_cols ) x=src_cols-1,u=0; + if ( y<0 ) y=0,v=0; + if (y>=src_rows ) y=src_rows-1,v=0; int y_ = INC(y,src_rows); int x_ = INC(x,src_cols); diff --git a/modules/ocl/src/opencl/imgproc_sobel3.cl b/modules/ocl/src/opencl/imgproc_sobel3.cl new file mode 100644 index 0000000000..d6a995f552 --- /dev/null +++ b/modules/ocl/src/opencl/imgproc_sobel3.cl @@ -0,0 +1,108 @@ +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////Macro for border type//////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef BORDER_REPLICATE +//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) +#endif + +#ifdef BORDER_REFLECT +//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) +#endif + +#ifdef BORDER_REFLECT101 +//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) +#endif + +#ifdef BORDER_WRAP +//BORDER_WRAP: cdefgh|abcdefgh|abcdefg +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? 
(i)+(r_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) +#endif + +__kernel void sobel3( + __global uchar* Src, + __global float* DstX, + __global float* DstY, + int width, int height, + uint srcStride, uint dstStride, + float scale + ) +{ + __local float lsmem[BLK_Y+2][BLK_X+2]; + + int lix = get_local_id(0); + int liy = get_local_id(1); + + int gix = get_group_id(0); + int giy = get_group_id(1); + + int id_x = get_global_id(0); + int id_y = get_global_id(1); + + lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]); + + int id_y_h = ADDR_H(id_y-1, 0,height); + int id_y_b = ADDR_B(id_y+1, height,id_y+1); + + int id_x_l = ADDR_L(id_x-1, 0,width); + int id_x_r = ADDR_R(id_x+1, width,id_x+1); + + if(liy==0) + { + lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]); + + if(lix==0) + lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]); + else if(lix==BLK_X-1) + lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]); + } + else if(liy==BLK_Y-1) + { + lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]); + + if(lix==0) + lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]); + else if(lix==BLK_X-1) + lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]); + } + + if(lix==0) + lsmem[liy+1][0] = convert_float(Src[ id_y * srcStride + id_x_l ]); + else if(lix==BLK_X-1) + lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]); + + barrier(CLK_LOCAL_MEM_FENCE); + + float u1 = lsmem[liy][lix]; + float u2 = lsmem[liy][lix+1]; + float u3 = lsmem[liy][lix+2]; + + float m1 = lsmem[liy+1][lix]; + float m2 = lsmem[liy+1][lix+1]; + float m3 = lsmem[liy+1][lix+2]; + + float b1 = lsmem[liy+2][lix]; + float b2 = lsmem[liy+2][lix+1]; + float b3 = lsmem[liy+2][lix+2]; + + //m2 * scale;// + 
float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 ); + DstX[ id_y * dstStride + id_x ] = dx * scale; + + float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3); + DstY[ id_y * dstStride + id_x ] = dy * scale; +} \ No newline at end of file diff --git a/modules/ocl/src/opencl/imgproc_threshold.cl b/modules/ocl/src/opencl/imgproc_threshold.cl index 8d7c77e1fa..400ac806cf 100644 --- a/modules/ocl/src/opencl/imgproc_threshold.cl +++ b/modules/ocl/src/opencl/imgproc_threshold.cl @@ -44,109 +44,93 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif +#endif -// threshold type: -// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3, -// THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 }; +#ifdef VECTORIZED -__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst, - int src_offset, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step, - uchar thresh, uchar max_val, int thresh_type - ) +__kernel void threshold(__global const T * restrict src, int src_offset, int src_step, + __global T * dst, int dst_offset, int dst_step, + T thresh, T max_val, int max_index, int rows, int cols) { int gx = get_global_id(0); - const int gy = get_global_id(1); - - int offset = (dst_offset & 15); - src_offset -= offset; + int gy = get_global_id(1); - int dstart = (gx << 4) - offset; - if(dstart < dst_cols && gy < dst_rows) + if (gx < cols && gy < rows) { - uchar16 sdata = vload16(gx, src+src_offset+gy*src_step); - uchar16 ddata; - uchar16 zero = 0; - switch (thresh_type) - { - case 0: - ddata = ((sdata > thresh) ) ? (uchar16)(max_val) : (uchar16)(0); - break; - case 1: - ddata = ((sdata > thresh)) ? zero : (uchar16)(max_val); - break; - case 2: - ddata = ((sdata > thresh)) ? (uchar16)(thresh) : sdata; - break; - case 3: - ddata = ((sdata > thresh)) ? 
sdata : zero; - break; - case 4: - ddata = ((sdata > thresh)) ? zero : sdata; - break; - default: - ddata = sdata; - } - int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8, - dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15); - uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart); - int16 con = dpos >= 0 && dpos < dst_cols; - ddata = convert_uchar16(con != 0) ? ddata : dVal; - if(dstart < dst_cols) + gx *= VECSIZE; + int src_index = mad24(gy, src_step, src_offset + gx); + int dst_index = mad24(gy, dst_step, dst_offset + gx); + +#ifdef SRC_ALIGNED + VT sdata = *((__global VT *)(src + src_index)); +#else + VT sdata = VLOADN(0, src + src_index); +#endif + VT vthresh = (VT)(thresh); + +#ifdef THRESH_BINARY + VT vecValue = sdata > vthresh ? max_val : (VT)(0); +#elif defined THRESH_BINARY_INV + VT vecValue = sdata > vthresh ? (VT)(0) : max_val; +#elif defined THRESH_TRUNC + VT vecValue = sdata > vthresh ? thresh : sdata; +#elif defined THRESH_TOZERO + VT vecValue = sdata > vthresh ? sdata : (VT)(0); +#elif defined THRESH_TOZERO_INV + VT vecValue = sdata > vthresh ? 
(VT)(0) : sdata; +#endif + + if (gx + VECSIZE <= max_index) +#ifdef DST_ALIGNED + *(__global VT*)(dst + dst_index) = vecValue; +#else + VSTOREN(vecValue, 0, dst + dst_index); +#endif + else { - *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata; + T array[VECSIZE]; + VSTOREN(vecValue, 0, array); + #pragma unroll + for (int i = 0; i < VECSIZE; ++i) + if (gx + i < max_index) + dst[dst_index + i] = array[i]; } } } +#else -__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst, - int src_offset, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step, - float thresh, float max_val, int thresh_type - ) +__kernel void threshold(__global const T * restrict src, int src_offset, int src_step, + __global T * dst, int dst_offset, int dst_step, + T thresh, T max_val, int rows, int cols) { - const int gx = get_global_id(0); - const int gy = get_global_id(1); - - int offset = (dst_offset & 3); - src_offset -= offset; + int gx = get_global_id(0); + int gy = get_global_id(1); - int dstart = (gx << 2) - offset; - if(dstart < dst_cols && gy < dst_rows) + if (gx < cols && gy < rows) { - float4 sdata = vload4(gx, src+src_offset+gy*src_step); - float4 ddata; - float4 zero = 0; - switch (thresh_type) - { - case 0: - ddata = sdata > thresh ? (float4)(max_val) : (float4)(0.f); - break; - case 1: - ddata = sdata > thresh ? zero : (float4)max_val; - break; - case 2: - ddata = sdata > thresh ? (float4)thresh : sdata; - break; - case 3: - ddata = sdata > thresh ? sdata : (float4)(0.f); - break; - case 4: - ddata = sdata > thresh ? (float4)(0.f) : sdata; - break; - default: - ddata = sdata; - } - int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3); - float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart); - int4 con = dpos >= 0 && dpos < dst_cols; - ddata = convert_float4(con) != (float4)(0) ? 
ddata : dVal; - if(dstart < dst_cols) - { - *(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata; - } + int src_index = mad24(gy, src_step, src_offset + gx); + int dst_index = mad24(gy, dst_step, dst_offset + gx); + + T sdata = src[src_index]; + +#ifdef THRESH_BINARY + dst[dst_index] = sdata > thresh ? max_val : (T)(0); +#elif defined THRESH_BINARY_INV + dst[dst_index] = sdata > thresh ? (T)(0) : max_val; +#elif defined THRESH_TRUNC + dst[dst_index] = sdata > thresh ? thresh : sdata; +#elif defined THRESH_TOZERO + dst[dst_index] = sdata > thresh ? sdata : (T)(0); +#elif defined THRESH_TOZERO_INV + dst[dst_index] = sdata > thresh ? (T)(0) : sdata; +#endif } } + +#endif diff --git a/modules/ocl/src/opencl/imgproc_warpAffine.cl b/modules/ocl/src/opencl/imgproc_warpAffine.cl index 16971e252b..a5050bbf03 100644 --- a/modules/ocl/src/opencl/imgproc_warpAffine.cl +++ b/modules/ocl/src/opencl/imgproc_warpAffine.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. 
@@ -537,9 +537,9 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, float tab[4]; float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; + taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0; taby[1] = 1.f/INTER_TAB_SIZE*ay0; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; + tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0; tabx[1] = 1.f/INTER_TAB_SIZE*ax0; tab[0] = taby[0] * tabx[0]; @@ -680,9 +680,9 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds float tab[4]; float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; + taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0; taby[1] = 1.f/INTER_TAB_SIZE*ay0; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; + tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0; tabx[1] = 1.f/INTER_TAB_SIZE*ax0; tab[0] = taby[0] * tabx[0]; diff --git a/modules/ocl/src/opencl/imgproc_warpPerspective.cl b/modules/ocl/src/opencl/imgproc_warpPerspective.cl index ef9e77058c..eee1c81750 100644 --- a/modules/ocl/src/opencl/imgproc_warpPerspective.cl +++ b/modules/ocl/src/opencl/imgproc_warpPerspective.cl @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -100,8 +100,8 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo F4 Y0 = M[3]*DX + M[4]*dy + M[5]; F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0; W = (W!=zero) ? 
one/W : zero; - short4 X = convert_short4(rint(X0*W)); - short4 Y = convert_short4(rint(Y0*W)); + short4 X = convert_short4_sat_rte(X0*W); + short4 Y = convert_short4_sat_rte(Y0*W); int4 sx = convert_int4(X); int4 sy = convert_int4(Y); @@ -133,12 +133,12 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _ F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); - int sx = (short)(X >> INTER_BITS); - int sy = (short)(Y >> INTER_BITS); + int sx = convert_short_sat(X >> INTER_BITS); + int sy = convert_short_sat(Y >> INTER_BITS); int ay = (short)(Y & (INTER_TAB_SIZE-1)); int ax = (short)(X & (INTER_TAB_SIZE-1)); @@ -150,16 +150,16 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _ short itab[4]; float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; + tab1y[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay; tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; + tab1x[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax; tab1x[1] = 1.f/INTER_TAB_SIZE*ax; #pragma unroll 4 for(i=0; i<4; i++) { float v = tab1y[(i>>1)] * tab1x[(i&1)]; - itab[i] = convert_short_sat(rint( v * INTER_REMAP_COEF_SCALE )); + itab[i] = convert_short_sat_rte( v * INTER_REMAP_COEF_SCALE ); } if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows) { @@ -185,12 +185,12 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? 
INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; + short sx = convert_short_sat(X >> INTER_BITS) - 1; + short sy = convert_short_sat(Y >> INTER_BITS) - 1; short ay = (short)(Y & (INTER_TAB_SIZE-1)); short ax = (short)(X & (INTER_TAB_SIZE-1)); @@ -265,11 +265,9 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? 1./W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - short sx = (short)X; - short sy = (short)Y; + W = (W != 0.0f) ? 1.f/W : 0.0f; + short sx = convert_short_sat_rte(X0*W); + short sy = convert_short_sat_rte(Y0*W); if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>2)+sy*(srcStep>>2)+sx] : (uchar4)0; @@ -291,12 +289,12 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); + short sx = convert_short_sat(X >> INTER_BITS); + short sy = convert_short_sat(Y >> INTER_BITS); short ay = (short)(Y & (INTER_TAB_SIZE-1)); short ax = (short)(X & (INTER_TAB_SIZE-1)); @@ -343,12 +341,12 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? 
INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; + short sx = convert_short_sat(X >> INTER_BITS) - 1; + short sy = convert_short_sat(Y >> INTER_BITS) - 1; short ay = (short)(Y & (INTER_TAB_SIZE-1)); short ax = (short)(X & (INTER_TAB_SIZE-1)); @@ -426,11 +424,9 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? 1./W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - short sx = (short)X; - short sy = (short)Y; + W = (W != 0.0f) ? 1.f/W : 0.0f; + short sx = convert_short_sat_rte(X0*W); + short sy = convert_short_sat_rte(Y0*W); if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx=0 && sy>2)+sy*srcStep+sx] : 0; @@ -451,12 +447,12 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? 
INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); + short sx = convert_short_sat(X >> INTER_BITS); + short sy = convert_short_sat(Y >> INTER_BITS); short ay = (short)(Y & (INTER_TAB_SIZE-1)); short ax = (short)(X & (INTER_TAB_SIZE-1)); @@ -469,9 +465,9 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * float tab[4]; float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; + taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay; taby[1] = 1.f/INTER_TAB_SIZE*ay; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; + tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax; tabx[1] = 1.f/INTER_TAB_SIZE*ax; tab[0] = taby[0] * tabx[0]; @@ -501,12 +497,12 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; + short sx = convert_short_sat(X >> INTER_BITS) - 1; + short sy = convert_short_sat(Y >> INTER_BITS) - 1; short ay = (short)(Y & (INTER_TAB_SIZE-1)); short ax = (short)(X & (INTER_TAB_SIZE-1)); @@ -561,11 +557,9 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W =(W != 0.0)? 1./W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - short sx = (short)X; - short sy = (short)Y; + W =(W != 0.0f)? 
1.f/W : 0.0f; + short sx = convert_short_sat_rte(X0*W); + short sy = convert_short_sat_rte(Y0*W); if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>4)+sy*(srcStep>>2)+sx] : (float)0; @@ -589,12 +583,12 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); - short sx0 = (short)(X >> INTER_BITS); - short sy0 = (short)(Y >> INTER_BITS); + short sx0 = convert_short_sat(X >> INTER_BITS); + short sy0 = convert_short_sat(Y >> INTER_BITS); short ay0 = (short)(Y & (INTER_TAB_SIZE-1)); short ax0 = (short)(X & (INTER_TAB_SIZE-1)); @@ -608,9 +602,9 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 float tab[4]; float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; + taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0; taby[1] = 1.f/INTER_TAB_SIZE*ay0; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; + tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0; tabx[1] = 1.f/INTER_TAB_SIZE*ax0; tab[0] = taby[0] * tabx[0]; @@ -642,12 +636,12 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 F X0 = M[0]*dx + M[1]*dy + M[2]; F Y0 = M[3]*dx + M[4]*dy + M[5]; F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + W = (W != 0.0f) ? 
INTER_TAB_SIZE/W : 0.0f; int X = rint(X0*W); int Y = rint(Y0*W); - short sx = (short)(X >> INTER_BITS)-1; - short sy = (short)(Y >> INTER_BITS)-1; + short sx = convert_short_sat(X >> INTER_BITS)-1; + short sy = convert_short_sat(Y >> INTER_BITS)-1; short ay = (short)(Y & (INTER_TAB_SIZE-1)); short ax = (short)(X & (INTER_TAB_SIZE-1)); diff --git a/modules/ocl/src/opencl/kernel_sort_by_key.cl b/modules/ocl/src/opencl/kernel_sort_by_key.cl index 0ad11b8bcf..0e8d581b74 100644 --- a/modules/ocl/src/opencl/kernel_sort_by_key.cl +++ b/modules/ocl/src/opencl/kernel_sort_by_key.cl @@ -192,7 +192,6 @@ __kernel { const int i = get_local_id(0); // index in workgroup const int numOfGroups = get_num_groups(0); // index in workgroup - const int groupID = get_group_id(0); const int wg = get_local_size(0); // workgroup size = block size int pos = 0, same = 0; const int offset = get_group_id(0) * wg; diff --git a/modules/ocl/src/opencl/kernel_stablesort_by_key.cl b/modules/ocl/src/opencl/kernel_stablesort_by_key.cl index 2d2c0a19cd..2d38fbf2f7 100644 --- a/modules/ocl/src/opencl/kernel_stablesort_by_key.cl +++ b/modules/ocl/src/opencl/kernel_stablesort_by_key.cl @@ -63,7 +63,7 @@ ///////////// parallel merge sort /////////////// // ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl -uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal) +static uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal) { // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence uint firstIndex = left; @@ -94,7 +94,7 @@ uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal) // by a base pointer and left and right index for a particular candidate value. 
The comparison operator is // passed as a functor parameter my_comp // This function returns an index that is the first index whos value would be equal to the searched value -uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) +static uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) { // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence uint firstIndex = left; @@ -130,7 +130,7 @@ uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) // passed as a functor parameter my_comp // This function returns an index that is the first index whos value would be greater than the searched value // If the search value is not found in the sequence, upperbound returns the same result as lowerbound -uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) +static uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) { uint upperBound = lowerBoundBinary( data, left, right, searchVal ); @@ -167,9 +167,6 @@ kernel void merge( ) { size_t globalID = get_global_id( 0 ); - size_t groupID = get_group_id( 0 ); - size_t localID = get_local_id( 0 ); - size_t wgSize = get_local_size( 0 ); // Abort threads that are passed the end of the input vector if( globalID >= srcVecSize ) @@ -230,12 +227,12 @@ kernel void blockInsertionSort( local V_T* val_lds ) { - size_t gloId = get_global_id( 0 ); - size_t groId = get_group_id( 0 ); - size_t locId = get_local_id( 0 ); - size_t wgSize = get_local_size( 0 ); + int gloId = get_global_id( 0 ); + int groId = get_group_id( 0 ); + int locId = get_local_id( 0 ); + int wgSize = get_local_size( 0 ); - bool in_range = gloId < vecSize; + bool in_range = gloId < (int)vecSize; K_T key; V_T val; // Abort threads that are passed the end of the input vector @@ -254,7 +251,7 @@ kernel void blockInsertionSort( { // The last workgroup may have an irregular size, so we calculate a per-block 
endIndex // endIndex is essentially emulating a mod operator with subtraction and multiply - size_t endIndex = vecSize - ( groId * wgSize ); + int endIndex = vecSize - ( groId * wgSize ); endIndex = min( endIndex, wgSize ); // printf( "Debug: endIndex[%i]=%i\n", groId, endIndex ); diff --git a/modules/ocl/src/opencl/kmeans_kernel.cl b/modules/ocl/src/opencl/kmeans_kernel.cl index c6af0ad249..f62a08f636 100644 --- a/modules/ocl/src/opencl/kmeans_kernel.cl +++ b/modules/ocl/src/opencl/kmeans_kernel.cl @@ -16,6 +16,7 @@ // // @Authors // Xiaopeng Fu, fuxiaopeng2222@163.com +// Peng Xiao, pengxiao@outlook.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -25,7 +26,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. 
@@ -43,42 +44,81 @@ // //M*/ -__kernel void distanceToCenters( - int label_step, int K, - __global float *src, - __global int *labels, int dims, int rows, - __global float *centers, - __global float *dists) +#ifdef L1_DIST +# define DISTANCE(A, B) fabs((A) - (B)) +#elif defined L2SQR_DIST +# define DISTANCE(A, B) ((A) - (B)) * ((A) - (B)) +#else +# define DISTANCE(A, B) ((A) - (B)) * ((A) - (B)) +#endif + +inline float dist(__global const float * center, __global const float * src, int feature_cols) { - int gid = get_global_id(1); + float res = 0; + float4 tmp4; + int i; + for(i = 0; i < feature_cols / 4; i += 4, center += 4, src += 4) + { + tmp4 = vload4(0, center) - vload4(0, src); +#ifdef L1_DIST + tmp4 = fabs(tmp4); +#else + tmp4 *= tmp4; +#endif + res += tmp4.x + tmp4.y + tmp4.z + tmp4.w; + } - float dist, euDist, min; - int minCentroid; + for(; i < feature_cols; ++i, ++center, ++src) + { + res += DISTANCE(*src, *center); + } + return res; +} - if(gid >= rows) +// to be distinguished with distanceToCenters in kmeans_kernel.cl +__kernel void distanceToCenters( + __global const float *src, + __global const float *centers, +#ifdef USE_INDEX + __global const int *indices, +#endif + __global int *labels, + __global float *dists, + int feature_cols, + int src_step, + int centers_step, + int label_step, + int input_size, + int K, + int offset_src, + int offset_centers +) +{ + int gid = get_global_id(0); + float euDist, minval; + int minCentroid; + if(gid >= input_size) + { return; - - for(int i = 0 ; i < K; i++) + } + src += offset_src; + centers += offset_centers; +#ifdef USE_INDEX + src += indices[gid] * src_step; +#else + src += gid * src_step; +#endif + minval = dist(centers, src, feature_cols); + minCentroid = 0; + for(int i = 1 ; i < K; i++) { - euDist = 0; - for(int j = 0; j < dims; j++) - { - dist = (src[j + gid * dims] - - centers[j + i * dims]); - euDist += dist * dist; - } - - if(i == 0) - { - min = euDist; - minCentroid = 0; - } - else if(euDist < min) + 
euDist = dist(centers + i * centers_step, src, feature_cols); + if(euDist < minval) { - min = euDist; + minval = euDist; minCentroid = i; } } - dists[gid] = min; - labels[label_step * gid] = minCentroid; + labels[gid * label_step] = minCentroid; + dists[gid] = minval; } diff --git a/modules/ocl/src/opencl/knearest.cl b/modules/ocl/src/opencl/knearest.cl index e670df7e6f..bc0ae89a83 100644 --- a/modules/ocl/src/opencl/knearest.cl +++ b/modules/ocl/src/opencl/knearest.cl @@ -129,58 +129,53 @@ __kernel void knn_find_nearest(__global float* sample, int sample_row, int sampl } /*! find_nearest_neighbor done!*/ /*! write_results start!*/ - switch (regression) + if (regression) { - case true: - { - TYPE s; + TYPE s; #ifdef DOUBLE_SUPPORT - s = 0.0; + s = 0.0; #else - s = 0.0f; + s = 0.0f; #endif - for(j = 0; j < K1; j++) - s += nr[j * nThreads + threadY]; + for(j = 0; j < K1; j++) + s += nr[j * nThreads + threadY]; - _results[y * _results_step] = (float)(s * inv_scale); - } - break; - case false: - { - int prev_start = 0, best_count = 0, cur_count; - float best_val; + _results[y * _results_step] = (float)(s * inv_scale); + } + else + { + int prev_start = 0, best_count = 0, cur_count; + float best_val; - for(j = K1 - 1; j > 0; j--) + for(j = K1 - 1; j > 0; j--) + { + bool swap_f1 = false; + for(j1 = 0; j1 < j; j1++) { - bool swap_f1 = false; - for(j1 = 0; j1 < j; j1++) + if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY]) { - if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY]) - { - int t; - CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t); - swap_f1 = true; - } + int t; + CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t); + swap_f1 = true; } - if(!swap_f1) - break; } + if(!swap_f1) + break; + } - best_val = 0; - for(j = 1; j <= K1; j++) - if(j == K1 || nr[j * nThreads + threadY] != nr[(j - 1) * nThreads + threadY]) + best_val = 0; + for(j = 1; j <= K1; j++) + if(j == K1 || nr[j * nThreads + 
threadY] != nr[(j - 1) * nThreads + threadY]) + { + cur_count = j - prev_start; + if(best_count < cur_count) { - cur_count = j - prev_start; - if(best_count < cur_count) - { - best_count = cur_count; - best_val = nr[(j - 1) * nThreads + threadY]; - } - prev_start = j; + best_count = cur_count; + best_val = nr[(j - 1) * nThreads + threadY]; } - _results[y * _results_step] = best_val; - } - break; + prev_start = j; + } + _results[y * _results_step] = best_val; } ///*! write_results done!*/ } diff --git a/modules/ocl/src/opencl/match_template.cl b/modules/ocl/src/opencl/match_template.cl index 6fc4c748cf..8b63c3bd2d 100644 --- a/modules/ocl/src/opencl/match_template.cl +++ b/modules/ocl/src/opencl/match_template.cl @@ -43,8 +43,6 @@ // //M*/ -#pragma OPENCL EXTENSION cl_amd_printf : enable - #if defined (DOUBLE_SUPPORT) #ifdef cl_khr_fp64 @@ -70,7 +68,7 @@ #define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox) // normAcc* are accurate normalization routines which make GPU matchTemplate // consistent with CPU one -float normAcc(float num, float denum) +inline float normAcc(float num, float denum) { if(fabs(num) < denum) { @@ -83,7 +81,7 @@ float normAcc(float num, float denum) return 0; } -float normAcc_SQDIFF(float num, float denum) +inline float normAcc_SQDIFF(float num, float denum) { if(fabs(num) < denum) { diff --git a/modules/ocl/src/opencl/meanShift.cl b/modules/ocl/src/opencl/meanShift.cl index a5b110812d..ea5060e467 100644 --- a/modules/ocl/src/opencl/meanShift.cl +++ b/modules/ocl/src/opencl/meanShift.cl @@ -28,7 +28,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. 
// // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -46,7 +46,7 @@ // //M*/ -short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step, +static short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step, __global uchar4* in, int in_step, int dst_off, int src_off, int cols, int rows, int sp, int sr, int maxIter, float eps) { @@ -56,7 +56,6 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step, src_off = src_off >> 2; dst_off = dst_off >> 2; int idx = src_off + y0 * in_step + x0; -// uchar4 c = vload4(0, (__global uchar*)in+idx); uchar4 c = in[idx]; int base = dst_off + get_global_id(1)*out_step + get_global_id(0) ; diff --git a/modules/ocl/src/opencl/moments.cl b/modules/ocl/src/opencl/moments.cl index d61b8d5ae7..31c4c85ec7 100644 --- a/modules/ocl/src/opencl/moments.cl +++ b/modules/ocl/src/opencl/moments.cl @@ -15,6 +15,7 @@ // Third party copyrights are property of their respective owners. 
// // @Authors +// Jin Ma, jin@multicorewareinc.com // Sen Liu, swjtuls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, @@ -44,22 +45,14 @@ //M*/ #if defined (DOUBLE_SUPPORT) - #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable #elif defined (cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64:enable #endif typedef double T; -typedef double F; -typedef double4 F4; -#define convert_F4 convert_double4 - #else -typedef float F; -typedef float4 F4; typedef long T; -#define convert_F4 convert_float4 #endif #define DST_ROW_00 0 @@ -99,7 +92,6 @@ __kernel void icvContourMoments(int contour_total, xi = (T)(*(reader_oclmat_data + (idx + 1) * 2)); yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1)); } - xi2 = xi * xi; yi2 = yi * yi; dxy = xi_1 * yi - xi * yi_1; @@ -117,864 +109,324 @@ __kernel void icvContourMoments(int contour_total, *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); *( dst_a + DST_ROW_21 * dst_step + idx) = dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 + - xi2 * (yi_1 + 3 * yi)); + xi2 * (yi_1 + 3 * yi)); *( dst_a + DST_ROW_12 * dst_step + idx) = dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 + - yi2 * (xi_1 + 3 * xi)); + yi2 * (xi_1 + 3 * xi)); } -__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, - __global F* sum, __global F* dst_m, int dst_step) -{ - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int block_y = src_rows/tile_height; - int block_x = src_cols/tile_width; - int block_num; - - if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) - block_y ++; - if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) - block_x ++; - block_num = block_y * block_x; - __local F dst_sum[10][128]; - if(gidy<128-block_num) - for(int i=0; i<10; i++) - dst_sum[i][gidy+block_num]=0; - barrier(CLK_LOCAL_MEM_FENCE); - - dst_step /= sizeof(F); - if(gidy0; lsize>>=1) - { - if(gidy 0 ) //channel of interest - 
for(int i = 0; i < tileSize_width; i += VLEN_C) - { - for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((int)py) * sy; // m03 - m[8][lidy-bheight] = ((int)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((int)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((int)py) * sy; // m03 - lm[8] = ((int)x1.s0) * sy; // m12 - lm[7] = ((int)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = bheight; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } + WT py = 0.f, sy = 0.f; - if(lidy == 0&&lidx == 0) + if(dy < src_rows) { - for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - if(binary) + if((x_rest > 0) && (gidx == ((int)get_num_groups(0) - 1))) { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - F xm = x * mom[0], ym = y * mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + 
mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); + int i; + for(i = 0; i < x_rest - 4; i += 4) + { + p = convert_T4(vload4(0, row + i)); + x = convert_T4(vload4(0, codxy + i)); + xp = x * p; + xxp = xp * x; + + x0 += p; + x1 += xp; + x2 += xxp; + x3 += convert_T4(xxp * x); + } - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; + x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3; + x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3; + x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3; + x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3; - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); + WT x0_ = 0; + WT x1_ = 0; + WT x2_ = 0; + WT x3_ = 0; - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + for(; i < x_rest; i++) + { + WT p_ = 0; + p_ = row[i]; + WT x_ = convert_T(codxy[i]); - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + WT xp_ = x_ * p_; + WT xxp_ = xp_ * x_; - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. 
* mom[2] + ym)); - } -} + x0_ += p_; + x1_ += xp_; + x2_ += xxp_; + x3_ += xxp_ * x_; + } -__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - ushort tmp_coi[8]; // get the coi data - ushort8 tmp[32]; - int VLEN_US = 8; // vector length of ushort - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/2, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols -x); - - if ( y+lidy < src_rows ) - { - if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE) - for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_US) + x0.s0 += x0_; + x1.s0 += x1_; + x2.s0 += x2_; + x3.s0 += x3_; + }else + { + for(int i = 0; i < 256; i += 4) { - for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((long)py) * sy; // m03 - m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((long)py) * sy; // m03 - lm[8] = ((long)x1.s0) * sy; // m12 - lm[7] = ((long)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - 
lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); + x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3; + x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3; + x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3; + x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3; + } - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; + py = ly * x0.s0; + sy = ly * ly; } - barrier(CLK_LOCAL_MEM_FENCE); + __local WT mom[10][256]; - if(lidy == 0&&lidx == 0) + if((y_rest > 0) && (gidy == ((int)get_num_groups(1) - 1))) { - for(int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - - if(binary) + if(ly < y_rest) { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; + mom[9][ly] = py * sy; + mom[8][ly] = x1.s0 * sy; + mom[7][ly] = x2.s0 * ly; + mom[6][ly] = x3.s0; + mom[5][ly] = x0.s0 * sy; + mom[4][ly] = x1.s0 * ly; + mom[3][ly] = x2.s0; + mom[2][ly] = py; + mom[1][ly] = x1.s0; + mom[0][ly] = x0.s0; } - - F xm = x *mom[0], ym = y * mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * 
mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. 
* mom[2] + ym)); + barrier(CLK_LOCAL_MEM_FENCE); + if(ly < 10) + for(int i = 1; i < y_rest; i++) + mom[ly][0] = mom[ly][i] + mom[ly][0]; } -} - -__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - short tmp_coi[8]; // get the coi data - short8 tmp[32]; - int VLEN_S =8; // vector length of short - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/2, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols -x); - - if ( y+lidy < src_rows ) + else { - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_S) - { - for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((long)py) * sy; // m03 - m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((long)py) * sy; // m03 - lm[8] = ((long)(x1.s0)) * sy; // m12 - lm[7] = ((long)(x2.s0)) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] 
= x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = TILE_SIZE/2; j >=1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; barrier(CLK_LOCAL_MEM_FENCE); - } - if(lidy ==0 &&lidx ==0) - { - for(int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - if(binary) + if(ly < 128) { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; + mom[0][ly] = mom[0][ly] + mom[0][ly + 128]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 128]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 128]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 128]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 128]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 128]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 128]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 128]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 128]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 128]; } + barrier(CLK_LOCAL_MEM_FENCE); - F xm = x * mom[0], ym = y*mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; + if(ly < 64) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 64]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 64]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 64]; + mom[3][ly] = 
mom[3][ly] + mom[3][ly + 64]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 64]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 64]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 64]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 64]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 64]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); + if(ly < 32) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 32]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 32]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 32]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 32]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 32]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 32]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 32]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 32]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 32]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 32]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. 
* mom[1] + xm)); + if(ly < 16) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 16]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 16]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 16]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 16]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 16]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 16]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 16]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 16]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 16]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 16]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + if(ly < 8) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 8]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 8]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 8]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 8]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 8]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 8]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 8]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 8]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 8]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + if(ly < 4) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 4]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 4]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 4]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 4]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 4]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 4]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 4]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 4]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 4]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m03 ( = m03' + 
3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - } -} + if(ly < 2) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 2]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 2]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 2]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 2]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 2]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 2]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 2]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 2]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 2]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); -__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - float tmp_coi[4]; // get the coi data - float4 tmp[64] ; - int VLEN_F = 4; // vector length of float - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/4, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols -x); - int maxIdx = mul24(src_rows, src_cols); - int yOff = (y+lidy)*src_step; - int index; - - if ( y+lidy < src_rows ) - { - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_F) - { - for(int j=0; j<4; j++) - tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1); - tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); - } - else - for(int i=0; 
i < tileSize_width; i+=VLEN_F) - tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3)); + if(ly < 1) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 1]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 1]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 1]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 1]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 1]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 1]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 1]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 1]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 1]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 1]; + } } - float4 zero = (float4)(0); - float4 full = (float4)(255); - if( binary ) - for(int i=0; i < tileSize_width; i+=4) - tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero; - F mom[10]; - __local F m[10][128]; - if(lidy < 128) - for(int i = 0; i < 10; i ++) - m[i][lidy] = 0; barrier(CLK_LOCAL_MEM_FENCE); - F lm[10] = {0}; - F4 x0 = (F4)(0); - F4 x1 = (F4)(0); - F4 x2 = (F4)(0); - F4 x3 = (F4)(0); - for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F ) - { - F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); - F4 p = convert_F4(tmp[xt/VLEN_F]); - F4 xp = v_xt * p, xxp = xp * v_xt; - x0 += p; - x1 += xp; - x2 += xxp; - x3 += xxp * v_xt; - } - x0.s0 += x0.s1 + x0.s2 + x0.s3; - x1.s0 += x1.s1 + x1.s2 + x1.s3; - x2.s0 += x2.s1 + x2.s2 + x2.s3; - x3.s0 += x3.s1 + x3.s2 + x3.s3; - - F py = lidy * x0.s0, sy = lidy*lidy; - int bheight = min(tileSize_height, TILE_SIZE/2); - if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((F)py) * sy; // m03 - m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = 
x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((F)py) * sy; // m03 - lm[8] = ((F)x1.s0) * sy; // m12 - lm[7] = ((F)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) + if(binary) { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; + WT s = 1.0f/255; + if(ly < 10) + mom[ly][0] *= s; barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lidy == 0&&lidx == 0) - { - for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - - F xm = x * mom[0], ym = y * mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, 
wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); } -} + WT xm = (gidx * 256) * mom[0][0]; + WT ym = (gidy * 256) * mom[0][0]; -__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - F tmp_coi[4]; // get the coi data - F4 tmp[64]; - int VLEN_D = 4; // length of vetor - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/8, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols - x); - - if ( y+lidy < src_rows ) + if(ly == 0) { - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_D) - { - for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((F)py) * sy; // m03 - 
m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((F)py) * sy; // m03 - lm[8] = ((F)x1.s0) * sy; // m12 - lm[7] = ((F)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } barrier(CLK_LOCAL_MEM_FENCE); - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lidy == 0&&lidx == 0) - { - for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - - F xm = x * mom[0], ym = y * mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + 
m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - } + if(ly < 10) + dst_m[10 * gidy * dst_step + ly * dst_step + gidx] = mom[ly][1]; } diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index 685eccf688..0d2f26f966 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -200,7 +200,7 @@ __kernel void normalize_hists_36_kernel(__global float* block_hists, //------------------------------------------------------------- // Normalization of histograms via L2Hys_norm // -float reduce_smem(volatile __local float* smem, int size) +static float reduce_smem(volatile __local float* smem, int size) { unsigned int tid = get_local_id(0); float sum = smem[tid]; @@ -564,7 +564,6 @@ __kernel void compute_gradients_8UC4_kernel( const int x = get_global_id(0); const int tid = get_local_id(0); const int gSizeX = get_local_size(0); - const int gidX = get_group_id(0); const int gidY = get_group_id(1); __global const uchar4* row = img + gidY * 
img_step; @@ -667,7 +666,6 @@ __kernel void compute_gradients_8UC1_kernel( const int x = get_global_id(0); const int tid = get_local_id(0); const int gSizeX = get_local_size(0); - const int gidX = get_group_id(0); const int gidY = get_group_id(1); __global const uchar* row = img + gidY * img_step; diff --git a/modules/ocl/src/opencl/operator_copyToM.cl b/modules/ocl/src/opencl/operator_copyToM.cl index 69b5ea4ab4..dcf5af975e 100644 --- a/modules/ocl/src/opencl/operator_copyToM.cl +++ b/modules/ocl/src/opencl/operator_copyToM.cl @@ -16,7 +16,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --git a/modules/ocl/src/opencl/operator_setTo.cl b/modules/ocl/src/opencl/operator_setTo.cl index 1d2ad65977..8ac480347e 100644 --- a/modules/ocl/src/opencl/operator_setTo.cl +++ b/modules/ocl/src/opencl/operator_setTo.cl @@ -16,7 +16,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. 
diff --git a/modules/ocl/src/opencl/operator_setToM.cl b/modules/ocl/src/opencl/operator_setToM.cl index a1cb092f87..8a489da9dc 100644 --- a/modules/ocl/src/opencl/operator_setToM.cl +++ b/modules/ocl/src/opencl/operator_setToM.cl @@ -16,7 +16,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --git a/modules/ocl/src/opencl/optical_flow_farneback.cl b/modules/ocl/src/opencl/optical_flow_farneback.cl index 917f7f215d..4725662c60 100644 --- a/modules/ocl/src/opencl/optical_flow_farneback.cl +++ b/modules/ocl/src/opencl/optical_flow_farneback.cl @@ -44,10 +44,10 @@ //M*/ -#define tx get_local_id(0) +#define tx (int)get_local_id(0) #define ty get_local_id(1) #define bx get_group_id(0) -#define bdx get_local_size(0) +#define bdx (int)get_local_size(0) #define BORDER_SIZE 5 #define MAX_KSIZE_HALF 100 diff --git a/modules/ocl/src/opencl/pyr_down.cl b/modules/ocl/src/opencl/pyr_down.cl index e09846457c..6f10067e9f 100644 --- a/modules/ocl/src/opencl/pyr_down.cl +++ b/modules/ocl/src/opencl/pyr_down.cl @@ -43,32 +43,32 @@ // //M*/ -int idx_row_low(int y, int last_row) +inline int idx_row_low(int y, int last_row) { return abs(y) % (last_row + 1); } -int idx_row_high(int y, int last_row) +inline int idx_row_high(int y, int last_row) { return abs(last_row - (int)abs(last_row - y)) % (last_row + 1); } -int idx_row(int y, int last_row) +inline int idx_row(int y, int last_row) { return idx_row_low(idx_row_high(y, last_row), last_row); } -int idx_col_low(int x, int last_col) +inline int idx_col_low(int x, int last_col) { return abs(x) % (last_col + 1); } -int idx_col_high(int x, int 
last_col) +inline int idx_col_high(int x, int last_col) { return abs(last_col - (int)abs(last_col - x)) % (last_col + 1); } -int idx_col(int x, int last_col) +inline int idx_col(int x, int last_col) { return idx_col_low(idx_col_high(x, last_col), last_col); } diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl index 85f4d39343..a7fc27838b 100644 --- a/modules/ocl/src/opencl/pyrlk.cl +++ b/modules/ocl/src/opencl/pyrlk.cl @@ -53,7 +53,8 @@ #define WAVE_SIZE 1 #endif #ifdef CPU -void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) + +static void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) { smem1[tid] = val1; smem2[tid] = val2; @@ -72,7 +73,7 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local } } -void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) +static void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) { smem1[tid] = val1; smem2[tid] = val2; @@ -89,7 +90,7 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l } } -void reduce1(float val1, volatile __local float* smem1, int tid) +static void reduce1(float val1, volatile __local float* smem1, int tid) { smem1[tid] = val1; barrier(CLK_LOCAL_MEM_FENCE); @@ -104,7 +105,7 @@ void reduce1(float val1, volatile __local float* smem1, int tid) } } #else -void reduce3(float val1, float val2, float val3, +static void reduce3(float val1, float val2, float val3, __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid) { smem1[tid] = val1; @@ -151,7 +152,7 @@ void reduce3(float val1, float val2, float val3, barrier(CLK_LOCAL_MEM_FENCE); } -void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid) 
+static void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid) { smem1[tid] = val1; smem2[tid] = val2; @@ -190,7 +191,7 @@ void reduce2(float val1, float val2, __local volatile float* smem1, __local vola barrier(CLK_LOCAL_MEM_FENCE); } -void reduce1(float val1, __local volatile float* smem1, int tid) +static void reduce1(float val1, __local volatile float* smem1, int tid) { smem1[tid] = val1; barrier(CLK_LOCAL_MEM_FENCE); @@ -226,7 +227,7 @@ void reduce1(float val1, __local volatile float* smem1, int tid) // Image read mode __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; -void SetPatch(image2d_t I, float x, float y, +static void SetPatch(image2d_t I, float x, float y, float* Pch, float* Dx, float* Dy, float* A11, float* A12, float* A22) { @@ -247,7 +248,7 @@ void SetPatch(image2d_t I, float x, float y, *A22 += dIdy * dIdy; } -void GetPatch(image2d_t J, float x, float y, +inline void GetPatch(image2d_t J, float x, float y, float* Pch, float* Dx, float* Dy, float* b1, float* b2) { @@ -257,13 +258,13 @@ void GetPatch(image2d_t J, float x, float y, *b2 += diff**Dy; } -void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval) +inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval) { float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch; *errval += fabs(diff); } -void SetPatch4(image2d_t I, const float x, const float y, +static void SetPatch4(image2d_t I, const float x, const float y, float4* Pch, float4* Dx, float4* Dy, float* A11, float* A12, float* A22) { @@ -286,7 +287,7 @@ void SetPatch4(image2d_t I, const float x, const float y, *A22 += sqIdx.x + sqIdx.y + sqIdx.z; } -void GetPatch4(image2d_t J, const float x, const float y, +static void GetPatch4(image2d_t J, const float x, const float y, const float4* Pch, const float4* Dx, const float4* Dy, float* b1, float* b2) { @@ -298,7 
+299,7 @@ void GetPatch4(image2d_t J, const float x, const float y, *b2 += xdiff.x + xdiff.y + xdiff.z; } -void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval) +static void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval) { float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch; *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); @@ -318,7 +319,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, unsigned int gid=get_group_id(0); unsigned int xsize=get_local_size(0); unsigned int ysize=get_local_size(1); - int xBase, yBase, i, j, k; + int xBase, yBase, k; float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); @@ -597,7 +598,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, unsigned int gid=get_group_id(0); unsigned int xsize=get_local_size(0); unsigned int ysize=get_local_size(1); - int xBase, yBase, i, j, k; + int xBase, yBase, k; float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); diff --git a/modules/ocl/src/opencl/split_mat.cl b/modules/ocl/src/opencl/split_mat.cl index b59e6b75b1..b9aa048b07 100644 --- a/modules/ocl/src/opencl/split_mat.cl +++ b/modules/ocl/src/opencl/split_mat.cl @@ -10,13 +10,9 @@ // License Agreement // For Open Source Computer Vision Library // -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // @@ -46,1177 +42,171 @@ #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif -/////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////optimized code using vector //////////////////////////////// -////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)////// -//////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset, - __global uchar *mat_dst0, int dst0_step, int dst0_offset, - __global uchar *mat_dst1, int dst1_step, int dst1_offset, - __global uchar *mat_dst2, int dst2_step, int dst2_offset, - __global uchar *mat_dst3, int dst3_step, int dst3_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - x = x << 2; - - int src_idx = mad24(y, src_step, src_offset + (x << 2)); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc; - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc; - - int dst2_start = mad24(y, dst2_step, dst2_offset); - int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); - int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc; - - int dst3_start = mad24(y, dst3_step, dst3_offset); - int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); - int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc; - - uchar4 data_0 = 
*((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx))); - uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx))); - uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx))); - uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 )); - - int total_bytes = src_offset + rows * src_step; - uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx))); - uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx))); - uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx))); - - uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3; - - if((dst0_offset & 3) == 3) - tmp_data0 = (uchar4)(data_0.x, data_1.x, data_2.x, data_3.x); - if((dst0_offset & 3) == 2) - tmp_data0 = (uchar4)(data_1.x, data_2.x, data_3.x, data_4.x); - if((dst0_offset & 3) == 1) - tmp_data0 = (uchar4)(data_2.x, data_3.x, data_4.x, data_5.x); - if((dst0_offset & 3) == 0) - tmp_data0 = (uchar4)(data_3.x, data_4.x, data_5.x, data_6.x); - - if((dst1_offset & 3) == 3) - tmp_data1 = (uchar4)(data_0.y, data_1.y, data_2.y, data_3.y); - if((dst1_offset & 3) == 2) - tmp_data1 = (uchar4)(data_1.y, data_2.y, data_3.y, data_4.y); - if((dst1_offset & 3) == 1) - tmp_data1 = (uchar4)(data_2.y, data_3.y, data_4.y, data_5.y); - if((dst1_offset & 3) == 0) - tmp_data1 = (uchar4)(data_3.y, data_4.y, data_5.y, data_6.y); - - if((dst2_offset & 3) == 3) - tmp_data2 = (uchar4)(data_0.z, data_1.z, data_2.z, data_3.z); - if((dst2_offset & 3) == 2) - tmp_data2 = (uchar4)(data_1.z, data_2.z, data_3.z, data_4.z); - if((dst2_offset & 3) == 1) - tmp_data2 = (uchar4)(data_2.z, data_3.z, data_4.z, data_5.z); - if((dst2_offset & 3) == 0) - tmp_data2 = (uchar4)(data_3.z, data_4.z, data_5.z, data_6.z); - - if((dst3_offset & 3) == 3) - tmp_data3 = (uchar4)(data_0.w, data_1.w, data_2.w, data_3.w); - if((dst3_offset & 3) == 2) - 
tmp_data3 = (uchar4)(data_1.w, data_2.w, data_3.w, data_4.w); - if((dst3_offset & 3) == 1) - tmp_data3 = (uchar4)(data_2.w, data_3.w, data_4.w, data_5.w); - if((dst3_offset & 3) == 0) - tmp_data3 = (uchar4)(data_3.w, data_4.w, data_5.w, data_6.w); - - uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx)); - uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx)); - uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx)); - uchar4 dst3_data = *((__global uchar4 *)(mat_dst3 + dst3_idx)); - - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x; - tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y; - tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z; - tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w; - - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x; - tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y; - tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z; - tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w; - - tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x; - tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y; - tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z; - tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w; - - tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x; - tmp_data3.y = ((dst3_idx + 1 >= dst3_start) && (dst3_idx + 1 < dst3_end)) ? 
tmp_data3.y : dst3_data.y; - tmp_data3.z = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.z : dst3_data.z; - tmp_data3.w = ((dst3_idx + 3 >= dst3_start) && (dst3_idx + 3 < dst3_end)) ? tmp_data3.w : dst3_data.w; - - *((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0; - *((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1; - *((__global uchar4 *)(mat_dst2 + dst2_idx)) = tmp_data2; - *((__global uchar4 *)(mat_dst3 + dst3_idx)) = tmp_data3; - } -} - -__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset, - __global uchar *mat_dst0, int dst0_step, int dst0_offset, - __global uchar *mat_dst1, int dst1_step, int dst1_offset, - __global uchar *mat_dst2, int dst2_step, int dst2_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - x = x << 2; - - int src_idx = mad24(y, src_step, src_offset); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - - int dst2_start = mad24(y, dst2_step, dst2_offset); - int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); - int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); - - uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx)); - uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx)); - uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx)); - - uchar4 tmp_data0, tmp_data1, tmp_data2; - - uchar src_data_0 = *(mat_src + src_idx + 3 * x - 9); - uchar src_data_1 = *(mat_src + src_idx + 3 * x - 8); - uchar src_data_2 = *(mat_src + src_idx + 3 * x - 7); - - uchar src_data_3 = *(mat_src + src_idx + 3 * x - 6); - 
uchar src_data_4 = *(mat_src + src_idx + 3 * x - 5); - uchar src_data_5 = *(mat_src + src_idx + 3 * x - 4); - - uchar src_data_6 = *(mat_src + src_idx + 3 * x - 3); - uchar src_data_7 = *(mat_src + src_idx + 3 * x - 2); - uchar src_data_8 = *(mat_src + src_idx + 3 * x - 1); - - uchar src_data_9 = *(mat_src + src_idx + 3 * x + 0); - uchar src_data_10 = *(mat_src + src_idx + 3 * x + 1); - uchar src_data_11 = *(mat_src + src_idx + 3 * x + 2); - - uchar src_data_12 = *(mat_src + src_idx + 3 * x + 3); - uchar src_data_13 = *(mat_src + src_idx + 3 * x + 4); - uchar src_data_14 = *(mat_src + src_idx + 3 * x + 5); - - uchar src_data_15 = *(mat_src + src_idx + 3 * x + 6); - uchar src_data_16 = *(mat_src + src_idx + 3 * x + 7); - uchar src_data_17 = *(mat_src + src_idx + 3 * x + 8); - - uchar src_data_18 = *(mat_src + src_idx + 3 * x + 9); - uchar src_data_19 = *(mat_src + src_idx + 3 * x + 10); - uchar src_data_20 = *(mat_src + src_idx + 3 * x + 11); - - uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18}; - int index = 3 - dst0_offset & 3; - tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]); - - uchar4 data0, data1, data2; - - data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10); - data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0; - data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1; - tmp_data1 = (dst1_offset & 3) == 0 ? (uchar4)(src_data_10, src_data_13, src_data_16, src_data_19): data2; - - data0 = (uchar4)(src_data_2, src_data_5, src_data_8, src_data_11); - data1 = (dst2_offset & 3) == 2 ? (uchar4)(src_data_5, src_data_8, src_data_11, src_data_14) : data0; - data2 = (dst2_offset & 3) == 1 ? (uchar4)(src_data_8, src_data_11, src_data_14, src_data_17) : data1; - tmp_data2 = (dst2_offset & 3) == 0 ? 
(uchar4)(src_data_11, src_data_14, src_data_17, src_data_20) : data2; - - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x; - tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y; - tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z; - tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w; - - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x; - tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y; - tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z; - tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w; - - tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x; - tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y; - tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z; - tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? 
tmp_data2.w : dst2_data.w; - - *((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0; - *((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1; - *((__global uchar4 *)(mat_dst2 + dst2_idx)) = tmp_data2; - } -} - -__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset, - __global uchar *mat_dst0, int dst0_step, int dst0_offset, - __global uchar *mat_dst1, int dst1_step, int dst1_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - x = x << 2; - - #define dst0_align ((dst0_offset & 3) << 1) - #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - - int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; - int src2_index_fix = src_idx_1 < 0 ? 
0 : src_idx_1; - uchar8 src_data_0 = vload8(0, mat_src + src_idx_0); - uchar8 src_data_1 = vload8(0, mat_src + src_idx_1); - if(src_idx_0 == -6) - src_data_0.s01234567 = src_data_0.s67012345; - if(src_idx_0 == -4) - src_data_0.s01234567 = src_data_0.s45670123; - if(src_idx_0 == -2) - src_data_0.s01234567 = src_data_0.s23456701; - if(src_idx_1 == -6) - src_data_1.s01234567 = src_data_1.s67012345; - if(src_idx_1 == -4) - src_data_1.s01234567 = src_data_1.s45670123; - if(src_idx_1 == -2) - src_data_1.s01234567 = src_data_1.s23456701; - - uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx)); - uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx)); - - uchar4 tmp_data0, tmp_data1; - - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.s0 : dst0_data.x; - tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? src_data_0.s2 : dst0_data.y; - tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.s4 : dst0_data.z; - tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? src_data_0.s6 : dst0_data.w; - - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.s1 : dst1_data.x; - tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? src_data_1.s3 : dst1_data.y; - tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.s5 : dst1_data.z; - tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? 
src_data_1.s7 : dst1_data.w; - - *((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0; - *((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1; - } -} - -__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset, - __global char *mat_dst0, int dst0_step, int dst0_offset, - __global char *mat_dst1, int dst1_step, int dst1_offset, - __global char *mat_dst2, int dst2_step, int dst2_offset, - __global char *mat_dst3, int dst3_step, int dst3_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - x = x << 2; - - int src_idx = mad24(y, src_step, src_offset + (x << 2)); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - - int dst2_start = mad24(y, dst2_step, dst2_offset); - int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); - int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); - - int dst3_start = mad24(y, dst3_step, dst3_offset); - int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); - int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc); - - char4 data_0 = *((global char4 *)(mat_src + src_idx - 12)); - char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 )); - char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 )); - char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 )); - char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 )); - char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 )); - char4 data_6 = *((global char4 *)(mat_src + src_idx + 12)); - - char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3; - - if((dst0_offset & 3) == 3) - tmp_data0 = 
(char4)(data_0.x, data_1.x, data_2.x, data_3.x); - if((dst0_offset & 3) == 2) - tmp_data0 = (char4)(data_1.x, data_2.x, data_3.x, data_4.x); - if((dst0_offset & 3) == 1) - tmp_data0 = (char4)(data_2.x, data_3.x, data_4.x, data_5.x); - if((dst0_offset & 3) == 0) - tmp_data0 = (char4)(data_3.x, data_4.x, data_5.x, data_6.x); - - if((dst1_offset & 3) == 3) - tmp_data1 = (char4)(data_0.y, data_1.y, data_2.y, data_3.y); - if((dst1_offset & 3) == 2) - tmp_data1 = (char4)(data_1.y, data_2.y, data_3.y, data_4.y); - if((dst1_offset & 3) == 1) - tmp_data1 = (char4)(data_2.y, data_3.y, data_4.y, data_5.y); - if((dst1_offset & 3) == 0) - tmp_data1 = (char4)(data_3.y, data_4.y, data_5.y, data_6.y); - - if((dst2_offset & 3) == 3) - tmp_data2 = (char4)(data_0.z, data_1.z, data_2.z, data_3.z); - if((dst2_offset & 3) == 2) - tmp_data2 = (char4)(data_1.z, data_2.z, data_3.z, data_4.z); - if((dst2_offset & 3) == 1) - tmp_data2 = (char4)(data_2.z, data_3.z, data_4.z, data_5.z); - if((dst2_offset & 3) == 0) - tmp_data2 = (char4)(data_3.z, data_4.z, data_5.z, data_6.z); - - if((dst3_offset & 3) == 3) - tmp_data3 = (char4)(data_0.w, data_1.w, data_2.w, data_3.w); - if((dst3_offset & 3) == 2) - tmp_data3 = (char4)(data_1.w, data_2.w, data_3.w, data_4.w); - if((dst3_offset & 3) == 1) - tmp_data3 = (char4)(data_2.w, data_3.w, data_4.w, data_5.w); - if((dst3_offset & 3) == 0) - tmp_data3 = (char4)(data_3.w, data_4.w, data_5.w, data_6.w); - - char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx)); - char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx)); - char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx)); - char4 dst3_data = *((__global char4 *)(mat_dst3 + dst3_idx)); - - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x; - tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y; - tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? 
tmp_data0.z : dst0_data.z; - tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w; - - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x; - tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y; - tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z; - tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w; - - tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x; - tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y; - tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z; - tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w; - - tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x; - tmp_data3.y = ((dst3_idx + 1 >= dst3_start) && (dst3_idx + 1 < dst3_end)) ? tmp_data3.y : dst3_data.y; - tmp_data3.z = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.z : dst3_data.z; - tmp_data3.w = ((dst3_idx + 3 >= dst3_start) && (dst3_idx + 3 < dst3_end)) ? 
tmp_data3.w : dst3_data.w; - - *((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0; - *((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1; - *((__global char4 *)(mat_dst2 + dst2_idx)) = tmp_data2; - *((__global char4 *)(mat_dst3 + dst3_idx)) = tmp_data3; - } -} - -__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset, - __global char *mat_dst0, int dst0_step, int dst0_offset, - __global char *mat_dst1, int dst1_step, int dst1_offset, - __global char *mat_dst2, int dst2_step, int dst2_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - x = x << 2; - - int src_idx = mad24(y, src_step, src_offset); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - - int dst2_start = mad24(y, dst2_step, dst2_offset); - int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); - int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); - - char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx)); - char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx)); - char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx)); - - char4 tmp_data0, tmp_data1, tmp_data2; - - char src_data_0 = *(mat_src + src_idx + 3 * x - 9); - char src_data_1 = *(mat_src + src_idx + 3 * x - 8); - char src_data_2 = *(mat_src + src_idx + 3 * x - 7); - - char src_data_3 = *(mat_src + src_idx + 3 * x - 6); - char src_data_4 = *(mat_src + src_idx + 3 * x - 5); - char src_data_5 = *(mat_src + src_idx + 3 * x - 4); - - char src_data_6 = *(mat_src + src_idx + 3 * x - 3); - char src_data_7 = *(mat_src + src_idx + 3 * x - 2); - char 
src_data_8 = *(mat_src + src_idx + 3 * x - 1); - - char src_data_9 = *(mat_src + src_idx + 3 * x + 0); - char src_data_10 = *(mat_src + src_idx + 3 * x + 1); - char src_data_11 = *(mat_src + src_idx + 3 * x + 2); - - char src_data_12 = *(mat_src + src_idx + 3 * x + 3); - char src_data_13 = *(mat_src + src_idx + 3 * x + 4); - char src_data_14 = *(mat_src + src_idx + 3 * x + 5); - - char src_data_15 = *(mat_src + src_idx + 3 * x + 6); - char src_data_16 = *(mat_src + src_idx + 3 * x + 7); - char src_data_17 = *(mat_src + src_idx + 3 * x + 8); - - char src_data_18 = *(mat_src + src_idx + 3 * x + 9); - char src_data_19 = *(mat_src + src_idx + 3 * x + 10); - char src_data_20 = *(mat_src + src_idx + 3 * x + 11); - - char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18}; - int index = 3 - dst0_offset & 3; - tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]); - - char4 data0, data1, data2; - - data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10); - data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0; - data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1; - tmp_data1 = (dst1_offset & 3) == 0 ? (char4)(src_data_10, src_data_13, src_data_16, src_data_19): data2; - - data0 = (char4)(src_data_2, src_data_5, src_data_8, src_data_11); - data1 = (dst2_offset & 3) == 2 ? (char4)(src_data_5, src_data_8, src_data_11, src_data_14) : data0; - data2 = (dst2_offset & 3) == 1 ? (char4)(src_data_8, src_data_11, src_data_14, src_data_17) : data1; - tmp_data2 = (dst2_offset & 3) == 0 ? (char4)(src_data_11, src_data_14, src_data_17, src_data_20) : data2; - - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x; - tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? 
tmp_data0.y : dst0_data.y; - tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z; - tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w; - - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x; - tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y; - tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z; - tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w; - - tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x; - tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y; - tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z; - tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? 
tmp_data2.w : dst2_data.w; - - *((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0; - *((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1; - *((__global char4 *)(mat_dst2 + dst2_idx)) = tmp_data2; - } -} - -__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset, - __global char *mat_dst0, int dst0_step, int dst0_offset, - __global char *mat_dst1, int dst1_step, int dst1_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - x = x << 2; - - #define dst0_align ((dst0_offset & 3) << 1) - #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; - int src2_index_fix = src_idx_1 < 0 ? 
0 : src_idx_1; - char8 src_data_0 = vload8(0, mat_src + src_idx_0); - char8 src_data_1 = vload8(0, mat_src + src_idx_1); - if(src_idx_0 == -6) - src_data_0.s01234567 = src_data_0.s67012345; - if(src_idx_0 == -4) - src_data_0.s01234567 = src_data_0.s45670123; - if(src_idx_0 == -2) - src_data_0.s01234567 = src_data_0.s23456701; - if(src_idx_1 == -6) - src_data_1.s01234567 = src_data_1.s67012345; - if(src_idx_1 == -4) - src_data_1.s01234567 = src_data_1.s45670123; - if(src_idx_1 == -2) - src_data_1.s01234567 = src_data_1.s23456701; - char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx)); - char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx)); - - char4 tmp_data0, tmp_data1; - - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.s0 : dst0_data.x; - tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? src_data_0.s2 : dst0_data.y; - tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.s4 : dst0_data.z; - tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? src_data_0.s6 : dst0_data.w; - - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.s1 : dst1_data.x; - tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? src_data_1.s3 : dst1_data.y; - tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.s5 : dst1_data.z; - tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? 
src_data_1.s7 : dst1_data.w; - - *((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0; - *((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1; - } -} - -__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset, - __global ushort *mat_dst0, int dst0_step, int dst0_offset, - __global ushort *mat_dst1, int dst1_step, int dst1_offset, - __global ushort *mat_dst2, int dst2_step, int dst2_offset, - __global ushort *mat_dst3, int dst3_step, int dst3_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - x = x << 1; - - int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); - int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - - int dst2_start = mad24(y, dst2_step, dst2_offset); - int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); - int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); - - int dst3_start = mad24(y, dst3_step, dst3_offset); - int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); - int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc); - - int src1_index_fix = src_idx_0 < 0 ? 
0 : src_idx_0; - ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0)); - if(src_idx_0 == -6) - src_data0.s01234567 = src_data0.s67012345; - if(src_idx_0 == -4) - src_data0.s01234567 = src_data0.s45670123; - if(src_idx_0 == -2) - src_data0.s01234567 = src_data0.s23456701; - ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1)); - - ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)); - ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)); - ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx)); - ushort2 dst3_data = *((__global ushort2 *)((__global char *)mat_dst3 + dst3_idx)); - - ushort2 tmp_data0, tmp_data1, tmp_data2, tmp_data3; - - tmp_data0 = (dst0_offset & 3) == 0 ? (ushort2)(src_data0.s4, src_data1.s0) : (ushort2)(src_data0.s0, src_data0.s4); - tmp_data1 = (dst1_offset & 3) == 0 ? (ushort2)(src_data0.s5, src_data1.s1) : (ushort2)(src_data0.s1, src_data0.s5); - tmp_data2 = (dst2_offset & 3) == 0 ? (ushort2)(src_data0.s6, src_data1.s2) : (ushort2)(src_data0.s2, src_data0.s6); - tmp_data3 = (dst3_offset & 3) == 0 ? (ushort2)(src_data0.s7, src_data1.s3) : (ushort2)(src_data0.s3, src_data0.s7); - - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x; - tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y; - - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x; - tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y; - - tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x; - tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y; - - tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? 
tmp_data3.x : dst3_data.x; - tmp_data3.y = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.y : dst3_data.y; - - *((global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0; - *((global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1; - *((global ushort2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2; - *((global ushort2 *)((__global char *)mat_dst3 + dst3_idx)) = tmp_data3; - } -} - -__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset, - __global ushort *mat_dst0, int dst0_step, int dst0_offset, - __global ushort *mat_dst1, int dst1_step, int dst1_offset, - __global ushort *mat_dst2, int dst2_step, int dst2_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - x = x << 1; - - int src_idx = mad24(y, src_step, src_offset); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - - int dst2_start = mad24(y, dst2_step, dst2_offset); - int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); - int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); - - ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)); - ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)); - ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx)); - - ushort2 tmp_data0, tmp_data1, tmp_data2; - - ushort src_data_0 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 3]; - ushort src_data_1 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 2]; - 
ushort src_data_2 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 1]; - ushort src_data_3 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 0]; - ushort src_data_4 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 1]; - ushort src_data_5 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 2]; - ushort src_data_6 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 3]; - ushort src_data_7 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 4]; - ushort src_data_8 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 5]; - - tmp_data0 = (dst0_offset & 3) == 0 ? (ushort2)(src_data_3, src_data_6) : (ushort2)(src_data_0, src_data_3); - tmp_data1 = (dst1_offset & 3) == 0 ? (ushort2)(src_data_4, src_data_7) : (ushort2)(src_data_1, src_data_4); - tmp_data2 = (dst2_offset & 3) == 0 ? (ushort2)(src_data_5, src_data_8) : (ushort2)(src_data_2, src_data_5); - - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x; - tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y; - - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x; - tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y; - - tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x; - tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? 
tmp_data2.y : dst2_data.y; - - *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0; - *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1; - *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2; - } -} - -__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset, - __global ushort *mat_dst0, int dst0_step, int dst0_offset, - __global ushort *mat_dst1, int dst1_step, int dst1_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - x = x << 1; - - #define dst0_align ((dst0_offset & 3) << 1) - #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - - int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; - int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; - ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix)); - ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix)); - if(src_idx_0 < 0) - { - ushort4 tmp; - tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx; - src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw; - } - if(src_idx_1 < 0) - { - ushort4 tmp; - tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx; - src_data_1.xyzw = (src_idx_1 == -1) ? 
src_data_1.wxyz : tmp.xyzw; - } - - ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)); - ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)); - - ushort2 tmp_data0, tmp_data1; - - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.x : dst0_data.x; - tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.z : dst0_data.y; - - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.y : dst1_data.x; - tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.w : dst1_data.y; - - *((global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0; - *((global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1; - } -} -__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset, - __global short *mat_dst0, int dst0_step, int dst0_offset, - __global short *mat_dst1, int dst1_step, int dst1_offset, - __global short *mat_dst2, int dst2_step, int dst2_offset, - __global short *mat_dst3, int dst3_step, int dst3_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - x = x << 1; - - int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); - int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - - int dst2_start = mad24(y, dst2_step, dst2_offset); - int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); - int dst2_idx = mad24(y, dst2_step, dst2_offset 
+ (x << 1) & (int)0xfffffffc); - - int dst3_start = mad24(y, dst3_step, dst3_offset); - int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); - int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc); - int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; - short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0)); - - if(src_idx_0 == -6) - src_data0.s01234567 = src_data0.s67012345; - if(src_idx_0 == -4) - src_data0.s01234567 = src_data0.s45670123; - if(src_idx_0 == -2) - src_data0.s01234567 = src_data0.s23456701; - - short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1)); - - short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); - short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)); - short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx)); - short2 dst3_data = *((__global short2 *)((__global char *)mat_dst3 + dst3_idx)); - - short2 tmp_data0, tmp_data1, tmp_data2, tmp_data3; - - tmp_data0 = (dst0_offset & 3) == 0 ? (short2)(src_data0.s4, src_data1.s0) : (short2)(src_data0.s0, src_data0.s4); - tmp_data1 = (dst1_offset & 3) == 0 ? (short2)(src_data0.s5, src_data1.s1) : (short2)(src_data0.s1, src_data0.s5); - tmp_data2 = (dst2_offset & 3) == 0 ? (short2)(src_data0.s6, src_data1.s2) : (short2)(src_data0.s2, src_data0.s6); - tmp_data3 = (dst3_offset & 3) == 0 ? (short2)(src_data0.s7, src_data1.s3) : (short2)(src_data0.s3, src_data0.s7); - - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x; - tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y; - - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x; - tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? 
tmp_data1.y : dst1_data.y; - - tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x; - tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y; - - tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x; - tmp_data3.y = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.y : dst3_data.y; - - *((global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0; - *((global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1; - *((global short2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2; - *((global short2 *)((__global char *)mat_dst3 + dst3_idx)) = tmp_data3; - } -} -__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset, - __global short *mat_dst0, int dst0_step, int dst0_offset, - __global short *mat_dst1, int dst1_step, int dst1_offset, - __global short *mat_dst2, int dst2_step, int dst2_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - x = x << 1; - - int src_idx = mad24(y, src_step, src_offset); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - - int dst2_start = mad24(y, dst2_step, dst2_offset); - int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); - int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); +#if DATA_DEPTH == 0 +#define BASE_TYPE uchar +#elif DATA_DEPTH == 1 +#error data_depth char, use uchar datatype instead +#elif DATA_DEPTH == 2 +#define BASE_TYPE ushort +#elif 
DATA_DEPTH == 3 +#error data_depth short, use ushort datatype instead +#elif DATA_DEPTH == 4 +#define BASE_TYPE int +#elif DATA_DEPTH == 5 +#define BASE_TYPE float +#elif DATA_DEPTH == 6 +#define BASE_TYPE double +#else +#error data_depth +#endif - short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); - short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)); - short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx)); +#if DATA_CHAN == 2 +#define SRC_VEC_SIZE 2 +#elif DATA_CHAN == 3 +#define SRC_VEC_SIZE 4 // C3 is stored as C4 +#elif DATA_CHAN == 4 +#define SRC_VEC_SIZE 4 +#else +#error data_chan +#endif - short2 tmp_data0, tmp_data1, tmp_data2; +#define __CAT(x, y) x##y +#define CAT(x, y) __CAT(x, y) - short src_data_0 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 3]; - short src_data_1 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 2]; - short src_data_2 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 1]; - short src_data_3 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 0]; - short src_data_4 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 1]; - short src_data_5 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 2]; - short src_data_6 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 3]; - short src_data_7 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 4]; - short src_data_8 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 5]; +#define uchar1 uchar +#define char1 char +#define ushort1 ushort +#define short1 short +#define int1 int +#define float1 float +#define double1 double - tmp_data0 = (dst0_offset & 3) == 0 ? (short2)(src_data_3, src_data_6) : (short2)(src_data_0, src_data_3); - tmp_data1 = (dst1_offset & 3) == 0 ? 
(short2)(src_data_4, src_data_7) : (short2)(src_data_1, src_data_4); - tmp_data2 = (dst2_offset & 3) == 0 ? (short2)(src_data_5, src_data_8) : (short2)(src_data_2, src_data_5); +#define TYPE BASE_TYPE - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x; - tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y; +#define SRC_TYPE CAT(BASE_TYPE, SRC_VEC_SIZE) - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x; - tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y; +#define DST_VEC_TYPE CAT(BASE_TYPE, VEC_SIZE) - tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x; - tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y; - - *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0; - *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1; - *((__global short2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2; - } -} +#define vstore1 vstore +#define VSTORE CAT(vstore, VEC_SIZE) +#define VSTORE_ALIGNED(ptr, v) *((__global DST_VEC_TYPE*)(ptr)) = (v) +#define VSTORE_UNALIGNED(ptr, v) VSTORE((v), 0, (__global TYPE*)(ptr)) +#ifdef DST0_ALIGNED +#define VSTORE_dst0 VSTORE_ALIGNED +#else +#define VSTORE_dst0 VSTORE_UNALIGNED +#endif +#ifdef DST1_ALIGNED +#define VSTORE_dst1 VSTORE_ALIGNED +#else +#define VSTORE_dst1 VSTORE_UNALIGNED +#endif +#ifdef DST2_ALIGNED +#define VSTORE_dst2 VSTORE_ALIGNED +#else +#define VSTORE_dst2 VSTORE_UNALIGNED +#endif +#ifdef DST3_ALIGNED +#define VSTORE_dst3 VSTORE_ALIGNED +#else +#define VSTORE_dst3 VSTORE_UNALIGNED +#endif -__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset, - __global short *mat_dst0, int dst0_step, int dst0_offset, - __global short *mat_dst1, int dst1_step, 
int dst1_offset, - int rows, int cols, int dst_step1) +__kernel void split_vector( + __global SRC_TYPE* src, int srcStepBytes, int2 srcOffset, // offset.x in bytes + __global TYPE* dst0, int dst0StepBytes, int2 dst0Offset, + __global TYPE* dst1, int dst1StepBytes, int2 dst1Offset, +#if DATA_CHAN > 2 + __global TYPE* dst2, int dst2StepBytes, int2 dst2Offset, +#endif +#if DATA_CHAN > 3 + __global TYPE* dst3, int dst3StepBytes, int2 dst3Offset, +#endif + int2 size) { - int x = get_global_id(0); + int x = get_global_id(0) * VEC_SIZE; int y = get_global_id(1); - if((x < cols) && (y < rows)) + if (x < size.x && y < size.y) { - x = x << 1; - - #define dst0_align ((dst0_offset & 3) << 1) - #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); - - int dst0_start = mad24(y, dst0_step, dst0_offset); - int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); - int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - - int dst1_start = mad24(y, dst1_step, dst1_offset); - int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); - int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; - int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; - short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0)); - short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1)); - if(src_idx_0 < 0) + SRC_TYPE srcData[VEC_SIZE]; + int xOffsetLimitBytes = srcOffset.x + size.x * sizeof(SRC_TYPE); + int xOffsetBytes = srcOffset.x + x * sizeof(SRC_TYPE); + int yOffsetBytes = (srcOffset.y + y) * srcStepBytes; +#pragma unroll + for (int i = 0; i < VEC_SIZE; i++, xOffsetBytes += sizeof(SRC_TYPE)) { - short4 tmp; - tmp.xyzw = (src_idx_0 == -2) ? 
src_data_0.zwxy : src_data_0.yzwx; - src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw; + srcData[i] = (xOffsetBytes >= xOffsetLimitBytes) ? (SRC_TYPE)0 : + *(__global SRC_TYPE*)((__global char*)src + yOffsetBytes + xOffsetBytes); } - if(src_idx_1< 0) - { - short4 tmp; - tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx; - src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw; - } - - - short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); - short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)); - - short2 tmp_data0, tmp_data1; - - tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.x : dst0_data.x; - tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.z : dst0_data.y; - tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.y : dst1_data.x; - tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? 
src_data_1.w : dst1_data.y; - - *((global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0; - *((global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1; - } -} -__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset, - __global int *mat_dst0, int dst0_step, int dst0_offset, - __global int *mat_dst1, int dst1_step, int dst1_offset, - __global int *mat_dst2, int dst2_step, int dst2_offset, - __global int *mat_dst3, int dst3_step, int dst3_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - int src_idx = mad24(y, src_step, src_offset); - int dst0_idx = mad24(y, dst0_step, dst0_offset); - int dst1_idx = mad24(y, dst1_step, dst1_offset); - int dst2_idx = mad24(y, dst2_step, dst2_offset); - int dst3_idx = mad24(y, dst3_step, dst3_offset); - - int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x]; - - ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; - ((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y; - ((__global int *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z; - ((__global int *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w; - } -} -__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset, - __global int *mat_dst0, int dst0_step, int dst0_offset, - __global int *mat_dst1, int dst1_step, int dst1_offset, - __global int *mat_dst2, int dst2_step, int dst2_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - int src_idx = mad24(y, src_step, src_offset); - int dst0_idx = mad24(y, dst0_step, dst0_offset); - int dst1_idx = mad24(y, dst1_step, dst1_offset); - int dst2_idx = mad24(y, dst2_step, dst2_offset); - - int src_data_0 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 0]; - int src_data_1 = 
((__global int *)((__global char *)mat_src + src_idx))[3 * x + 1]; - int src_data_2 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 2]; - - ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0; - ((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1; - ((__global int *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2; - } -} - -__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset, - __global int *mat_dst0, int dst0_step, int dst0_offset, - __global int *mat_dst1, int dst1_step, int dst1_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - int src_idx = mad24(y, src_step, src_offset); - int dst0_idx = mad24(y, dst0_step, dst0_offset); - int dst1_idx = mad24(y, dst1_step, dst1_offset); - - int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x]; - - ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; - ((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y; - } -} - -__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset, - __global float *mat_dst0, int dst0_step, int dst0_offset, - __global float *mat_dst1, int dst1_step, int dst1_offset, - __global float *mat_dst2, int dst2_step, int dst2_offset, - __global float *mat_dst3, int dst3_step, int dst3_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - int src_idx = mad24(y, src_step, src_offset); - int dst0_idx = mad24(y, dst0_step, dst0_offset); - int dst1_idx = mad24(y, dst1_step, dst1_offset); - int dst2_idx = mad24(y, dst2_step, dst2_offset); - int dst3_idx = mad24(y, dst3_step, dst3_offset); - - float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x]; - - ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = 
src_data.x; - ((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y; - ((__global float *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z; - ((__global float *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w; - } -} - -__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset, - __global float *mat_dst0, int dst0_step, int dst0_offset, - __global float *mat_dst1, int dst1_step, int dst1_offset, - __global float *mat_dst2, int dst2_step, int dst2_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - int src_idx = mad24(y, src_step, src_offset); - int dst0_idx = mad24(y, dst0_step, dst0_offset); - int dst1_idx = mad24(y, dst1_step, dst1_offset); - int dst2_idx = mad24(y, dst2_step, dst2_offset); - - float src_data_0 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 0]; - float src_data_1 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 1]; - float src_data_2 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 2]; - - ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0; - ((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1; - ((__global float *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2; - } -} - -__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset, - __global float *mat_dst0, int dst0_step, int dst0_offset, - __global float *mat_dst1, int dst1_step, int dst1_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - int src_idx = mad24(y, src_step, src_offset); - int dst0_idx = mad24(y, dst0_step, dst0_offset); - int dst1_idx = mad24(y, dst1_step, dst1_offset); - - float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x]; - - ((__global float *)((__global char 
*)mat_dst0 + dst0_idx))[x] = src_data.x; - ((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset, - __global double *mat_dst0, int dst0_step, int dst0_offset, - __global double *mat_dst1, int dst1_step, int dst1_offset, - __global double *mat_dst2, int dst2_step, int dst2_offset, - __global double *mat_dst3, int dst3_step, int dst3_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - int src_idx = mad24(y, src_step, src_offset); - int dst0_idx = mad24(y, dst0_step, dst0_offset); - int dst1_idx = mad24(y, dst1_step, dst1_offset); - int dst2_idx = mad24(y, dst2_step, dst2_offset); - int dst3_idx = mad24(y, dst3_step, dst3_offset); - - double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x]; - - ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; - ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y; - ((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z; - ((__global double *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w; - } -} - -__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset, - __global double *mat_dst0, int dst0_step, int dst0_offset, - __global double *mat_dst1, int dst1_step, int dst1_offset, - __global double *mat_dst2, int dst2_step, int dst2_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - int src_idx = mad24(y, src_step, src_offset); - int dst0_idx = mad24(y, dst0_step, dst0_offset); - int dst1_idx = mad24(y, dst1_step, dst1_offset); - int dst2_idx = mad24(y, dst2_step, dst2_offset); - - double src_data_0 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 0]; - 
double src_data_1 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 1]; - double src_data_2 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 2]; - - ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0; - ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1; - ((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2; - } -} - -__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset, - __global double *mat_dst0, int dst0_step, int dst0_offset, - __global double *mat_dst1, int dst1_step, int dst1_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if((x < cols) && (y < rows)) - { - int src_idx = mad24(y, src_step, src_offset); - int dst0_idx = mad24(y, dst0_step, dst0_offset); - int dst1_idx = mad24(y, dst1_step, dst1_offset); +#if VEC_SIZE == 1 + TYPE dstC0 = srcData[0].s0; + TYPE dstC1 = srcData[0].s1; +#if DATA_CHAN > 2 + TYPE dstC2 = srcData[0].s2; +#endif +#if DATA_CHAN > 3 + TYPE dstC3 = srcData[0].s3; +#endif +# define VEC_TO_ARRAY(v, a) TYPE a[1] = {v}; +#elif VEC_SIZE == 2 + DST_VEC_TYPE dstC0 = (DST_VEC_TYPE)(srcData[0].s0, srcData[1].s0); + DST_VEC_TYPE dstC1 = (DST_VEC_TYPE)(srcData[0].s1, srcData[1].s1); +#if DATA_CHAN > 2 + DST_VEC_TYPE dstC2 = (DST_VEC_TYPE)(srcData[0].s2, srcData[1].s2); +#endif +#if DATA_CHAN > 3 + DST_VEC_TYPE dstC3 = (DST_VEC_TYPE)(srcData[0].s3, srcData[1].s3); +#endif +# define VEC_TO_ARRAY(v, a) TYPE a[2] = {v.s0, v.s1}; +#elif VEC_SIZE == 4 + DST_VEC_TYPE dstC0 = (DST_VEC_TYPE)(srcData[0].s0, srcData[1].s0, srcData[2].s0, srcData[3].s0); + DST_VEC_TYPE dstC1 = (DST_VEC_TYPE)(srcData[0].s1, srcData[1].s1, srcData[2].s1, srcData[3].s1); +#if DATA_CHAN > 2 + DST_VEC_TYPE dstC2 = (DST_VEC_TYPE)(srcData[0].s2, srcData[1].s2, srcData[2].s2, srcData[3].s2); +#endif +#if DATA_CHAN > 3 + DST_VEC_TYPE dstC3 = (DST_VEC_TYPE)(srcData[0].s3, 
srcData[1].s3, srcData[2].s3, srcData[3].s3); +#endif +# define VEC_TO_ARRAY(v, a) TYPE a[4] = {v.s0, v.s1, v.s2, v.s3}; +#endif - double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x]; +#ifndef BYPASS_VSTORE +#define BYPASS_VSTORE false +#endif - ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; - ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y; +#define WRITE_VEC_DST(dst, vecValue) \ +{ \ + int dst ## xOffsetLimitBytes = dst ## Offset.x + size.x * sizeof(TYPE); \ + int dst ## xOffsetBytes = dst ## Offset.x + x * sizeof(TYPE); \ + int dst ## yOffsetBytes = (dst ## Offset.y + y) * dst ## StepBytes; \ + if (!BYPASS_VSTORE && dst ## xOffsetBytes + (int)sizeof(DST_VEC_TYPE) <= dst ## xOffsetLimitBytes) \ + { \ + VSTORE_ ## dst(((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes), vecValue); \ + } \ + else \ + { \ + VEC_TO_ARRAY(vecValue, vecValue##Array); \ + for (int i = 0; i < VEC_SIZE; i++, dst ## xOffsetBytes += sizeof(TYPE)) \ + { \ + if (dst ## xOffsetBytes + (int)sizeof(TYPE) <= dst ## xOffsetLimitBytes) \ + *(__global TYPE*)((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes) = vecValue##Array[i]; \ + else \ + break; \ + } \ + } \ +} + + WRITE_VEC_DST(dst0, dstC0); + WRITE_VEC_DST(dst1, dstC1); +#if DATA_CHAN > 2 + WRITE_VEC_DST(dst2, dstC2); +#endif +#if DATA_CHAN > 3 + WRITE_VEC_DST(dst3, dstC3); +#endif } } -#endif diff --git a/modules/ocl/src/opencl/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl index 773aee618f..207bf0047f 100644 --- a/modules/ocl/src/opencl/stereobm.cl +++ b/modules/ocl/src/opencl/stereobm.cl @@ -56,7 +56,7 @@ #define radius 64 #endif -unsigned int CalcSSD(__local unsigned int *col_ssd) +static unsigned int CalcSSD(__local unsigned int *col_ssd) { unsigned int cache = col_ssd[0]; @@ -67,7 +67,7 @@ unsigned int CalcSSD(__local unsigned int *col_ssd) return cache; } -uint2 MinSSD(__local unsigned int *col_ssd) +static uint2 
MinSSD(__local unsigned int *col_ssd) { unsigned int ssd[N_DISPARITIES]; const int win_size = (radius << 1); @@ -95,7 +95,7 @@ uint2 MinSSD(__local unsigned int *col_ssd) return (uint2)(mssd, bestIdx); } -void StepDown(int idx1, int idx2, __global unsigned char* imageL, +static void StepDown(int idx1, int idx2, __global unsigned char* imageL, __global unsigned char* imageR, int d, __local unsigned int *col_ssd) { uint8 imgR1 = convert_uint8(vload8(0, imageR + (idx1 - d - 7))); @@ -114,7 +114,7 @@ void StepDown(int idx1, int idx2, __global unsigned char* imageL, col_ssd[7 * (BLOCK_W + win_size)] += res.s0; } -void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL, +static void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL, __global unsigned char* imageR, int d, __local unsigned int *col_ssd) { @@ -153,7 +153,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius; -#define Y (get_group_id(1) * ROWSperTHREAD + radius) +#define Y (int)(get_group_id(1) * ROWSperTHREAD + radius) __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; __global unsigned char* disparImage = disp + X + Y * disp_step; @@ -241,7 +241,7 @@ __kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned /////////////////////////////////// Textureness filtering //////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// -float sobel(__global unsigned char *input, int x, int y, int rows, int cols) +static float sobel(__global unsigned char *input, int x, int y, int rows, int cols) { float conv = 0; int y1 = y==0? 
0 : y-1; @@ -256,7 +256,7 @@ float sobel(__global unsigned char *input, int x, int y, int rows, int cols) return fabs(conv); } -float CalcSums(__local float *cols, __local float *cols_cache, int winsz) +static float CalcSums(__local float *cols, __local float *cols_cache, int winsz) { unsigned int cache = cols[0]; diff --git a/modules/ocl/src/opencl/stereobp.cl b/modules/ocl/src/opencl/stereobp.cl index 24bf55cb21..ec02f827a9 100644 --- a/modules/ocl/src/opencl/stereobp.cl +++ b/modules/ocl/src/opencl/stereobp.cl @@ -26,7 +26,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -65,7 +65,7 @@ /////////////////////////////////////////////////////////////// /////////////////common/////////////////////////////////////// ///////////////////////////////////////////////////////////// -T saturate_cast(float v){ +inline T saturate_cast(float v){ #ifdef T_SHORT return convert_short_sat_rte(v); #else @@ -73,7 +73,7 @@ T saturate_cast(float v){ #endif } -T4 saturate_cast4(float4 v){ +inline T4 saturate_cast4(float4 v){ #ifdef T_SHORT return convert_short4_sat_rte(v); #else @@ -99,7 +99,7 @@ inline float pix_diff_1(const uchar4 l, __global const uchar *rs) return abs((int)(l.x) - *rs); } -float pix_diff_4(const uchar4 l, __global const uchar *rs) +static float pix_diff_4(const uchar4 l, __global const uchar *rs) { uchar4 r; r = *((__global uchar4 *)rs); @@ -235,7 +235,7 @@ __kernel void level_up_message(__global T *src, int src_rows, int src_step, /////////////////////////////////////////////////////////////// //////////////////// calc all iterations ///////////////////// 
/////////////////////////////////////////////////////////////// -void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_, +static void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_, const __global T *dt, int u_step, int msg_disp_step, int data_disp_step, float4 cmax_disc_term, float4 cdisc_single_jump) diff --git a/modules/ocl/src/opencl/stereocsbp.cl b/modules/ocl/src/opencl/stereocsbp.cl index 50aabaca68..13a201cc1c 100644 --- a/modules/ocl/src/opencl/stereocsbp.cl +++ b/modules/ocl/src/opencl/stereocsbp.cl @@ -248,7 +248,7 @@ __kernel void get_first_k_initial_local_1(__global float *data_cost_selected_, _ /////////////////////////////////////////////////////////////// /////////////////////// init data cost //////////////////////// /////////////////////////////////////////////////////////////// -float compute_3(__global uchar* left, __global uchar* right, +inline float compute_3(__global uchar* left, __global uchar* right, float cdata_weight, float cmax_data_term) { float tb = 0.114f * abs((int)left[0] - right[0]); @@ -257,17 +257,21 @@ float compute_3(__global uchar* left, __global uchar* right, return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term); } -float compute_1(__global uchar* left, __global uchar* right, +inline float compute_1(__global uchar* left, __global uchar* right, float cdata_weight, float cmax_data_term) { return fmin(cdata_weight * abs((int)*left - (int)*right), cdata_weight * cmax_data_term); } -short round_short(float v){ + +inline short round_short(float v) +{ return convert_short_sat_rte(v); } + /////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////init_data_cost/////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////// + __kernel void init_data_cost_0(__global short *ctemp, __global uchar *cleft, __global 
uchar *cright, int h, int w, int level, int channels, int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1, @@ -993,7 +997,8 @@ __kernel void compute_data_cost_reduce_1(__global const float *selected_disp_pyr /////////////////////////////////////////////////////////////// //////////////////////// init message ///////////////////////// /////////////////////////////////////////////////////////////// -void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new, + +static void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new, __global short *r_new, __global const short *u_cur, __global const short *d_cur, __global const short *l_cur, __global const short *r_cur, __global short *data_cost_selected, __global short *disparity_selected_new, @@ -1027,7 +1032,8 @@ void get_first_k_element_increase_0(__global short* u_new, __global short *d_new data_cost_new[id * cdisp_step1] = SHRT_MAX; } } -void get_first_k_element_increase_1(__global float *u_new, __global float *d_new, __global float *l_new, + +static void get_first_k_element_increase_1(__global float *u_new, __global float *d_new, __global float *l_new, __global float *r_new, __global const float *u_cur, __global const float *d_cur, __global const float *l_cur, __global const float *r_cur, __global float *data_cost_selected, __global float *disparity_selected_new, @@ -1190,7 +1196,8 @@ __kernel void init_message_1(__global float *u_new_, __global float *d_new_, __g /////////////////////////////////////////////////////////////// //////////////////// calc all iterations ///////////////////// /////////////////////////////////////////////////////////////// -void message_per_pixel_0(__global const short *data, __global short *msg_dst, __global const short *msg1, + +static void message_per_pixel_0(__global const short *data, __global short *msg_dst, __global const short *msg1, __global const short *msg2, __global 
const short *msg3, __global const short *dst_disp, __global const short *src_disp, int nr_plane, __global short *temp, @@ -1226,7 +1233,8 @@ void message_per_pixel_0(__global const short *data, __global short *msg_dst, __ for(int d = 0; d < nr_plane; d++) msg_dst[d * cdisp_step1] = convert_short_sat_rte(temp[d * cdisp_step1] - sum); } -void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1, + +static void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1, __global const float *msg2, __global const float *msg3, __global const float *dst_disp, __global const float *src_disp, int nr_plane, __global float *temp, @@ -1262,6 +1270,7 @@ void message_per_pixel_1(__global const float *data, __global float *msg_dst, __ for(int d = 0; d < nr_plane; d++) msg_dst[d * cdisp_step1] = temp[d * cdisp_step1] - sum; } + __kernel void compute_message_0(__global short *u_, __global short *d_, __global short *l_, __global short *r_, __global const short *data_cost_selected, __global const short *selected_disp_pyr_cur, __global short *ctemp, int h, int w, int nr_plane, int i, @@ -1293,6 +1302,7 @@ __kernel void compute_message_0(__global short *u_, __global short *d_, __global cmax_disc_term, cdisp_step1, cdisc_single_jump); } } + __kernel void compute_message_1(__global float *u_, __global float *d_, __global float *l_, __global float *r_, __global const float *data_cost_selected, __global const float *selected_disp_pyr_cur, __global float *ctemp, int h, int w, int nr_plane, int i, @@ -1327,6 +1337,7 @@ __kernel void compute_message_1(__global float *u_, __global float *d_, __global /////////////////////////////////////////////////////////////// /////////////////////////// output //////////////////////////// /////////////////////////////////////////////////////////////// + __kernel void compute_disp_0(__global const short *u_, __global const short *d_, __global const short *l_, __global 
const short *r_, __global const short * data_cost_selected, __global const short *disp_selected_pyr, @@ -1364,6 +1375,7 @@ __kernel void compute_disp_0(__global const short *u_, __global const short *d_, disp[res_step * y + x] = best; } } + __kernel void compute_disp_1(__global const float *u_, __global const float *d_, __global const float *l_, __global const float *r_, __global const float *data_cost_selected, __global const float *disp_selected_pyr, diff --git a/modules/ocl/src/safe_call.hpp b/modules/ocl/src/safe_call.hpp index 6bc73efc14..bd409c8d2c 100644 --- a/modules/ocl/src/safe_call.hpp +++ b/modules/ocl/src/safe_call.hpp @@ -25,7 +25,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. 
@@ -65,7 +65,7 @@ namespace cv static inline void ___openCLSafeCall(int err, const char *file, const int line, const char *func = "") { - if( CL_SUCCESS != err) + if (CL_SUCCESS != err) cv::error(Error::OpenCLApiCallError, getOpenCLErrorString(err), func, file, line); } } diff --git a/modules/ocl/src/split_merge.cpp b/modules/ocl/src/split_merge.cpp index 990c91c4f3..073a7a73b5 100644 --- a/modules/ocl/src/split_merge.cpp +++ b/modules/ocl/src/split_merge.cpp @@ -148,90 +148,128 @@ namespace cv mat_dst.create(size, CV_MAKETYPE(depth, total_channels)); merge_vector_run(mat_src, n, mat_dst); } - static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst) + static void split_vector_run(const oclMat &src, oclMat *dst) { - if(!mat_src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_src.type() == CV_64F) + if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F) { CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double"); return; } - Context *clCxt = mat_src.clCxt; - int channels = mat_src.oclchannels(); - int depth = mat_src.depth(); + Context *clCtx = src.clCxt; + int channels = src.channels(); + int depth = src.depth(); + depth = (depth == CV_8S) ? CV_8U : depth; + depth = (depth == CV_16S) ? CV_16U : depth; String kernelName = "split_vector"; - int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0}, - {4, 4, 2, 2, 1, 1, 1}, - {4, 4, 2, 2 , 1, 1, 1}, - {4, 4, 2, 2, 1, 1, 1} - }; - - size_t vector_length = vector_lengths[channels - 1][mat_dst[0].depth()]; - - int max_offset_cols = 0; - for(int i = 0; i < channels; i++) - { - int offset_cols = (mat_dst[i].offset / mat_dst[i].elemSize()) & (vector_length - 1); - if(max_offset_cols < offset_cols) - max_offset_cols = offset_cols; - } - - int cols = vector_length == 1 ? 
divUp(mat_src.cols, vector_length) - : divUp(mat_src.cols + max_offset_cols, vector_length); - - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, mat_src.rows, 1 }; + size_t VEC_SIZE = 4; - int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize(); std::vector > args; - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src.data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.offset)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].offset)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].offset)); - if(channels >= 3) + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step)); + int srcOffsetXBytes = src.offset % src.step; + int srcOffsetY = src.offset / src.step; + cl_int2 srcOffset = {{srcOffsetXBytes, srcOffsetY}}; + args.push_back( std::make_pair( sizeof(cl_int2), (void *)&srcOffset)); + + bool dst0Aligned = false, dst1Aligned = false, dst2Aligned = false, dst3Aligned = false; + int alignSize = dst[0].elemSize1() * VEC_SIZE; + int alignMask = alignSize - 1; + + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[0].data)); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[0].step)); + int dst0OffsetXBytes = dst[0].offset % dst[0].step; + int dst0OffsetY = dst[0].offset / dst[0].step; + cl_int2 dst0Offset = {{dst0OffsetXBytes, dst0OffsetY}}; + args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst0Offset)); + if ((dst0OffsetXBytes & alignMask) == 0) + dst0Aligned = true; + + 
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[1].data)); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[1].step)); + int dst1OffsetXBytes = dst[1].offset % dst[1].step; + int dst1OffsetY = dst[1].offset / dst[1].step; + cl_int2 dst1Offset = {{dst1OffsetXBytes, dst1OffsetY}}; + args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst1Offset)); + if ((dst1OffsetXBytes & alignMask) == 0) + dst1Aligned = true; + + // DON'T MOVE VARIABLES INTO 'IF' BODY + int dst2OffsetXBytes, dst2OffsetY; + cl_int2 dst2Offset; + int dst3OffsetXBytes, dst3OffsetY; + cl_int2 dst3Offset; + if (channels >= 3) { - - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].offset)); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[2].data)); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[2].step)); + dst2OffsetXBytes = dst[2].offset % dst[2].step; + dst2OffsetY = dst[2].offset / dst[2].step; + dst2Offset.s[0] = dst2OffsetXBytes; dst2Offset.s[1] = dst2OffsetY; + args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst2Offset)); + if ((dst2OffsetXBytes & alignMask) == 0) + dst2Aligned = true; } - if(channels >= 4) + + if (channels >= 4) { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].offset)); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[3].data)); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[3].step)); + dst3OffsetXBytes = dst[3].offset % dst[3].step; + dst3OffsetY = dst[3].offset / dst[3].step; + dst3Offset.s[0] = dst3OffsetXBytes; dst3Offset.s[1] = dst3OffsetY; + args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst3Offset)); + if 
((dst3OffsetXBytes & alignMask) == 0) + dst3Aligned = true; } - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.rows)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1)); - - openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth); + cl_int2 size = {{ src.cols, src.rows }}; + args.push_back( std::make_pair( sizeof(cl_int2), (void *)&size)); + + String build_options = + cv::format("-D VEC_SIZE=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d", + (int)VEC_SIZE, depth, channels); + + if (dst0Aligned) + build_options = build_options + " -D DST0_ALIGNED"; + if (dst1Aligned) + build_options = build_options + " -D DST1_ALIGNED"; + if (dst2Aligned) + build_options = build_options + " -D DST2_ALIGNED"; + if (dst3Aligned) + build_options = build_options + " -D DST3_ALIGNED"; + + const DeviceInfo& devInfo = clCtx->getDeviceInfo(); + + // TODO Workaround for issues. Need to investigate a problem. 
+ if (channels == 2 + && devInfo.deviceType == CVCL_DEVICE_TYPE_CPU + && devInfo.platform->platformVendor.find("Intel") != std::string::npos + && (devInfo.deviceVersion.find("Build 56860") != std::string::npos + || devInfo.deviceVersion.find("Build 76921") != std::string::npos)) + build_options = build_options + " -D BYPASS_VSTORE=true"; + + size_t globalThreads[3] = { divUp(src.cols, VEC_SIZE), src.rows, 1 }; + openCLExecuteKernel(clCtx, &split_mat, kernelName, globalThreads, NULL, args, -1, -1, build_options.c_str()); } static void split(const oclMat &mat_src, oclMat *mat_dst) { CV_Assert(mat_dst); int depth = mat_src.depth(); - int num_channels = mat_src.oclchannels(); + int num_channels = mat_src.channels(); Size size = mat_src.size(); - if(num_channels == 1) + if (num_channels == 1) { mat_src.copyTo(mat_dst[0]); return; } - int i; - for(i = 0; i < num_channels; i++) + for (int i = 0; i < mat_src.oclchannels(); i++) mat_dst[i].create(size, CV_MAKETYPE(depth, 1)); split_vector_run(mat_src, mat_dst); @@ -255,7 +293,7 @@ void cv::ocl::split(const oclMat &src, oclMat *dst) } void cv::ocl::split(const oclMat &src, std::vector &dst) { - dst.resize(src.oclchannels()); + dst.resize(src.oclchannels()); // TODO Why oclchannels? 
if(src.oclchannels() > 0) split_merge::split(src, &dst[0]); } diff --git a/modules/ocl/src/tvl1flow.cpp b/modules/ocl/src/tvl1flow.cpp index 604124fc22..6e75ee238b 100644 --- a/modules/ocl/src/tvl1flow.cpp +++ b/modules/ocl/src/tvl1flow.cpp @@ -121,10 +121,8 @@ void cv::ocl::OpticalFlowDual_TVL1_OCL::operator()(const oclMat& I0, const oclMa ocl::pyrDown(u1s[s - 1], u1s[s]); ocl::pyrDown(u2s[s - 1], u2s[s]); - //ocl::multiply(u1s[s], Scalar::all(0.5), u1s[s]); - multiply(0.5, u1s[s], u1s[s]); - //ocl::multiply(u2s[s], Scalar::all(0.5), u2s[s]); - multiply(0.5, u1s[s], u2s[s]); + ocl::multiply(0.5, u1s[s], u1s[s]); + ocl::multiply(0.5, u2s[s], u2s[s]); } } diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp index 1d1b0f1ab9..11b945c5b2 100644 --- a/modules/ocl/test/test_arithm.cpp +++ b/modules/ocl/test/test_arithm.cpp @@ -126,8 +126,12 @@ PARAM_TEST_CASE(Lut, MatDepth, MatDepth, bool, bool) void Near(double threshold = 0.) { - EXPECT_MAT_NEAR(dst, Mat(gdst_whole), threshold); - EXPECT_MAT_NEAR(dst_roi, Mat(gdst_roi), threshold); + Mat whole, roi; + gdst_whole.download(whole); + gdst_roi.download(roi); + + EXPECT_MAT_NEAR(dst, whole, threshold); + EXPECT_MAT_NEAR(dst_roi, roi, threshold); } }; @@ -222,14 +226,22 @@ PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool) void Near(double threshold = 0.) { - EXPECT_MAT_NEAR(dst1, Mat(gdst1_whole), threshold); - EXPECT_MAT_NEAR(dst1_roi, Mat(gdst1_roi), threshold); + Mat whole, roi; + gdst1_whole.download(whole); + gdst1_roi.download(roi); + + EXPECT_MAT_NEAR(dst1, whole, threshold); + EXPECT_MAT_NEAR(dst1_roi, roi, threshold); } void Near1(double threshold = 0.) 
{ - EXPECT_MAT_NEAR(dst2, Mat(gdst2_whole), threshold); - EXPECT_MAT_NEAR(dst2_roi, Mat(gdst2_roi), threshold); + Mat whole, roi; + gdst2_whole.download(whole); + gdst2_roi.download(roi); + + EXPECT_MAT_NEAR(dst2, whole, threshold); + EXPECT_MAT_NEAR(dst2_roi, roi, threshold); } }; @@ -724,6 +736,15 @@ OCL_TEST_P(MinMax, MAT) OCL_TEST_P(MinMax, MASK) { + enum { MAX_IDX = 0, MIN_IDX }; + static const double minMaxGolds[2][7] = + { + { std::numeric_limits::min(), std::numeric_limits::min(), std::numeric_limits::min(), + std::numeric_limits::min(), std::numeric_limits::min(), -std::numeric_limits::max(), -std::numeric_limits::max() }, + { std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max(), + std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max() }, + }; + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); @@ -750,8 +771,16 @@ OCL_TEST_P(MinMax, MASK) double minVal_, maxVal_; cv::ocl::minMax(gsrc1_roi, &minVal_, &maxVal_, gmask_roi); - EXPECT_DOUBLE_EQ(minVal, minVal_); - EXPECT_DOUBLE_EQ(maxVal, maxVal_); + if (cv::countNonZero(mask_roi) == 0) + { + EXPECT_DOUBLE_EQ(minMaxGolds[MIN_IDX][depth], minVal_); + EXPECT_DOUBLE_EQ(minMaxGolds[MAX_IDX][depth], maxVal_); + } + else + { + EXPECT_DOUBLE_EQ(minVal, minVal_); + EXPECT_DOUBLE_EQ(maxVal, maxVal_); + } } } diff --git a/modules/ocl/test/test_blend.cpp b/modules/ocl/test/test_blend.cpp index 63693749db..1576891a48 100644 --- a/modules/ocl/test/test_blend.cpp +++ b/modules/ocl/test/test_blend.cpp @@ -47,73 +47,130 @@ using namespace cv; using namespace cv::ocl; -using namespace cvtest; using namespace testing; using namespace std; -#ifdef HAVE_OPENCL + template -void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold) +static void blendLinearGold(const Mat &img1, const Mat &img2, + const Mat &weights1, const Mat &weights2, + Mat &result_gold) { + 
CV_Assert(img1.size() == img2.size() && img1.type() == img2.type()); + CV_Assert(weights1.size() == weights2.size() && weights1.size() == img1.size() && + weights1.type() == CV_32FC1 && weights2.type() == CV_32FC1); + result_gold.create(img1.size(), img1.type()); int cn = img1.channels(); + int step1 = img1.cols * img1.channels(); for (int y = 0; y < img1.rows; ++y) { - const float *weights1_row = weights1.ptr(y); - const float *weights2_row = weights2.ptr(y); - const T *img1_row = img1.ptr(y); - const T *img2_row = img2.ptr(y); - T *result_gold_row = result_gold.ptr(y); + const float * const weights1_row = weights1.ptr(y); + const float * const weights2_row = weights2.ptr(y); + const T * const img1_row = img1.ptr(y); + const T * const img2_row = img2.ptr(y); + T * const result_gold_row = result_gold.ptr(y); - for (int x = 0; x < img1.cols * cn; ++x) + for (int x = 0; x < step1; ++x) { - float w1 = weights1_row[x / cn]; - float w2 = weights2_row[x / cn]; - result_gold_row[x] = static_cast((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f)); + int x1 = x / cn; + float w1 = weights1_row[x1], w2 = weights2_row[x1]; + result_gold_row[x] = saturate_cast(((float)img1_row[x] * w1 + + (float)img2_row[x] * w2) / (w1 + w2 + 1e-5f)); } } } -PARAM_TEST_CASE(Blend, cv::Size, MatType/*, UseRoi*/) +PARAM_TEST_CASE(Blend, MatDepth, int, bool) { - cv::Size size; - int type; + int depth, channels; bool useRoi; + Mat src1, src2, weights1, weights2, dst; + Mat src1_roi, src2_roi, weights1_roi, weights2_roi, dst_roi; + oclMat gsrc1, gsrc2, gweights1, gweights2, gdst, gst; + oclMat gsrc1_roi, gsrc2_roi, gweights1_roi, gweights2_roi, gdst_roi; + virtual void SetUp() { - size = GET_PARAM(0); - type = GET_PARAM(1); + depth = GET_PARAM(0); + channels = GET_PARAM(1); + useRoi = GET_PARAM(2); + } + + void random_roi() + { + const int type = CV_MAKE_TYPE(depth, channels); + + const double upValue = 256; + const double sumMinValue = 0.01; // we don't want to divide by "zero" + + Size 
roiSize = randomSize(1, 20); + Border src1Border = randomBorder(0, useRoi ? MAX_VALUE : 0); + randomSubMat(src1, src1_roi, roiSize, src1Border, type, -upValue, upValue); + + Border src2Border = randomBorder(0, useRoi ? MAX_VALUE : 0); + randomSubMat(src2, src2_roi, roiSize, src2Border, type, -upValue, upValue); + + Border weights1Border = randomBorder(0, useRoi ? MAX_VALUE : 0); + randomSubMat(weights1, weights1_roi, roiSize, weights1Border, CV_32FC1, -upValue, upValue); + + Border weights2Border = randomBorder(0, useRoi ? MAX_VALUE : 0); + randomSubMat(weights2, weights2_roi, roiSize, weights2Border, CV_32FC1, sumMinValue, upValue); // fill it as a (w1 + w12) + + weights2_roi = weights2_roi - weights1_roi; + // check that weights2_roi is still a part of weights2 (not a new matrix) + CV_Assert(checkNorm(weights2_roi, + weights2(Rect(weights2Border.lef, weights2Border.top, roiSize.width, roiSize.height))) < 1e-6); + + Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); + randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16); + + generateOclMat(gsrc1, gsrc1_roi, src1, roiSize, src1Border); + generateOclMat(gsrc2, gsrc2_roi, src2, roiSize, src2Border); + generateOclMat(gweights1, gweights1_roi, weights1, roiSize, weights1Border); + generateOclMat(gweights2, gweights2_roi, weights2, roiSize, weights2Border); + generateOclMat(gdst, gdst_roi, dst, roiSize, dstBorder); + } + + void Near(double eps = 0.0) + { + Mat whole, roi; + gdst.download(whole); + gdst_roi.download(roi); + + EXPECT_MAT_NEAR(dst, whole, eps); + EXPECT_MAT_NEAR(dst_roi, roi, eps); } }; +typedef void (*blendLinearFunc)(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold); + OCL_TEST_P(Blend, Accuracy) { - int depth = CV_MAT_DEPTH(type); - - cv::Mat img1 = randomMat(size, type, 0.0, depth == CV_8U ? 255.0 : 1.0); - cv::Mat img2 = randomMat(size, type, 0.0, depth == CV_8U ? 
255.0 : 1.0); - cv::Mat weights1 = randomMat(size, CV_32F, 0, 1); - cv::Mat weights2 = randomMat(size, CV_32F, 0, 1); - - cv::ocl::oclMat gimg1(img1), gimg2(img2), gweights1(weights1), gweights2(weights2); - cv::ocl::oclMat dst; - - cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, dst); - cv::Mat result; - cv::Mat result_gold; - dst.download(result); - if (depth == CV_8U) - blendLinearGold(img1, img2, weights1, weights2, result_gold); - else - blendLinearGold(img1, img2, weights1, weights2, result_gold); - - EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1.f : 1e-5f); + for (int i = 0; i < LOOP_TIMES; ++i) + { + random_roi(); + + cv::ocl::blendLinear(gsrc1_roi, gsrc2_roi, gweights1_roi, gweights2_roi, gdst_roi); + + static blendLinearFunc funcs[] = { + blendLinearGold, + blendLinearGold, + blendLinearGold, + blendLinearGold, + blendLinearGold, + blendLinearGold, + }; + + blendLinearFunc func = funcs[depth]; + func(src1_roi, src2_roi, weights1_roi, weights2_roi, dst_roi); + + Near(depth <= CV_32S ? 
1.0 : 0.2); + } } -INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Blend, Combine( - DIFFERENT_SIZES, - testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)) - )); -#endif +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Blend, + Combine(testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F), + testing::Range(1, 5), Bool())); diff --git a/modules/ocl/test/test_filters.cpp b/modules/ocl/test/test_filters.cpp index 3caea04fb0..d2edf6d219 100644 --- a/modules/ocl/test/test_filters.cpp +++ b/modules/ocl/test/test_filters.cpp @@ -59,10 +59,15 @@ using namespace cv; PARAM_TEST_CASE(FilterTestBase, MatType, int, // kernel size Size, // dx, dy - int, // border type, or iteration + int, // border type + double, // optional parameter bool) // roi or not { + bool isFP; + int type, borderType, ksize; + Size size; + double param; bool useRoi; Mat src, dst_whole, src_roi, dst_roi; @@ -72,31 +77,53 @@ PARAM_TEST_CASE(FilterTestBase, MatType, { type = GET_PARAM(0); ksize = GET_PARAM(1); + size = GET_PARAM(2); borderType = GET_PARAM(3); - useRoi = GET_PARAM(4); + param = GET_PARAM(4); + useRoi = GET_PARAM(5); + + isFP = (CV_MAT_DEPTH(type) == CV_32F || CV_MAT_DEPTH(type) == CV_64F); } - void random_roi() + void random_roi(int minSize = 1) { - Size roiSize = randomSize(1, MAX_VALUE); + if (minSize == 0) + minSize = ksize; + Size roiSize = randomSize(minSize, MAX_VALUE); Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); - randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256); + randomSubMat(src, src_roi, roiSize, srcBorder, type, isFP ? 0 : 5, isFP ? 1 : 256); Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); - randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, type, 5, 16); + randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, type, isFP ? 0.20 : 60, isFP ? 
0.25 : 70); generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder); generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder); } - void Near(double threshold = 0.0) + void Near() + { + if (isFP) + Near(1e-6, true); + else + Near(1, false); + } + + void Near(double threshold, bool relative) { Mat roi, whole; gdst_whole.download(whole); gdst_roi.download(roi); - EXPECT_MAT_NEAR(dst_whole, whole, threshold); - EXPECT_MAT_NEAR(dst_roi, roi, threshold); + if (relative) + { + EXPECT_MAT_NEAR_RELATIVE(dst_whole, whole, threshold); + EXPECT_MAT_NEAR_RELATIVE(dst_roi, roi, threshold); + } + else + { + EXPECT_MAT_NEAR(dst_whole, whole, threshold); + EXPECT_MAT_NEAR(dst_roi, roi, threshold); + } } }; @@ -111,12 +138,12 @@ OCL_TEST_P(Blur, Mat) for (int j = 0; j < LOOP_TIMES; j++) { - random_roi(); + random_roi(0); // TODO NOTE: min value for size is kernel size (temporary bypass border issues in CPU implementation) blur(src_roi, dst_roi, kernelSize, Point(-1, -1), borderType); ocl::blur(gsrc_roi, gdst_roi, kernelSize, Point(-1, -1), borderType); // TODO anchor - Near(1.0); + Near(); } } @@ -127,64 +154,51 @@ typedef FilterTestBase LaplacianTest; OCL_TEST_P(LaplacianTest, Accuracy) { + double scale = param; + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - // border type is used as a scale factor for the Laplacian kernel - double scale = static_cast(borderType); - - Laplacian(src_roi, dst_roi, -1, ksize, scale); - ocl::Laplacian(gsrc_roi, gdst_roi, -1, ksize, scale); + Laplacian(src_roi, dst_roi, -1, ksize, scale, 0, borderType); + ocl::Laplacian(gsrc_roi, gdst_roi, -1, ksize, scale, 0, borderType); - Near(1e-5); + Near(); } } ///////////////////////////////////////////////////////////////////////////////////////////////// // erode & dilate -struct ErodeDilate : - public FilterTestBase -{ - int iterations; - - virtual void SetUp() - { - type = GET_PARAM(0); - ksize = GET_PARAM(1); - iterations = GET_PARAM(3); - useRoi = GET_PARAM(4); - } -}; - -typedef 
ErodeDilate Erode; +typedef FilterTestBase Erode; OCL_TEST_P(Erode, Mat) { // erode or dilate kernel Size kernelSize(ksize, ksize); Mat kernel; + int iterations = (int)param; for (int j = 0; j < LOOP_TIMES; j++) { - kernel = randomMat(kernelSize, CV_8UC1, 0, 3); - random_roi(); - cv::erode(src_roi, dst_roi, kernel, Point(-1, -1), iterations); - ocl::erode(gsrc_roi, gdst_roi, kernel, Point(-1, -1), iterations); // TODO iterations, borderType + kernel = randomMat(kernelSize, CV_8UC1, 0, 3); - Near(1e-5); + cv::erode(src_roi, dst_roi, kernel, Point(-1, -1), iterations);//, borderType); + ocl::erode(gsrc_roi, gdst_roi, kernel, Point(-1, -1), iterations);//, borderType); + + Near(); } } -typedef ErodeDilate Dilate; +typedef FilterTestBase Dilate; OCL_TEST_P(Dilate, Mat) { // erode or dilate kernel Mat kernel; + int iterations = (int)param; for (int j = 0; j < LOOP_TIMES; j++) { @@ -195,79 +209,56 @@ OCL_TEST_P(Dilate, Mat) cv::dilate(src_roi, dst_roi, kernel, Point(-1, -1), iterations); ocl::dilate(gsrc_roi, gdst_roi, kernel, Point(-1, -1), iterations); // TODO iterations, borderType - Near(1e-5); + Near(); } } ///////////////////////////////////////////////////////////////////////////////////////////////// // Sobel -struct SobelTest : - public FilterTestBase -{ - int dx, dy; - - virtual void SetUp() - { - type = GET_PARAM(0); - ksize = GET_PARAM(1); - borderType = GET_PARAM(3); - useRoi = GET_PARAM(4); - - Size d = GET_PARAM(2); - dx = d.width, dy = d.height; - } -}; +typedef FilterTestBase SobelTest; OCL_TEST_P(SobelTest, Mat) { + int dx = size.width, dy = size.height; + double scale = param; + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - Sobel(src_roi, dst_roi, -1, dx, dy, ksize, /* scale */ 0.00001, /* delta */0, borderType); - ocl::Sobel(gsrc_roi, gdst_roi, -1, dx, dy, ksize, /* scale */ 0.00001, /* delta */ 0, borderType); + Sobel(src_roi, dst_roi, -1, dx, dy, ksize, scale, /* delta */0, borderType); + ocl::Sobel(gsrc_roi, gdst_roi, -1, dx, dy, ksize, 
scale, /* delta */0, borderType); - Near(1); + Near(); } } ///////////////////////////////////////////////////////////////////////////////////////////////// // Scharr -typedef SobelTest ScharrTest; +typedef FilterTestBase ScharrTest; OCL_TEST_P(ScharrTest, Mat) { + int dx = size.width, dy = size.height; + double scale = param; + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - Scharr(src_roi, dst_roi, -1, dx, dy, /* scale */ 1, /* delta */ 0, borderType); - ocl::Scharr(gsrc_roi, gdst_roi, -1, dx, dy, /* scale */ 1, /* delta */ 0, borderType); + Scharr(src_roi, dst_roi, -1, dx, dy, scale, /* delta */ 0, borderType); + ocl::Scharr(gsrc_roi, gdst_roi, -1, dx, dy, scale, /* delta */ 0, borderType); - Near(1); + Near(); } } ///////////////////////////////////////////////////////////////////////////////////////////////// // GaussianBlur -struct GaussianBlurTest : - public FilterTestBase -{ - double sigma1, sigma2; - - virtual void SetUp() - { - type = GET_PARAM(0); - ksize = GET_PARAM(1); - borderType = GET_PARAM(3); - - sigma1 = rng.uniform(0.1, 1.0); - sigma2 = rng.uniform(0.1, 1.0); - } -}; +typedef FilterTestBase GaussianBlurTest; OCL_TEST_P(GaussianBlurTest, Mat) { @@ -275,10 +266,13 @@ OCL_TEST_P(GaussianBlurTest, Mat) { random_roi(); + double sigma1 = rng.uniform(0.1, 1.0); + double sigma2 = rng.uniform(0.1, 1.0); + GaussianBlur(src_roi, dst_roi, Size(ksize, ksize), sigma1, sigma2, borderType); ocl::GaussianBlur(gsrc_roi, gdst_roi, Size(ksize, ksize), sigma1, sigma2, borderType); - Near(1); + Near(CV_MAT_DEPTH(type) == CV_8U ? 
3 : 1e-6, false); } } @@ -289,19 +283,24 @@ typedef FilterTestBase Filter2D; OCL_TEST_P(Filter2D, Mat) { - const Size kernelSize(ksize, ksize); - Mat kernel; - for (int j = 0; j < LOOP_TIMES; j++) { - kernel = randomMat(kernelSize, CV_32FC1, 0.0, 1.0); - random_roi(); - cv::filter2D(src_roi, dst_roi, -1, kernel, Point(-1, -1), 0.0, borderType); // TODO anchor - ocl::filter2D(gsrc_roi, gdst_roi, -1, kernel, Point(-1, -1), borderType); + Point anchor(-1, -1); + if (size.width >= 0) + anchor.x = size.width % ksize; + if (size.height >= 0) + anchor.y = size.height % ksize; + + const Size kernelSize(ksize, ksize); + Mat kernel = randomMat(kernelSize, CV_32FC1, 0, 1.0); + kernel *= 1.0 / (double)(ksize * ksize); + + cv::filter2D(src_roi, dst_roi, -1, kernel, anchor, 0.0, borderType); + ocl::filter2D(gsrc_roi, gdst_roi, -1, kernel, anchor, 0.0, borderType); - Near(1); + Near(); } } @@ -322,7 +321,7 @@ OCL_TEST_P(Bilateral, Mat) cv::bilateralFilter(src_roi, dst_roi, ksize, sigmacolor, sigmaspace, borderType); ocl::bilateralFilter(gsrc_roi, gdst_roi, ksize, sigmacolor, sigmaspace, borderType); - Near(1); + Near(); } } @@ -342,7 +341,7 @@ OCL_TEST_P(AdaptiveBilateral, Mat) adaptiveBilateralFilter(src_roi, dst_roi, kernelSize, 5, Point(-1, -1), borderType); // TODO anchor ocl::adaptiveBilateralFilter(gsrc_roi, gdst_roi, kernelSize, 5, Point(-1, -1), borderType); - Near(1); + Near(); } } @@ -366,87 +365,108 @@ OCL_TEST_P(MedianFilter, Mat) ////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#define FILTER_BORDER_SET_NO_ISOLATED \ + Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_WRAP, (int)BORDER_REFLECT_101/*, \ + (int)BORDER_CONSTANT|BORDER_ISOLATED, (int)BORDER_REPLICATE|BORDER_ISOLATED, \ + (int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \ + (int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version + +#define 
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED \ + Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, /*(int)BORDER_WRAP,*/ (int)BORDER_REFLECT_101/*, \ + (int)BORDER_CONSTANT|BORDER_ISOLATED, (int)BORDER_REPLICATE|BORDER_ISOLATED, \ + (int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \ + (int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version + +#define FILTER_DATATYPES Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4, \ + CV_32FC1, CV_32FC3, CV_32FC4, \ + CV_64FC1, CV_64FC3, CV_64FC4) + INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), + FILTER_DATATYPES, Values(3, 5, 7), Values(Size(0, 0)), // not used - Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_REFLECT_101), + FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, + Values(0.0), // not used Bool())); INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4), + FILTER_DATATYPES, Values(1, 3), Values(Size(0, 0)), // not used - Values(1, 2), // value is used as scale factor for kernel + FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, + Values(1.0, 0.2, 3.0), // scalar Bool())); INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(3, 5, 7), Values(Size(0, 0)), // not used - testing::Range(1, 2), + Values(0), // not used + Values(1.0, 2.0, 3.0), Bool())); INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(3, 5, 7), Values(Size(0, 0)), // not used - testing::Range(1, 2), + Values(0), // not used + Values(1.0, 2.0, 3.0), Bool())); INSTANTIATE_TEST_CASE_P(Filter, SobelTest, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(3, 5), - Values(Size(1, 0), Size(1, 1), Size(2, 0), Size(2, 1)), - Values((int)BORDER_CONSTANT, (int)BORDER_REFLECT101, - 
(int)BORDER_REPLICATE, (int)BORDER_REFLECT), + Values(Size(1, 0), Size(1, 1), Size(2, 0), Size(2, 1)), // dx, dy + FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, + Values(0.0), // not used Bool())); INSTANTIATE_TEST_CASE_P(Filter, ScharrTest, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4), - Values(0), // not used - Values(Size(0, 1), Size(1, 0)), - Values((int)BORDER_CONSTANT, (int)BORDER_REFLECT101, - (int)BORDER_REPLICATE, (int)BORDER_REFLECT), + Values(1), + Values(Size(0, 1), Size(1, 0)), // dx, dy + FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, + Values(1.0, 0.2), // scalar Bool())); INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5), Values(Size(0, 0)), // not used - Values((int)BORDER_CONSTANT, (int)BORDER_REFLECT101, - (int)BORDER_REPLICATE, (int)BORDER_REFLECT), + FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, + Values(0.0), // not used Bool())); INSTANTIATE_TEST_CASE_P(Filter, Filter2D, testing::Combine( - Values(CV_8UC1, CV_32FC1, CV_32FC4), - Values(3, 15, 25), - Values(Size(0, 0)), // not used - Values((int)BORDER_CONSTANT, (int)BORDER_REFLECT101, - (int)BORDER_REPLICATE, (int)BORDER_REFLECT), + FILTER_DATATYPES, + Values(3, 15), // TODO 25: CPU implementation has some issues + Values(Size(-1, -1), Size(0, 0), Size(2, 1)), // anchor + FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, + Values(0.0), // not used Bool())); INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine( Values(CV_8UC1, CV_8UC3), Values(5, 9), Values(Size(0, 0)), // not used - Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, - (int)BORDER_REFLECT, (int)BORDER_WRAP, (int)BORDER_REFLECT_101), + FILTER_BORDER_SET_NO_ISOLATED, + Values(0.0), // not used Bool())); INSTANTIATE_TEST_CASE_P(Filter, AdaptiveBilateral, Combine( Values(CV_8UC1, CV_8UC3), Values(5, 9), Values(Size(0, 0)), // not used - Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, - (int)BORDER_REFLECT, (int)BORDER_REFLECT_101), + 
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, + Values(0.0), // not used Bool())); INSTANTIATE_TEST_CASE_P(Filter, MedianFilter, Combine( - Values((MatType)CV_8UC1, (MatType)CV_8UC4, (MatType)CV_32FC1, (MatType)CV_32FC4), + Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5), Values(Size(0, 0)), // not used Values(0), // not used + Values(0.0), // not used Bool())); #endif // HAVE_OPENCL diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp index eb983fb17e..7e4b14ecae 100644 --- a/modules/ocl/test/test_imgproc.cpp +++ b/modules/ocl/test/test_imgproc.cpp @@ -80,7 +80,7 @@ PARAM_TEST_CASE(ImgprocTestBase, MatType, useRoi = GET_PARAM(3); } - void random_roi() + virtual void random_roi() { Size roiSize = randomSize(1, MAX_VALUE); Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); @@ -93,14 +93,22 @@ PARAM_TEST_CASE(ImgprocTestBase, MatType, generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder); } - void Near(double threshold = 0.0) + void Near(double threshold = 0.0, bool relative = false) { - Mat whole, roi; + Mat roi, whole; gdst_whole.download(whole); gdst_roi.download(roi); - EXPECT_MAT_NEAR(dst_whole, whole, threshold); - EXPECT_MAT_NEAR(dst_roi, roi, threshold); + if (relative) + { + EXPECT_MAT_NEAR_RELATIVE(dst_whole, whole, threshold); + EXPECT_MAT_NEAR_RELATIVE(dst_roi, roi, threshold); + } + else + { + EXPECT_MAT_NEAR(dst_whole, whole, threshold); + EXPECT_MAT_NEAR(dst_roi, roi, threshold); + } } }; @@ -191,7 +199,31 @@ OCL_TEST_P(EqualizeHist, Mat) ////////////////////////////////cornerMinEigenVal////////////////////////////////////////// -typedef ImgprocTestBase CornerMinEigenVal; +struct CornerTestBase : + public ImgprocTestBase +{ + virtual void random_roi() + { + Mat image = readImageType("gpu/stereobm/aloe-L.png", type); + ASSERT_FALSE(image.empty()); + + Size roiSize = image.size(); + Border srcBorder = randomBorder(0, useRoi ? 
MAX_VALUE : 0); + + Size wholeSize = Size(roiSize.width + srcBorder.lef + srcBorder.rig, roiSize.height + srcBorder.top + srcBorder.bot); + src = randomMat(wholeSize, type, -255, 255, false); + src_roi = src(Rect(srcBorder.lef, srcBorder.top, roiSize.width, roiSize.height)); + image.copyTo(src_roi); + + Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); + randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, CV_32FC1, 5, 16); + + generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder); + generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder); + } +}; + +typedef CornerTestBase CornerMinEigenVal; OCL_TEST_P(CornerMinEigenVal, Mat) { @@ -204,13 +236,13 @@ OCL_TEST_P(CornerMinEigenVal, Mat) cornerMinEigenVal(src_roi, dst_roi, blockSize, apertureSize, borderType); ocl::cornerMinEigenVal(gsrc_roi, gdst_roi, blockSize, apertureSize, borderType); - Near(1.0); + Near(1e-5, true); } } ////////////////////////////////cornerHarris////////////////////////////////////////// -typedef ImgprocTestBase CornerHarris; +typedef CornerTestBase CornerHarris; OCL_TEST_P(CornerHarris, Mat) { @@ -219,12 +251,12 @@ OCL_TEST_P(CornerHarris, Mat) random_roi(); int apertureSize = 3; - double k = 2.0; + double k = randomDouble(0.01, 0.9); cornerHarris(src_roi, dst_roi, blockSize, apertureSize, k, borderType); ocl::cornerHarris(gsrc_roi, gdst_roi, blockSize, apertureSize, k, borderType); - Near(1.0); + Near(1e-5, true); } } @@ -484,25 +516,27 @@ INSTANTIATE_TEST_CASE_P(Imgproc, EqualizeHist, Combine( Bool())); INSTANTIATE_TEST_CASE_P(Imgproc, CornerMinEigenVal, Combine( - Values(CV_8UC1, CV_32FC1), - Values(3), // TODO some fails when blockSize != 3 (for example 5) - Values((int)BORDER_REFLECT, (int)BORDER_CONSTANT, (int)BORDER_REPLICATE), // TODO does not work with (int)BORDER_REFLECT101 + Values((MatType)CV_8UC1, (MatType)CV_32FC1), + Values(3, 5), + Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_REFLECT101), Bool())); 
INSTANTIATE_TEST_CASE_P(Imgproc, CornerHarris, Combine( Values((MatType)CV_8UC1), // TODO does not work properly with CV_32FC1 Values(3, 5), - Values((int)BORDER_REFLECT101, (int)BORDER_REFLECT, (int)BORDER_CONSTANT, (int)BORDER_REPLICATE), + Values( (int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_REFLECT_101), Bool())); INSTANTIATE_TEST_CASE_P(Imgproc, Integral, Combine( - Values((MatType)CV_8UC1), // TODO does work with CV_32F, CV_64F + Values((MatType)CV_8UC1), // TODO does not work with CV_32F, CV_64F Values(0), // not used Values(0), // not used Bool())); INSTANTIATE_TEST_CASE_P(Imgproc, Threshold, Combine( - Values(CV_8UC1, CV_32FC1), + Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4, + CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4, + CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4), Values(0), Values(ThreshOp(THRESH_BINARY), ThreshOp(THRESH_BINARY_INV), ThreshOp(THRESH_TRUNC), diff --git a/modules/ocl/test/test_kmeans.cpp b/modules/ocl/test/test_kmeans.cpp index 94263d8f76..d583cc950f 100644 --- a/modules/ocl/test/test_kmeans.cpp +++ b/modules/ocl/test/test_kmeans.cpp @@ -99,7 +99,6 @@ PARAM_TEST_CASE(Kmeans, int, int, int) } }; OCL_TEST_P(Kmeans, Mat){ - if(flags & KMEANS_USE_INITIAL_LABELS) { // inital a given labels @@ -116,11 +115,9 @@ OCL_TEST_P(Kmeans, Mat){ kmeans(src, K, labels, TermCriteria( TermCriteria::EPS + TermCriteria::MAX_ITER, 100, 0), 1, flags, centers); - ocl::kmeans(d_src, K, d_labels, TermCriteria( TermCriteria::EPS + TermCriteria::MAX_ITER, 100, 0), 1, flags, d_centers); - Mat dd_labels(d_labels); Mat dd_centers(d_centers); if(flags & KMEANS_USE_INITIAL_LABELS) @@ -153,9 +150,97 @@ OCL_TEST_P(Kmeans, Mat){ } } } + INSTANTIATE_TEST_CASE_P(OCL_ML, Kmeans, Combine( Values(3, 5, 8), Values(CV_32FC1, CV_32FC2, CV_32FC4), Values(OCL_KMEANS_USE_INITIAL_LABELS/*, OCL_KMEANS_PP_CENTERS*/))); + +/////////////////////////////// DistanceToCenters ////////////////////////////////////////// + +CV_ENUM(DistType, NORM_L1, NORM_L2SQR); + 
+PARAM_TEST_CASE(distanceToCenters, DistType, bool) +{ + cv::Size size; + int distType; + bool useRoi; + cv::Mat src, centers, src_roi, centers_roi; + cv::ocl::oclMat ocl_src, ocl_centers, ocl_src_roi, ocl_centers_roi; + + virtual void SetUp() + { + distType = GET_PARAM(0); + useRoi = GET_PARAM(1); + } + + void random_roi() + { + Size roiSize_src = randomSize(10,1000); + Size roiSize_centers = randomSize(10, 1000); + roiSize_src.width = roiSize_centers.width; + + Border srcBorder = randomBorder(0, useRoi ? 500 : 0); + randomSubMat(src, src_roi, roiSize_src, srcBorder, CV_32FC1, -SHRT_MAX, SHRT_MAX); + + Border centersBorder = randomBorder(0, useRoi ? 500 : 0); + randomSubMat(centers, centers_roi, roiSize_centers, centersBorder, CV_32FC1, -SHRT_MAX, SHRT_MAX); + + for(int i = 0; i(i, randomInt(0,centers.cols-1)) = (float)randomDouble(SHRT_MAX, INT_MAX); + + generateOclMat(ocl_src, ocl_src_roi, src, roiSize_src, srcBorder); + generateOclMat(ocl_centers, ocl_centers_roi, centers, roiSize_centers, centersBorder); + + } + +}; + +OCL_TEST_P(distanceToCenters, Accuracy) +{ + for(int j = 0; j< LOOP_TIMES; j++) + { + random_roi(); + + cv::ocl::oclMat ocl_dists; + cv::ocl::oclMat ocl_labels; + + cv::ocl::distanceToCenters(ocl_dists,ocl_labels,ocl_src_roi, ocl_centers_roi, distType); + + Mat labels, dists; + ocl_labels.download(labels); + ocl_dists.download(dists); + + ASSERT_EQ(ocl_dists.cols, ocl_labels.rows); + + Mat batch_dists; + + cv::batchDistance(src_roi, centers_roi, batch_dists, CV_32FC1, noArray(), distType); + + std::vector gold_dists_v; + + for(int i = 0; i(src_roi.cols), rows = static_cast(src_roi.rows); + float cols2 = cols / 2.0f, rows2 = rows / 2.0f; + Point2f sp[] = { Point2f(0.0f, 0.0f), Point2f(cols, 0.0f), Point2f(0.0f, rows), Point2f(cols, rows) }; + Point2f dp[] = { Point2f(rng.uniform(0.0f, cols2), rng.uniform(0.0f, rows2)), + Point2f(rng.uniform(cols2, cols), rng.uniform(0.0f, rows2)), + Point2f(rng.uniform(0.0f, cols2), rng.uniform(rows2, rows)), + 
Point2f(rng.uniform(cols2, cols), rng.uniform(rows2, rows)) }; + Mat M = getPerspectiveTransform(sp, dp); + warpPerspective(src_roi, dst_roi, M, dsize, interpolation); ocl::warpPerspective(gsrc_roi, gdst_roi, M, dsize, interpolation); diff --git a/modules/ocl/test/utility.cpp b/modules/ocl/test/utility.cpp index 8521dff821..fc7cb5e6c6 100644 --- a/modules/ocl/test/utility.cpp +++ b/modules/ocl/test/utility.cpp @@ -230,21 +230,25 @@ double checkRectSimilarity(Size sz, std::vector& ob1, std::vector& o return final_test_result; } -void showDiff(const Mat& gold, const Mat& actual, double eps) +void showDiff(const Mat& gold, const Mat& actual, double eps, bool alwaysShow) { - Mat diff; + Mat diff, diff_thresh; absdiff(gold, actual, diff); - threshold(diff, diff, eps, 255.0, cv::THRESH_BINARY); + diff.convertTo(diff, CV_32F); + threshold(diff, diff_thresh, eps, 255.0, cv::THRESH_BINARY); - namedWindow("gold", WINDOW_NORMAL); - namedWindow("actual", WINDOW_NORMAL); - namedWindow("diff", WINDOW_NORMAL); + if (alwaysShow || cv::countNonZero(diff_thresh.reshape(1)) > 0) + { + namedWindow("gold", WINDOW_NORMAL); + namedWindow("actual", WINDOW_NORMAL); + namedWindow("diff", WINDOW_NORMAL); - imshow("gold", gold); - imshow("actual", actual); - imshow("diff", diff); + imshow("gold", gold); + imshow("actual", actual); + imshow("diff", diff); - waitKey(); + waitKey(); + } } } // namespace cvtest diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp index 0f0fac3138..759194e7fa 100644 --- a/modules/ocl/test/utility.hpp +++ b/modules/ocl/test/utility.hpp @@ -54,7 +54,7 @@ extern int LOOP_TIMES; namespace cvtest { -void showDiff(const Mat& gold, const Mat& actual, double eps); +void showDiff(const Mat& gold, const Mat& actual, double eps, bool alwaysShow = false); cv::ocl::oclMat createMat_ocl(cv::RNG& rng, Size size, int type, bool useRoi); cv::ocl::oclMat loadMat_ocl(cv::RNG& rng, const Mat& m, bool useRoi); @@ -74,6 +74,13 @@ double checkNorm(const cv::Mat &m); 
double checkNorm(const cv::Mat &m1, const cv::Mat &m2); double checkSimilarity(const cv::Mat &m1, const cv::Mat &m2); +inline double checkNormRelative(const Mat &m1, const Mat &m2) +{ + return cv::norm(m1, m2, cv::NORM_INF) / + std::max((double)std::numeric_limits::epsilon(), + (double)std::max(cv::norm(m1, cv::NORM_INF), norm(m2, cv::NORM_INF))); +} + #define EXPECT_MAT_NORM(mat, eps) \ { \ EXPECT_LE(checkNorm(cv::Mat(mat)), eps) \ @@ -83,7 +90,16 @@ double checkSimilarity(const cv::Mat &m1, const cv::Mat &m2); { \ ASSERT_EQ(mat1.type(), mat2.type()); \ ASSERT_EQ(mat1.size(), mat2.size()); \ - EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps); \ + EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps) \ + << cv::format("Size: %d x %d", mat1.cols, mat1.rows) << std::endl; \ +} + +#define EXPECT_MAT_NEAR_RELATIVE(mat1, mat2, eps) \ +{ \ + ASSERT_EQ(mat1.type(), mat2.type()); \ + ASSERT_EQ(mat1.size(), mat2.size()); \ + EXPECT_LE(checkNormRelative(cv::Mat(mat1), cv::Mat(mat2)), eps) \ + << cv::format("Size: %d x %d", mat1.cols, mat1.rows) << std::endl; \ } #define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \ diff --git a/modules/superres/perf/perf_superres_ocl.cpp b/modules/superres/perf/perf_superres_ocl.cpp index 67bcf8cbcd..04a3f7e85b 100644 --- a/modules/superres/perf/perf_superres_ocl.cpp +++ b/modules/superres/perf/perf_superres_ocl.cpp @@ -42,7 +42,7 @@ #include "perf_precomp.hpp" -#ifdef HAVE_OPENCL +#ifdef HAVE_OPENCV_OCL #include "opencv2/ocl.hpp" using namespace std; diff --git a/modules/video/src/kalman.cpp b/modules/video/src/kalman.cpp index 793404a393..03d88f50d9 100644 --- a/modules/video/src/kalman.cpp +++ b/modules/video/src/kalman.cpp @@ -96,6 +96,7 @@ const Mat& KalmanFilter::predict(const Mat& control) // handle the case when there will be measurement before the next predict. 
statePre.copyTo(statePost); + errorCovPre.copyTo(errorCovPost); return statePre; } diff --git a/platforms/android/service/doc/JavaHelper.rst b/platforms/android/service/doc/JavaHelper.rst index 9262a7cf73..5c1e1c3256 100644 --- a/platforms/android/service/doc/JavaHelper.rst +++ b/platforms/android/service/doc/JavaHelper.rst @@ -59,3 +59,7 @@ OpenCV version constants .. data:: OPENCV_VERSION_2_4_6 OpenCV Library version 2.4.6 + +.. data:: OPENCV_VERSION_2_4_7 + + OpenCV Library version 2.4.7 diff --git a/platforms/android/service/engine/AndroidManifest.xml b/platforms/android/service/engine/AndroidManifest.xml index dc992b3a62..8d7894797e 100644 --- a/platforms/android/service/engine/AndroidManifest.xml +++ b/platforms/android/service/engine/AndroidManifest.xml @@ -1,8 +1,8 @@ + android:versionCode="213@ANDROID_PLATFORM_VERSION_CODE@" + android:versionName="2.13" > diff --git a/platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp b/platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp index b0b2b5137f..dbd192b796 100644 --- a/platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp +++ b/platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp @@ -15,7 +15,7 @@ using namespace android; const int OpenCVEngine::Platform = DetectKnownPlatforms(); const int OpenCVEngine::CpuID = GetCpuID(); -const int OpenCVEngine::KnownVersions[] = {2040000, 2040100, 2040200, 2040300, 2040301, 2040302, 2040400, 2040500, 2040600}; +const int OpenCVEngine::KnownVersions[] = {2040000, 2040100, 2040200, 2040300, 2040301, 2040302, 2040400, 2040500, 2040600, 2040700}; bool OpenCVEngine::ValidateVersion(int version) { diff --git a/samples/gpu/CMakeLists.txt b/samples/gpu/CMakeLists.txt index 2591d329dc..64c25fc092 100644 --- a/samples/gpu/CMakeLists.txt +++ b/samples/gpu/CMakeLists.txt @@ -60,7 +60,7 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND) target_link_libraries(${the_target} opencv_cudacodec) endif() - if(HAVE_OPENCL) + 
if(HAVE_opencv_ocl) target_link_libraries(${the_target} opencv_ocl) endif() diff --git a/samples/ocl/tvl1_optical_flow.cpp b/samples/ocl/tvl1_optical_flow.cpp index 046e0cba70..f678dd6fda 100644 --- a/samples/ocl/tvl1_optical_flow.cpp +++ b/samples/ocl/tvl1_optical_flow.cpp @@ -184,7 +184,7 @@ int main(int argc, const char* argv[]) else frame0.copyTo(frameCopy); getFlowField(flow_vec[0], flow_vec[1], show_flow); - imshow("PyrLK [Sparse]", show_flow); + imshow("tvl1 optical flow field", show_flow); } if( waitKey( 10 ) >= 0 )