diff --git a/3rdparty/tbb/CMakeLists.txt b/3rdparty/tbb/CMakeLists.txt
index 3c6ae98f45..af1581349e 100644
--- a/3rdparty/tbb/CMakeLists.txt
+++ b/3rdparty/tbb/CMakeLists.txt
@@ -122,15 +122,32 @@ file(GLOB lib_srcs "${tbb_src_dir}/src/tbb/*.cpp")
 file(GLOB lib_hdrs "${tbb_src_dir}/src/tbb/*.h")
 list(APPEND lib_srcs "${tbb_src_dir}/src/rml/client/rml_tbb.cpp")
 
-add_definitions(-D__TBB_DYNAMIC_LOAD_ENABLED=0         #required
-                -D__TBB_BUILD=1                        #required
-                -D__TBB_SURVIVE_THREAD_SWITCH=0        #no cilk support
-                -DUSE_PTHREAD                          #required for Unix
-                -DTBB_USE_GCC_BUILTINS=1               #required for ARM GCC
-                -DTBB_USE_DEBUG=0                      #just to be sure
-                -DTBB_NO_LEGACY=1                      #don't need backward compatibility
-                -DDO_ITT_NOTIFY=0                      #it seems that we don't need these notifications
-               )
+if (WIN32)
+  add_definitions(-D__TBB_DYNAMIC_LOAD_ENABLED=0
+                -D__TBB_BUILD=1
+                -D_UNICODE
+                -DUNICODE
+                -DWINAPI_FAMILY=WINAPI_FAMILY_APP
+                -DDO_ITT_NOTIFY=0
+               ) # defines were copied from windows.cl.inc
+set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} /APPCONTAINER")
+else()
+  add_definitions(-D__TBB_DYNAMIC_LOAD_ENABLED=0         #required
+                  -D__TBB_BUILD=1                        #required
+                  -D__TBB_SURVIVE_THREAD_SWITCH=0        #no cilk support
+                  -DTBB_USE_DEBUG=0                      #just to be sure
+                  -DTBB_NO_LEGACY=1                      #don't need backward compatibility
+                  -DDO_ITT_NOTIFY=0                      #it seems that we don't need these notifications
+                 )
+endif()
+
+if (HAVE_LIBPTHREAD)
+  add_definitions(-DUSE_PTHREAD) #required for Unix
+endif()
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+  add_definitions(-DTBB_USE_GCC_BUILTINS=1) #required for ARM GCC
+endif()
 
 if(ANDROID_COMPILER_IS_CLANG)
   add_definitions(-D__TBB_GCC_BUILTIN_ATOMICS_PRESENT=1)
@@ -145,7 +162,7 @@ endif()
 
 set(TBB_SOURCE_FILES ${lib_srcs} ${lib_hdrs})
 
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
+if (ARM AND NOT WIN32)
   if (NOT ANDROID)
     set(TBB_SOURCE_FILES ${TBB_SOURCE_FILES} "${CMAKE_CURRENT_SOURCE_DIR}/arm_linux_stub.cpp")
   endif()
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a952826395..a7a09ee322 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -118,6 +118,7 @@ OCV_OPTION(WITH_CUFFT          "Include NVidia Cuda Fast Fourier Transform (FFT)
 OCV_OPTION(WITH_CUBLAS         "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_NVCUVID        "Include NVidia Video Decoding library support"                               OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) )
 OCV_OPTION(WITH_EIGEN          "Include Eigen2/Eigen3 support"               ON)
+OCV_OPTION(WITH_VFW            "Include Video for Windows support"           ON   IF WIN32 )
 OCV_OPTION(WITH_FFMPEG         "Include FFMPEG support"                      ON   IF (NOT ANDROID AND NOT IOS))
 OCV_OPTION(WITH_GSTREAMER      "Include Gstreamer support"                   ON   IF (UNIX AND NOT APPLE AND NOT ANDROID) )
 OCV_OPTION(WITH_GSTREAMER_1_X  "Include Gstreamer 1.x support"               OFF)
@@ -133,13 +134,15 @@ OCV_OPTION(WITH_PNG            "Include PNG support"                         ON)
 OCV_OPTION(WITH_PVAPI          "Include Prosilica GigE support"              ON   IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_GIGEAPI        "Include Smartek GigE support"                ON   IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_QT             "Build with Qt Backend support"               OFF  IF (NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_WIN32UI        "Build with Win32 UI Backend support"         ON   IF WIN32 )
 OCV_OPTION(WITH_QUICKTIME      "Use QuickTime for Video I/O insted of QTKit" OFF  IF APPLE )
 OCV_OPTION(WITH_TBB            "Include Intel TBB support"                   OFF  IF (NOT IOS) )
 OCV_OPTION(WITH_CSTRIPES       "Include C= support"                          OFF  IF WIN32 )
 OCV_OPTION(WITH_TIFF           "Include TIFF support"                        ON   IF (NOT IOS) )
 OCV_OPTION(WITH_UNICAP         "Include Unicap support (GPL)"                OFF  IF (UNIX AND NOT APPLE AND NOT ANDROID) )
 OCV_OPTION(WITH_V4L            "Include Video 4 Linux support"               ON   IF (UNIX AND NOT ANDROID) )
-OCV_OPTION(WITH_VIDEOINPUT     "Build HighGUI with DirectShow support"       ON   IF WIN32 )
+OCV_OPTION(WITH_DSHOW          "Build HighGUI with DirectShow support"       ON   IF (WIN32 AND NOT ARM) )
+OCV_OPTION(WITH_MSMF           "Build HighGUI with Media Foundation support" OFF  IF WIN32 )
 OCV_OPTION(WITH_XIMEA          "Include XIMEA cameras support"               OFF  IF (NOT ANDROID AND NOT APPLE) )
 OCV_OPTION(WITH_XINE           "Include Xine support (GPL)"                  OFF  IF (UNIX AND NOT APPLE AND NOT ANDROID) )
 OCV_OPTION(WITH_CLP            "Include Clp support (EPL)"                   OFF)
@@ -171,7 +174,7 @@ OCV_OPTION(BUILD_JASPER             "Build libjasper from source"        WIN32 O
 OCV_OPTION(BUILD_JPEG               "Build libjpeg from source"          WIN32 OR ANDROID OR APPLE )
 OCV_OPTION(BUILD_PNG                "Build libpng from source"           WIN32 OR ANDROID OR APPLE )
 OCV_OPTION(BUILD_OPENEXR            "Build openexr from source"          WIN32 OR ANDROID OR APPLE )
-OCV_OPTION(BUILD_TBB                "Download and build TBB from source" ANDROID IF CMAKE_COMPILER_IS_GNUCXX )
+OCV_OPTION(BUILD_TBB                "Download and build TBB from source" ANDROID )
 
 # OpenCV installation options
 # ===================================================
@@ -600,8 +603,8 @@ else()
   if(DEFINED WITH_QT)
     status("    QT 4.x:" NO)
   endif()
-  if(WIN32)
-    status("    Win32 UI:" YES)
+  if(DEFINED WITH_WIN32UI)
+    status("    Win32 UI:" HAVE_WIN32UI THEN YES ELSE NO)
   else()
     if(APPLE)
       if(WITH_CARBON)
@@ -665,6 +668,10 @@ endif()
 status("")
 status("  Video I/O:")
 
+if (DEFINED WITH_VFW)
+  status("    Video for Windows:" HAVE_VFW         THEN YES                                        ELSE NO)
+endif(DEFINED WITH_VFW)
+
 if(DEFINED WITH_1394)
   status("    DC1394 1.x:"     HAVE_DC1394         THEN "YES (ver ${ALIASOF_libdc1394_VERSION})"   ELSE NO)
   status("    DC1394 2.x:"     HAVE_DC1394_2       THEN "YES (ver ${ALIASOF_libdc1394-2_VERSION})" ELSE NO)
@@ -749,9 +756,13 @@ if(DEFINED WITH_V4L)
                                                    ELSE "${HAVE_CAMV4L_STR}/${HAVE_CAMV4L2_STR}")
 endif(DEFINED WITH_V4L)
 
-if(DEFINED WITH_VIDEOINPUT)
-  status("    DirectShow:"     HAVE_VIDEOINPUT     THEN YES                                        ELSE NO)
-endif(DEFINED WITH_VIDEOINPUT)
+if(DEFINED WITH_DSHOW)
+  status("    DirectShow:"     HAVE_DSHOW     THEN YES                                        ELSE NO)
+endif(DEFINED WITH_DSHOW)
+
+if(DEFINED WITH_MSMF)
+  status("    Media Foundation:" HAVE_MSMF    THEN YES                                        ELSE NO)
+endif(DEFINED WITH_MSMF)
 
 if(DEFINED WITH_XIMEA)
   status("    XIMEA:"          HAVE_XIMEA          THEN YES                                        ELSE NO)
diff --git a/android/android.toolchain.cmake b/android/android.toolchain.cmake
index f5daf307f7..0f7e340678 100644
--- a/android/android.toolchain.cmake
+++ b/android/android.toolchain.cmake
@@ -1,5 +1,5 @@
 # Copyright (c) 2010-2011, Ethan Rublee
-# Copyright (c) 2011-2012, Andrey Kamaev
+# Copyright (c) 2011-2013, Andrey Kamaev
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -281,8 +281,14 @@
 #     [+] updated for NDK r8c
 #     [+] added support for clang compiler
 #   - December 2012
+#     [+] suppress warning about unused CMAKE_TOOLCHAIN_FILE variable
+#     [+] adjust API level to closest compatible as NDK does
 #     [~] fixed ccache full path search
 #     [+] updated for NDK r8d
+#     [~] compiler options are aligned with NDK r8d
+#   - March 2013
+#     [+] updated for NDK r8e (x86 version)
+#     [+] support x86_64 version of NDK
 # ------------------------------------------------------------------------------
 
 cmake_minimum_required( VERSION 2.6.3 )
@@ -292,6 +298,10 @@ if( DEFINED CMAKE_CROSSCOMPILING )
  return()
 endif()
 
+if( CMAKE_TOOLCHAIN_FILE )
+ # touch toolchain variable only to suppress "unused variable" warning
+endif()
+
 get_property( _CMAKE_IN_TRY_COMPILE GLOBAL PROPERTY IN_TRY_COMPILE )
 if( _CMAKE_IN_TRY_COMPILE )
  include( "${CMAKE_CURRENT_SOURCE_DIR}/../android.toolchain.config.cmake" OPTIONAL )
@@ -305,7 +315,7 @@ set( CMAKE_SYSTEM_VERSION 1 )
 # rpath makes low sence for Android
 set( CMAKE_SKIP_RPATH TRUE CACHE BOOL "If set, runtime paths are not added when using shared libraries." )
 
-set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r8d -r8c -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" )
+set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r8e -r8d -r8c -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" )
 if(NOT DEFINED ANDROID_NDK_SEARCH_PATHS)
  if( CMAKE_HOST_WIN32 )
   file( TO_CMAKE_PATH "$ENV{PROGRAMFILES}" ANDROID_NDK_SEARCH_PATHS )
@@ -449,19 +459,32 @@ if( ANDROID_FORBID_SYGWIN )
  endif()
 endif()
 
+
 # detect current host platform
+if( NOT DEFINED ANDROID_NDK_HOST_X64 AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64")
+ set( ANDROID_NDK_HOST_X64 1 CACHE BOOL "Try to use 64-bit compiler toolchain" )
+ mark_as_advanced( ANDROID_NDK_HOST_X64 )
+endif()
+
 set( TOOL_OS_SUFFIX "" )
 if( CMAKE_HOST_APPLE )
- set( ANDROID_NDK_HOST_SYSTEM_NAME "darwin-x86" )
+ set( ANDROID_NDK_HOST_SYSTEM_NAME "darwin-x86_64" )
+ set( ANDROID_NDK_HOST_SYSTEM_NAME2 "darwin-x86" )
 elseif( CMAKE_HOST_WIN32 )
- set( ANDROID_NDK_HOST_SYSTEM_NAME "windows" )
+ set( ANDROID_NDK_HOST_SYSTEM_NAME "windows-x86_64" )
+ set( ANDROID_NDK_HOST_SYSTEM_NAME2 "windows" )
  set( TOOL_OS_SUFFIX ".exe" )
 elseif( CMAKE_HOST_UNIX )
- set( ANDROID_NDK_HOST_SYSTEM_NAME "linux-x86" )
+ set( ANDROID_NDK_HOST_SYSTEM_NAME "linux-x86_64" )
+ set( ANDROID_NDK_HOST_SYSTEM_NAME2 "linux-x86" )
 else()
  message( FATAL_ERROR "Cross-compilation on your platform is not supported by this cmake toolchain" )
 endif()
 
+if( NOT ANDROID_NDK_HOST_X64 )
+ set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+endif()
+
 # see if we have path to Android NDK
 __INIT_VARIABLE( ANDROID_NDK PATH ENV_ANDROID_NDK )
 if( NOT ANDROID_NDK )
@@ -509,7 +532,8 @@ if( ANDROID_NDK )
  endif()
  set( ANDROID_NDK "${ANDROID_NDK}" CACHE INTERNAL "Path of the Android NDK" FORCE )
  set( BUILD_WITH_ANDROID_NDK True )
- file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE LIMIT_COUNT 1 REGEX r[0-9]+[a-z]? )
+ file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX r[0-9]+[a-z]? )
+ string( REGEX MATCH r[0-9]+[a-z]? ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
 elseif( ANDROID_STANDALONE_TOOLCHAIN )
  get_filename_component( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" ABSOLUTE )
  # try to detect change
@@ -563,22 +587,21 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
  endif()
 endif()
 
-macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar )
- foreach( __toolchain ${${__availableToolchainsVar}} )
+macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __host_system_name )
+ foreach( __toolchain ${${__availableToolchainsLst}} )
   if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK}/toolchains/${__toolchain}/prebuilt/" )
    string( REGEX REPLACE "-clang3[.][0-9]$" "-4.6" __gcc_toolchain "${__toolchain}" )
   else()
    set( __gcc_toolchain "${__toolchain}" )
   endif()
-  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK}/toolchains/${__gcc_toolchain}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK}/toolchains/${__gcc_toolchain}/prebuilt/${__host_system_name}" )
   if( __machine )
    string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9]+)?$" __version "${__gcc_toolchain}" )
    string( REGEX MATCH "^[^-]+" __arch "${__gcc_toolchain}" )
    list( APPEND __availableToolchainMachines "${__machine}" )
    list( APPEND __availableToolchainArchs "${__arch}" )
    list( APPEND __availableToolchainCompilerVersions "${__version}" )
-  else()
-   list( REMOVE_ITEM ${__availableToolchainsVar} "${__toolchain}" )
+   list( APPEND ${__availableToolchainsVar} "${__toolchain}" )
   endif()
   unset( __gcc_toolchain )
  endforeach()
@@ -594,17 +617,29 @@ if( BUILD_WITH_ANDROID_NDK )
  set( __availableToolchainCompilerVersions "" )
  if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK}/toolchains/${ANDROID_TOOLCHAIN_NAME}/" )
   # do not go through all toolchains if we know the name
-  set( __availableToolchains "${ANDROID_TOOLCHAIN_NAME}" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains )
+  set( __availableToolchainsLst "${ANDROID_TOOLCHAIN_NAME}" )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME} )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_HOST_SYSTEM_NAME STREQUAL ANDROID_NDK_HOST_SYSTEM_NAME2 )
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+   if( __availableToolchains )
+    set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+   endif()
+  endif()
  endif()
  if( NOT __availableToolchains )
-  file( GLOB __availableToolchains RELATIVE "${ANDROID_NDK}/toolchains" "${ANDROID_NDK}/toolchains/*" )
+  file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK}/toolchains" "${ANDROID_NDK}/toolchains/*" )
   if( __availableToolchains )
-   list(SORT __availableToolchains) # we need clang to go after gcc
+   list(SORT __availableToolchainsLst) # we need clang to go after gcc
+  endif()
+  __LIST_FILTER( __availableToolchainsLst "^[.]" )
+  __LIST_FILTER( __availableToolchainsLst "llvm" )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME} )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_HOST_SYSTEM_NAME STREQUAL ANDROID_NDK_HOST_SYSTEM_NAME2 )
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+   if( __availableToolchains )
+    set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+   endif()
   endif()
-  __LIST_FILTER( __availableToolchains "^[.]" )
-  __LIST_FILTER( __availableToolchains "llvm" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains )
  endif()
  if( NOT __availableToolchains )
   message( FATAL_ERROR "Could not find any working toolchain in the NDK. Probably your Android NDK is broken." )
@@ -617,11 +652,11 @@ set( __uniqToolchainArchNames ${__availableToolchainArchs} )
 list( REMOVE_DUPLICATES __uniqToolchainArchNames )
 list( SORT __uniqToolchainArchNames )
 foreach( __arch ${__uniqToolchainArchNames} )
-list( APPEND ANDROID_SUPPORTED_ABIS ${ANDROID_SUPPORTED_ABIS_${__arch}} )
+ list( APPEND ANDROID_SUPPORTED_ABIS ${ANDROID_SUPPORTED_ABIS_${__arch}} )
 endforeach()
 unset( __uniqToolchainArchNames )
 if( NOT ANDROID_SUPPORTED_ABIS )
-message( FATAL_ERROR "No one of known Android ABIs is supported by this cmake toolchain." )
+ message( FATAL_ERROR "No one of known Android ABIs is supported by this cmake toolchain." )
 endif()
 
 # choose target ABI
@@ -760,11 +795,22 @@ unset( __availableToolchainCompilerVersions )
 # choose native API level
 __INIT_VARIABLE( ANDROID_NATIVE_API_LEVEL ENV_ANDROID_NATIVE_API_LEVEL ANDROID_API_LEVEL ENV_ANDROID_API_LEVEL ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ANDROID_DEFAULT_NDK_API_LEVEL_${ANDROID_ARCH_NAME} ANDROID_DEFAULT_NDK_API_LEVEL )
 string( REGEX MATCH "[0-9]+" ANDROID_NATIVE_API_LEVEL "${ANDROID_NATIVE_API_LEVEL}" )
-# TODO: filter out unsupported levels
+# adjust API level
+set( __real_api_level ${ANDROID_DEFAULT_NDK_API_LEVEL_${ANDROID_ARCH_NAME}} )
+foreach( __level ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
+ if( NOT __level GREATER ANDROID_NATIVE_API_LEVEL AND NOT __level LESS __real_api_level )
+  set( __real_api_level ${__level} )
+ endif()
+endforeach()
+if( __real_api_level AND NOT ANDROID_NATIVE_API_LEVEL EQUAL __real_api_level )
+ message( STATUS "Adjusting Android API level 'android-${ANDROID_NATIVE_API_LEVEL}' to 'android-${__real_api_level}'")
+ set( ANDROID_NATIVE_API_LEVEL ${__real_api_level} )
+endif()
+unset(__real_api_level)
 # validate
 list( FIND ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_NATIVE_API_LEVEL}" __levelIdx )
 if( __levelIdx EQUAL -1 )
- message( SEND_ERROR "Specified Android native API level (${ANDROID_NATIVE_API_LEVEL}) is not supported by your NDK/toolchain." )
+ message( SEND_ERROR "Specified Android native API level 'android-${ANDROID_NATIVE_API_LEVEL}' is not supported by your NDK/toolchain." )
 else()
  if( BUILD_WITH_ANDROID_NDK )
   __DETECT_NATIVE_API_LEVEL( __realApiLevel "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}/usr/include/android/api-level.h" )
@@ -926,7 +972,7 @@ elseif( "${ANDROID_TOOLCHAIN_NAME}" MATCHES "-clang3[.][0-9]?$" )
  string( REGEX MATCH "3[.][0-9]$" ANDROID_CLANG_VERSION "${ANDROID_TOOLCHAIN_NAME}")
  string( REGEX REPLACE "-clang${ANDROID_CLANG_VERSION}$" "-4.6" ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
  if( NOT EXISTS "${ANDROID_NDK}/toolchains/llvm-${ANDROID_CLANG_VERSION}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}/bin/clang${TOOL_OS_SUFFIX}" )
-  message( FATAL_ERROR "Could not find the " )
+  message( FATAL_ERROR "Could not find the Clang compiler driver" )
  endif()
  set( ANDROID_COMPILER_IS_CLANG 1 )
  set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK}/toolchains/llvm-${ANDROID_CLANG_VERSION}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
@@ -1140,38 +1186,52 @@ endif()
 
 # NDK flags
 if( ARMEABI OR ARMEABI_V7A )
- set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fpic -D__ARM_ARCH_5__ -D__ARM_ARCH_5T__ -D__ARM_ARCH_5E__ -D__ARM_ARCH_5TE__" )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fpic -funwind-tables" )
  if( NOT ANDROID_FORCE_ARM_BUILD AND NOT ARMEABI_V6 )
-  # It is recommended to use the -mthumb compiler flag to force the generation
-  # of 16-bit Thumb-1 instructions (the default being 32-bit ARM ones).
-  set( ANDROID_CXX_FLAGS_RELEASE "-mthumb" )
-  set( ANDROID_CXX_FLAGS_DEBUG   "-marm -finline-limit=64" )
+  set( ANDROID_CXX_FLAGS_RELEASE "-mthumb -fomit-frame-pointer -fno-strict-aliasing" )
+  set( ANDROID_CXX_FLAGS_DEBUG   "-marm -fno-omit-frame-pointer -fno-strict-aliasing" )
+  if( NOT ANDROID_COMPILER_IS_CLANG )
+   set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -finline-limit=64" )
+  endif()
  else()
   # always compile ARMEABI_V6 in arm mode; otherwise there is no difference from ARMEABI
-  # O3 instead of O2/Os in release mode - like cmake sets for desktop gcc
-  set( ANDROID_CXX_FLAGS_RELEASE "-marm" )
-  set( ANDROID_CXX_FLAGS_DEBUG   "-marm -finline-limit=300" )
+  set( ANDROID_CXX_FLAGS_RELEASE "-marm -fomit-frame-pointer -fstrict-aliasing" )
+  set( ANDROID_CXX_FLAGS_DEBUG   "-marm -fno-omit-frame-pointer -fno-strict-aliasing" )
+  if( NOT ANDROID_COMPILER_IS_CLANG )
+   set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funswitch-loops -finline-limit=300" )
+  endif()
  endif()
 elseif( X86 )
- set( ANDROID_CXX_FLAGS         "${ANDROID_CXX_FLAGS} -funwind-tables" )
- set( ANDROID_CXX_FLAGS_RELEASE "" )
- set( ANDROID_CXX_FLAGS_DEBUG   "-finline-limit=300" )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funwind-tables" )
+ if( NOT ANDROID_COMPILER_IS_CLANG )
+  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funswitch-loops -finline-limit=300" )
+ else()
+  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fPIC" )
+ endif()
+ set( ANDROID_CXX_FLAGS_RELEASE "-fomit-frame-pointer -fstrict-aliasing" )
+ set( ANDROID_CXX_FLAGS_DEBUG   "-fno-omit-frame-pointer -fno-strict-aliasing" )
 elseif( MIPS )
- set( ANDROID_CXX_FLAGS         "${ANDROID_CXX_FLAGS} -fpic -funwind-tables -fmessage-length=0 -fno-inline-functions-called-once -frename-registers" )
- set( ANDROID_CXX_FLAGS_RELEASE "-finline-limit=300 -fno-strict-aliasing" )
- set( ANDROID_CXX_FLAGS_DEBUG   "-finline-functions -fgcse-after-reload -frerun-cse-after-loop" )
+ set( ANDROID_CXX_FLAGS         "${ANDROID_CXX_FLAGS} -fpic -fno-strict-aliasing -finline-functions -ffunction-sections -funwind-tables -fmessage-length=0" )
+ set( ANDROID_CXX_FLAGS_RELEASE "-fomit-frame-pointer" )
+ set( ANDROID_CXX_FLAGS_DEBUG   "-fno-omit-frame-pointer" )
+ if( NOT ANDROID_COMPILER_IS_CLANG )
+  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fno-inline-functions-called-once -fgcse-after-reload -frerun-cse-after-loop -frename-registers" )
+  set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE} -funswitch-loops -finline-limit=300" )
+ endif()
 elseif()
  set( ANDROID_CXX_FLAGS_RELEASE "" )
  set( ANDROID_CXX_FLAGS_DEBUG   "" )
 endif()
 
-if( NOT X86 )
- set( ANDROID_CXX_FLAGS         "-Wno-psabi ${ANDROID_CXX_FLAGS}" )
+set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fsigned-char" ) # good/necessary when porting desktop libraries
+
+if( NOT X86 AND NOT ANDROID_COMPILER_IS_CLANG )
+ set( ANDROID_CXX_FLAGS "-Wno-psabi ${ANDROID_CXX_FLAGS}" )
 endif()
 
-set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fsigned-char" ) # good/necessary when porting desktop libraries
-set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE} -fomit-frame-pointer" )
-set( ANDROID_CXX_FLAGS_DEBUG   "${ANDROID_CXX_FLAGS_DEBUG} -fno-strict-aliasing -fno-omit-frame-pointer" )
+if( NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.6" )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -no-canonical-prefixes" ) # see https://android-review.googlesource.com/#/c/47564/
+endif()
 
 # ABI-specific flags
 if( ARMEABI_V7A )
@@ -1308,9 +1368,6 @@ if( ANDROID_COMPILER_IS_CLANG )
   set( ANDROID_CXX_FLAGS "-target ${ANDROID_LLVM_TRIPLE} ${ANDROID_CXX_FLAGS}" )
  endif()
  if( BUILD_WITH_ANDROID_NDK )
-  if(ANDROID_ARCH_NAME STREQUAL "arm" )
-   set( ANDROID_CXX_FLAGS "-isystem ${ANDROID_CLANG_TOOLCHAIN_ROOT}/lib/clang/${ANDROID_CLANG_VERSION}/include ${ANDROID_CXX_FLAGS}" )
-  endif()
   set( ANDROID_CXX_FLAGS "-gcc-toolchain ${ANDROID_TOOLCHAIN_ROOT} ${ANDROID_CXX_FLAGS}" )
  endif()
 endif()
@@ -1326,6 +1383,12 @@ set( CMAKE_SHARED_LINKER_FLAGS ""                        CACHE STRING "shared li
 set( CMAKE_MODULE_LINKER_FLAGS ""                        CACHE STRING "module linker flags" )
 set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-z,nocopyreloc"      CACHE STRING "executable linker flags" )
 
+# put flags to cache (for debug purpose only)
+set( ANDROID_CXX_FLAGS         "${ANDROID_CXX_FLAGS}"         CACHE INTERNAL "Android specific c/c++ flags" )
+set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE}" CACHE INTERNAL "Android specific c/c++ Release flags" )
+set( ANDROID_CXX_FLAGS_DEBUG   "${ANDROID_CXX_FLAGS_DEBUG}"   CACHE INTERNAL "Android specific c/c++ Debug flags" )
+set( ANDROID_LINKER_FLAGS      "${ANDROID_LINKER_FLAGS}"      CACHE INTERNAL "Android specific c/c++ linker flags" )
+
 # finish flags
 set( CMAKE_CXX_FLAGS           "${ANDROID_CXX_FLAGS} ${CMAKE_CXX_FLAGS}" )
 set( CMAKE_C_FLAGS             "${ANDROID_CXX_FLAGS} ${CMAKE_C_FLAGS}" )
@@ -1456,6 +1519,7 @@ endmacro()
 if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
  set( __toolchain_config "")
  foreach( __var NDK_CCACHE  LIBRARY_OUTPUT_PATH_ROOT  ANDROID_FORBID_SYGWIN  ANDROID_SET_OBSOLETE_VARIABLES
+                ANDROID_NDK_HOST_X64
                 ANDROID_NDK
                 ANDROID_STANDALONE_TOOLCHAIN
                 ANDROID_TOOLCHAIN_NAME
@@ -1512,6 +1576,7 @@ endif()
 #   ANDROID_NDK
 #   ANDROID_STANDALONE_TOOLCHAIN
 #   ANDROID_TOOLCHAIN_NAME : the NDK name of compiler toolchain
+#   ANDROID_NDK_HOST_X64 : try to use x86_64 toolchain (default for x64 host systems)
 #   LIBRARY_OUTPUT_PATH_ROOT : <any valid path>
 #   NDK_CCACHE : <path to your ccache executable>
 # Obsolete:
@@ -1536,7 +1601,7 @@ endif()
 #   BUILD_WITH_STANDALONE_TOOLCHAIN : TRUE if standalone toolchain is used
 #   ANDROID_NDK_HOST_SYSTEM_NAME : "windows", "linux-x86" or "darwin-x86" depending on host platform
 #   ANDROID_NDK_ABI_NAME : "armeabi", "armeabi-v7a", "x86" or "mips" depending on ANDROID_ABI
-#   ANDROID_NDK_RELEASE : one of r5, r5b, r5c, r6, r6b, r7, r7b, r7c, r8, r8b, r8c, r8d; set only for NDK
+#   ANDROID_NDK_RELEASE : one of r5, r5b, r5c, r6, r6b, r7, r7b, r7c, r8, r8b, r8c, r8d, r8e; set only for NDK
 #   ANDROID_ARCH_NAME : "arm" or "x86" or "mips" depending on ANDROID_ABI
 #   ANDROID_SYSROOT : path to the compiler sysroot
 #   TOOL_OS_SUFFIX : "" or ".exe" depending on host platform
diff --git a/android/service/doc/JavaHelper.rst b/android/service/doc/JavaHelper.rst
index 34798c267e..e90b016e54 100644
--- a/android/service/doc/JavaHelper.rst
+++ b/android/service/doc/JavaHelper.rst
@@ -51,3 +51,7 @@ OpenCV version constants
 .. data:: OPENCV_VERSION_2_4_4
 
     OpenCV Library version 2.4.4
+
+.. data:: OPENCV_VERSION_2_4_5
+
+    OpenCV Library version 2.4.5
diff --git a/android/service/engine/AndroidManifest.xml b/android/service/engine/AndroidManifest.xml
index f4f0eb94fa..9549556782 100644
--- a/android/service/engine/AndroidManifest.xml
+++ b/android/service/engine/AndroidManifest.xml
@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
     package="org.opencv.engine"
-    android:versionCode="26@ANDROID_PLATFORM_VERSION_CODE@"
-    android:versionName="2.6" >
+    android:versionCode="27@ANDROID_PLATFORM_VERSION_CODE@"
+    android:versionName="2.7" >
 
     <uses-sdk android:minSdkVersion="@ANDROID_NATIVE_API_LEVEL@" />
     <uses-feature android:name="android.hardware.touchscreen" android:required="false"/>
@@ -26,6 +26,5 @@
             <category android:name="android.intent.category.LAUNCHER" />
         </intent-filter>
     </activity>
-
-</application>
+    </application>
 </manifest>
\ No newline at end of file
diff --git a/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp b/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
index c10ab54284..274e36a4b5 100644
--- a/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
+++ b/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
@@ -15,7 +15,7 @@ using namespace android;
 
 const int OpenCVEngine::Platform = DetectKnownPlatforms();
 const int OpenCVEngine::CpuID = GetCpuID();
-const int OpenCVEngine::KnownVersions[] = {2040000, 2040100, 2040200, 2040300, 2040301, 2040302, 2040400};
+const int OpenCVEngine::KnownVersions[] = {2040000, 2040100, 2040200, 2040300, 2040301, 2040302, 2040400, 2040500};
 
 bool OpenCVEngine::ValidateVersion(int version)
 {
diff --git a/android/service/engine/jni/Tests/OpenCVEngineTest.cpp b/android/service/engine/jni/Tests/OpenCVEngineTest.cpp
index ce5159f818..7473387a04 100644
--- a/android/service/engine/jni/Tests/OpenCVEngineTest.cpp
+++ b/android/service/engine/jni/Tests/OpenCVEngineTest.cpp
@@ -294,7 +294,7 @@ TEST(OpenCVEngineTest, GetPathFor2_4_5)
     Starter.PackageManager->InstallVersion(2040500, PLATFORM_UNKNOWN, ARCH_ARMv7);
     EXPECT_FALSE(NULL == Engine.get());
     String16 result = Engine->GetLibPathByVersion(String16("2.4.5"));
-    EXPECT_EQ(0, result.size()); // 2.4.5 is not published yet
+    EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a/lib", String8(result).string());
 }
 #endif
 
diff --git a/android/service/engine/project.properties b/android/service/engine/project.properties
index c6998b3d10..85aac54016 100644
--- a/android/service/engine/project.properties
+++ b/android/service/engine/project.properties
@@ -11,4 +11,4 @@
 #proguard.config=${sdk.dir}/tools/proguard/proguard-android.txt:proguard-project.txt
 
 # Project target.
-target=android-9
+target=android-8
diff --git a/android/service/readme.txt b/android/service/readme.txt
index df17c18245..f4e65eb369 100644
--- a/android/service/readme.txt
+++ b/android/service/readme.txt
@@ -14,20 +14,20 @@ manually using adb tool:
 
 .. code-block:: sh
 
-    adb install OpenCV-2.4.4-android-sdk/apk/OpenCV_2.4.4_Manager_2.6_<platform>.apk
+    adb install OpenCV-2.4.5-android-sdk/apk/OpenCV_2.4.5_Manager_2.7_<platform>.apk
 
 Use the table below to determine proper OpenCV Manager package for your device:
 
 +------------------------------+--------------+---------------------------------------------------+
 | Hardware Platform            | Android ver. | Package name                                      |
 +==============================+==============+===================================================+
-| armeabi-v7a (ARMv7-A + NEON) |    >= 2.3    | OpenCV_2.4.4_Manager_2.6_armv7a-neon.apk          |
+| armeabi-v7a (ARMv7-A + NEON) |    >= 2.3    | OpenCV_2.4.5_Manager_2.7_armv7a-neon.apk          |
 +------------------------------+--------------+---------------------------------------------------+
-| armeabi-v7a (ARMv7-A + NEON) |     = 2.2    | OpenCV_2.4.4_Manager_2.6_armv7a-neon-android8.apk |
+| armeabi-v7a (ARMv7-A + NEON) |     = 2.2    | OpenCV_2.4.5_Manager_2.7_armv7a-neon-android8.apk |
 +------------------------------+--------------+---------------------------------------------------+
-| armeabi (ARMv5, ARMv6)       |    >= 2.3    | OpenCV_2.4.4_Manager_2.6_armeabi.apk              |
+| armeabi (ARMv5, ARMv6)       |    >= 2.3    | OpenCV_2.4.5_Manager_2.7_armeabi.apk              |
 +------------------------------+--------------+---------------------------------------------------+
-| Intel x86                    |    >= 2.3    | OpenCV_2.4.4_Manager_2.6_x86.apk                  |
+| Intel x86                    |    >= 2.3    | OpenCV_2.4.5_Manager_2.7_x86.apk                  |
 +------------------------------+--------------+---------------------------------------------------+
-| MIPS                         |    >= 2.3    | OpenCV_2.4.4_Manager_2.6_mips.apk                 |
+| MIPS                         |    >= 2.3    | OpenCV_2.4.5_Manager_2.7_mips.apk                 |
 +------------------------------+--------------+---------------------------------------------------+
diff --git a/cmake/OpenCVDetectAndroidSDK.cmake b/cmake/OpenCVDetectAndroidSDK.cmake
index b125561d4a..92d7ba3272 100644
--- a/cmake/OpenCVDetectAndroidSDK.cmake
+++ b/cmake/OpenCVDetectAndroidSDK.cmake
@@ -278,6 +278,7 @@ macro(add_android_project target path)
         if (NATIVE_APP_GLUE)
           include_directories(${ANDROID_NDK}/sources/android/native_app_glue)
           list(APPEND android_proj_jni_files ${ANDROID_NDK}/sources/android/native_app_glue/android_native_app_glue.c)
+          ocv_warnings_disable(CMAKE_C_FLAGS -Wstrict-prototypes -Wunused-parameter -Wmissing-prototypes)
           set(android_proj_NATIVE_DEPS ${android_proj_NATIVE_DEPS} android)
         endif()
 
diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index e853a8d0a0..f3d101ab21 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -57,7 +57,7 @@ if(CUDA_FOUND)
   elseif(CUDA_GENERATION STREQUAL "Kepler")
     set(__cuda_arch_bin "3.0")
   elseif(CUDA_GENERATION STREQUAL "Auto")
-    execute_process( COMMAND "${CUDA_NVCC_EXECUTABLE}" "${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectCudaArch.cu" "--run"
+    execute_process( COMMAND "${CUDA_NVCC_EXECUTABLE}" "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu" "--run"
                      WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
                      RESULT_VARIABLE _nvcc_res OUTPUT_VARIABLE _nvcc_out
                      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -142,11 +142,14 @@ if(CUDA_FOUND)
     foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
       set(${var}_backup_in_cuda_compile_ "${${var}}")
 
-      # we reomove /EHa as it leasd warnings under windows
+      # we remove /EHa as it generates warnings under windows
       string(REPLACE "/EHa" "" ${var} "${${var}}")
 
       # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
       string(REPLACE "-ggdb3" "" ${var} "${${var}}")
+
+      # we remove -Wsign-promo as it generates warnings under linux
+      string(REPLACE "-Wsign-promo" "" ${var} "${${var}}")
     endforeach()
 
     if(BUILD_SHARED_LIBS)
diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake
index 6e02780009..9ee23da55b 100644
--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@@ -93,14 +93,16 @@ elseif(CMAKE_COMPILER_IS_GNUCXX)
         execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine
                   OUTPUT_VARIABLE CMAKE_OPENCV_GCC_TARGET_MACHINE
                   OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "64")
+        if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
             set(MINGW64 1)
         endif()
     endif()
 endif()
 
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR CMAKE_GENERATOR MATCHES "Visual Studio.*Win64")
+if(MINGW64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR CMAKE_GENERATOR MATCHES "Visual Studio.*Win64")
     set(X86_64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
     set(X86 1)
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm.*|ARM.*")
+    set(ARM 1)
 endif()
diff --git a/cmake/OpenCVFindLibsGUI.cmake b/cmake/OpenCVFindLibsGUI.cmake
index c883a80ced..3b42f1b0b3 100644
--- a/cmake/OpenCVFindLibsGUI.cmake
+++ b/cmake/OpenCVFindLibsGUI.cmake
@@ -2,6 +2,16 @@
 #  Detect 3rd-party GUI libraries
 # ----------------------------------------------------------------------------
 
+#--- Win32 UI ---
+ocv_clear_vars(HAVE_WIN32UI)
+if(WITH_WIN32UI)
+  TRY_COMPILE(HAVE_WIN32UI
+    "${OPENCV_BINARY_DIR}/CMakeFiles/CMakeTmp"
+    "${OpenCV_SOURCE_DIR}/cmake/checks/win32uitest.cpp"
+    CMAKE_FLAGS "\"user32.lib\" \"gdi32.lib\""
+    OUTPUT_VARIABLE OUTPUT)
+endif(WITH_WIN32UI)
+
 # --- QT4 ---
 ocv_clear_vars(HAVE_QT)
 if(WITH_QT)
@@ -25,7 +35,7 @@ endif()
 # --- OpenGl ---
 ocv_clear_vars(HAVE_OPENGL HAVE_QT_OPENGL)
 if(WITH_OPENGL)
-  if(WIN32 OR QT_QTOPENGL_FOUND OR HAVE_GTKGLEXT)
+  if(WITH_WIN32UI OR (HAVE_QT AND QT_QTOPENGL_FOUND) OR HAVE_GTKGLEXT)
     find_package (OpenGL QUIET)
     if(OPENGL_FOUND)
       set(HAVE_OPENGL TRUE)
diff --git a/cmake/OpenCVFindLibsGrfmt.cmake b/cmake/OpenCVFindLibsGrfmt.cmake
index fa90b515b2..ed62c93625 100644
--- a/cmake/OpenCVFindLibsGrfmt.cmake
+++ b/cmake/OpenCVFindLibsGrfmt.cmake
@@ -151,6 +151,7 @@ if(WITH_PNG AND NOT IOS)
   else()
     include(FindPNG)
     if(PNG_FOUND)
+      include(CheckIncludeFile)
       check_include_file("${PNG_PNG_INCLUDE_DIR}/png.h"        HAVE_PNG_H)
       check_include_file("${PNG_PNG_INCLUDE_DIR}/libpng/png.h" HAVE_LIBPNG_PNG_H)
       if(HAVE_PNG_H)
diff --git a/cmake/OpenCVFindLibsPerf.cmake b/cmake/OpenCVFindLibsPerf.cmake
index 6e497accd3..db71b8aebb 100644
--- a/cmake/OpenCVFindLibsPerf.cmake
+++ b/cmake/OpenCVFindLibsPerf.cmake
@@ -88,8 +88,9 @@ endif()
 # --- OpenMP ---
 if(NOT HAVE_TBB AND NOT HAVE_CSTRIPES)
   set(_fname "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/omptest.cpp")
-  FILE(WRITE "${_fname}" "#ifndef _OPENMP\n#error\n#endif\nint main() { return 0; }\n")
-  TRY_COMPILE(HAVE_OPENMP "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp" "${_fname}")
+  file(WRITE "${_fname}" "#ifndef _OPENMP\n#error\n#endif\nint main() { return 0; }\n")
+  try_compile(HAVE_OPENMP "${CMAKE_BINARY_DIR}" "${_fname}")
+  file(REMOVE "${_fname}")
 else()
   set(HAVE_OPENMP 0)
 endif()
@@ -104,8 +105,9 @@ endif()
 # --- Concurrency ---
 if(MSVC AND NOT HAVE_TBB AND NOT HAVE_CSTRIPES AND NOT HAVE_OPENMP)
   set(_fname "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/concurrencytest.cpp")
-  FILE(WRITE "${_fname}" "#if _MSC_VER < 1600\n#error\n#endif\nint main() { return 0; }\n")
-  TRY_COMPILE(HAVE_CONCURRENCY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp" "${_fname}")
+  file(WRITE "${_fname}" "#if _MSC_VER < 1600\n#error\n#endif\nint main() { return 0; }\n")
+  try_compile(HAVE_CONCURRENCY "${CMAKE_BINARY_DIR}" "${_fname}")
+  file(REMOVE "${_fname}")
 else()
   set(HAVE_CONCURRENCY 0)
 endif()
diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake
index d74e22012b..1ae6a5102f 100644
--- a/cmake/OpenCVFindLibsVideo.cmake
+++ b/cmake/OpenCVFindLibsVideo.cmake
@@ -2,6 +2,15 @@
 #  Detect 3rd-party video IO libraries
 # ----------------------------------------------------------------------------
 
+ocv_clear_vars(HAVE_VFW)
+if (WITH_VFW)
+  TRY_COMPILE(HAVE_VFW
+    "${OPENCV_BINARY_DIR}/CMakeFiles/CMakeTmp"
+    "${OpenCV_SOURCE_DIR}/cmake/checks/vfwtest.cpp"
+    CMAKE_FLAGS "-DLINK_LIBRARIES:STRING=vfw32"
+    OUTPUT_VARIABLE OUTPUT)
+ endif(WITH_VFW)
+
 # --- GStreamer ---
 ocv_clear_vars(HAVE_GSTREAMER)
 # try to find gstreamer 0.10 first
@@ -66,7 +75,7 @@ if(WITH_PVAPI)
       set(PVAPI_SDK_SUBDIR x86)
     elseif(X86_64)
       set(PVAPI_SDK_SUBDIR x64)
-    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES arm)
+    elseif(ARM)
       set(PVAPI_SDK_SUBDIR arm)
     endif()
 
@@ -140,7 +149,7 @@ endif(WITH_XIMEA)
 # --- FFMPEG ---
 ocv_clear_vars(HAVE_FFMPEG HAVE_FFMPEG_CODEC HAVE_FFMPEG_FORMAT HAVE_FFMPEG_UTIL HAVE_FFMPEG_SWSCALE HAVE_GENTOO_FFMPEG HAVE_FFMPEG_FFMPEG)
 if(WITH_FFMPEG)
-  if(WIN32)
+  if(WIN32 AND NOT ARM)
     include("${OpenCV_SOURCE_DIR}/3rdparty/ffmpeg/ffmpeg_version.cmake")
   elseif(UNIX)
     CHECK_MODULE(libavcodec HAVE_FFMPEG_CODEC)
@@ -204,11 +213,16 @@ if(WITH_FFMPEG)
   endif(APPLE)
 endif(WITH_FFMPEG)
 
-# --- VideoInput ---
-if(WITH_VIDEOINPUT)
+# --- VideoInput/DirectShow ---
+if(WITH_DSHOW)
   # always have VideoInput on Windows
-  set(HAVE_VIDEOINPUT 1)
-endif(WITH_VIDEOINPUT)
+  set(HAVE_DSHOW 1)
+endif(WITH_DSHOW)
+
+# --- VideoInput/Microsoft Media Foundation ---
+if(WITH_MSMF)
+  check_include_file(Mfapi.h HAVE_MSMF)
+endif(WITH_MSMF)
 
 # --- Extra HighGUI libs on Windows ---
 if(WIN32)
diff --git a/cmake/OpenCVLegacyOptions.cmake b/cmake/OpenCVLegacyOptions.cmake
index a34c9e5abc..e05ad4c48d 100644
--- a/cmake/OpenCVLegacyOptions.cmake
+++ b/cmake/OpenCVLegacyOptions.cmake
@@ -12,6 +12,7 @@ endmacro()
 ocv_legacy_option(BUILD_NEW_PYTHON_SUPPORT BUILD_opencv_python)
 ocv_legacy_option(BUILD_JAVA_SUPPORT       BUILD_opencv_java)
 ocv_legacy_option(WITH_ANDROID_CAMERA      BUILD_opencv_androidcamera)
+ocv_legacy_option(WITH_VIDEOINPUT          WITH_DSHOW)
 
 if(DEFINED OPENCV_BUILD_3RDPARTY_LIBS)
   set(BUILD_ZLIB   ${OPENCV_BUILD_3RDPARTY_LIBS} CACHE BOOL "Set via depricated OPENCV_BUILD_3RDPARTY_LIBS" FORCE)
diff --git a/cmake/OpenCVDetectCudaArch.cu b/cmake/checks/OpenCVDetectCudaArch.cu
similarity index 100%
rename from cmake/OpenCVDetectCudaArch.cu
rename to cmake/checks/OpenCVDetectCudaArch.cu
diff --git a/cmake/checks/vfwtest.cpp b/cmake/checks/vfwtest.cpp
new file mode 100644
index 0000000000..63d545788f
--- /dev/null
+++ b/cmake/checks/vfwtest.cpp
@@ -0,0 +1,10 @@
+
+#include <windows.h>
+#include <vfw.h>
+
+int main()
+{
+  AVIFileInit();
+  AVIFileExit();
+  return 0;
+}
\ No newline at end of file
diff --git a/cmake/checks/win32uitest.cpp b/cmake/checks/win32uitest.cpp
new file mode 100644
index 0000000000..f475e1c963
--- /dev/null
+++ b/cmake/checks/win32uitest.cpp
@@ -0,0 +1,11 @@
+#include <windows.h>
+
+int main(int argc, char** argv)
+{
+    CreateWindow(NULL /*lpClassName*/, NULL /*lpWindowName*/, 0 /*dwStyle*/, 0 /*x*/,
+                 0 /*y*/, 0 /*nWidth*/, 0 /*nHeight*/, NULL /*hWndParent*/, NULL /*hMenu*/,
+                NULL /*hInstance*/,  NULL /*lpParam*/);
+    DeleteDC(NULL);
+
+    return 0;
+}
diff --git a/cmake/templates/cvconfig.h.cmake b/cmake/templates/cvconfig.h.cmake
index 62c8b68b38..a419b0c3f2 100644
--- a/cmake/templates/cvconfig.h.cmake
+++ b/cmake/templates/cvconfig.h.cmake
@@ -13,6 +13,9 @@
    */
 #cmakedefine HAVE_ALLOCA_H 1
 
+/* Video for Windows support */
+#cmakedefine HAVE_VFW
+
 /* V4L capturing support */
 #cmakedefine HAVE_CAMV4L
 
@@ -55,6 +58,9 @@
 /* GTK+ 2.0 Thread support */
 #cmakedefine  HAVE_GTHREAD
 
+/* Win32 UI */
+#cmakedefine HAVE_WIN32UI
+
 /* GTK+ 2.x toolkit */
 #cmakedefine  HAVE_GTK
 
@@ -205,8 +211,11 @@
 /* AMD's Basic Linear Algebra Subprograms Library*/
 #cmakedefine HAVE_CLAMDBLAS
 
-/* VideoInput library */
-#cmakedefine HAVE_VIDEOINPUT
+/* DirectShow Video Capture library */
+#cmakedefine HAVE_DSHOW
+
+/* Microsoft Media Foundation Capture library */
+#cmakedefine HAVE_MSMF
 
 /* XIMEA camera support */
 #cmakedefine HAVE_XIMEA
diff --git a/doc/_themes/blue/layout.html b/doc/_themes/blue/layout.html
index 8bba49b17a..a376c97592 100644
--- a/doc/_themes/blue/layout.html
+++ b/doc/_themes/blue/layout.html
@@ -183,7 +183,7 @@
                   {% if theme_lang == 'c' %}
                   {% endif %}
                   {% if theme_lang == 'cpp' %}
-                    <li>Try the <a href="http://opencv.willowgarage.com/wiki/Welcome?action=AttachFile&do=get&target=opencv_cheatsheet.pdf">Cheatsheet</a>.</li>
+                    <li>Try the <a href="http://docs.opencv.org/trunk/opencv_cheatsheet.pdf">Cheatsheet</a>.</li>
                   {% endif %}
                   {% if theme_lang == 'py' %}
                     <li>Try the <a href="cookbook.html">Cookbook</a>.</li>
diff --git a/doc/tutorials/features2d/feature_description/feature_description.rst b/doc/tutorials/features2d/feature_description/feature_description.rst
index aa1a4a88be..2d97f83be8 100644
--- a/doc/tutorials/features2d/feature_description/feature_description.rst
+++ b/doc/tutorials/features2d/feature_description/feature_description.rst
@@ -13,7 +13,7 @@ In this tutorial you will learn how to:
    * Use the :descriptor_extractor:`DescriptorExtractor<>` interface in order to find the feature vector correspondent to the keypoints. Specifically:
 
      * Use :surf_descriptor_extractor:`SurfDescriptorExtractor<>` and its function :descriptor_extractor:`compute<>` to perform the required calculations.
-     * Use a :brute_force_matcher:`BruteForceMatcher<>`	to match the features vector
+     * Use a :brute_force_matcher:`BFMatcher<>`	to match the features vector
      * Use the function :draw_matches:`drawMatches<>` to draw the detected matches.
 
 
@@ -69,7 +69,7 @@ This tutorial code's is shown lines below. You can also download it from `here <
      extractor.compute( img_2, keypoints_2, descriptors_2 );
 
      //-- Step 3: Matching descriptor vectors with a brute force matcher
-     BruteForceMatcher< L2<float> > matcher;
+     BFMatcher matcher(NORM_L2);
      std::vector< DMatch > matches;
      matcher.match( descriptors_1, descriptors_2, matches );
 
diff --git a/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst b/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
index b37126f308..b6c859dc34 100644
--- a/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
+++ b/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
@@ -48,10 +48,10 @@ The structure of package contents looks as follows:
 
 ::
 
-    OpenCV-2.4.4-android-sdk
+    OpenCV-2.4.5-android-sdk
     |_ apk
-    |   |_ OpenCV_2.4.4_binary_pack_armv7a.apk
-    |   |_ OpenCV_2.4.4_Manager_2.6_XXX.apk
+    |   |_ OpenCV_2.4.5_binary_pack_armv7a.apk
+    |   |_ OpenCV_2.4.5_Manager_2.7_XXX.apk
     |
     |_ doc
     |_ samples
@@ -157,10 +157,10 @@ Get the OpenCV4Android SDK
 
    .. code-block:: bash
 
-      unzip ~/Downloads/OpenCV-2.4.4-android-sdk.zip
+      unzip ~/Downloads/OpenCV-2.4.5-android-sdk.zip
 
-.. |opencv_android_bin_pack| replace:: :file:`OpenCV-2.4.4-android-sdk.zip`
-.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.4/OpenCV-2.4.4-android-sdk.zip/download
+.. |opencv_android_bin_pack| replace:: :file:`OpenCV-2.4.5-android-sdk.zip`
+.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.5/OpenCV-2.4.5-android-sdk.zip/download
 .. |opencv_android_bin_pack_url| replace:: |opencv_android_bin_pack|
 .. |seven_zip| replace:: 7-Zip
 .. _seven_zip: http://www.7-zip.org/
@@ -295,7 +295,7 @@ Well, running samples from Eclipse is very simple:
   .. code-block:: sh
     :linenos:
 
-    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.4_Manager_2.6_armv7a-neon.apk
+    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.5_Manager_2.7_armv7a-neon.apk
 
   .. note:: ``armeabi``, ``armv7a-neon``, ``arm7a-neon-android8``, ``mips`` and ``x86`` stand for
             platform targets:
diff --git a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
index 41f5166292..231fe5afa1 100644
--- a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
+++ b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
@@ -55,14 +55,14 @@ Manager to access OpenCV libraries externally installed in the target system.
    :guilabel:`File -> Import -> Existing project in your workspace`.
 
    Press :guilabel:`Browse`  button and locate OpenCV4Android SDK
-   (:file:`OpenCV-2.4.4-android-sdk/sdk`).
+   (:file:`OpenCV-2.4.5-android-sdk/sdk`).
 
    .. image:: images/eclipse_opencv_dependency0.png
         :alt: Add dependency from OpenCV library
         :align: center
 
 #. In application project add a reference to the OpenCV Java SDK in
-   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.4``.
+   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.5``.
 
    .. image:: images/eclipse_opencv_dependency1.png
         :alt: Add dependency from OpenCV library
@@ -101,7 +101,7 @@ See the "15-puzzle" OpenCV sample for details.
         public void onResume()
         {
             super.onResume();
-            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_2_4_3, this, mLoaderCallback);
+            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_2_4_5, this, mLoaderCallback);
         }
 
         ...
@@ -128,27 +128,27 @@ described above.
 #. Add the OpenCV library project to your workspace the same way as for the async initialization
    above. Use menu :guilabel:`File -> Import -> Existing project in your workspace`,
    press :guilabel:`Browse` button and select OpenCV SDK path
-   (:file:`OpenCV-2.4.4-android-sdk/sdk`).
+   (:file:`OpenCV-2.4.5-android-sdk/sdk`).
 
    .. image:: images/eclipse_opencv_dependency0.png
         :alt: Add dependency from OpenCV library
         :align: center
 
 #. In the application project add a reference to the OpenCV4Android SDK in
-   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.4``;
+   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.5``;
 
    .. image:: images/eclipse_opencv_dependency1.png
        :alt: Add dependency from OpenCV library
        :align: center
 
 #. If your application project **doesn't have a JNI part**, just copy the corresponding OpenCV
-   native libs from :file:`<OpenCV-2.4.4-android-sdk>/sdk/native/libs/<target_arch>` to your
+   native libs from :file:`<OpenCV-2.4.5-android-sdk>/sdk/native/libs/<target_arch>` to your
    project directory to folder :file:`libs/<target_arch>`.
 
    In case of the application project **with a JNI part**, instead of manual libraries copying you
    need to modify your ``Android.mk`` file:
    add the following two code lines after the ``"include $(CLEAR_VARS)"`` and before
-   ``"include path_to_OpenCV-2.4.4-android-sdk/sdk/native/jni/OpenCV.mk"``
+   ``"include path_to_OpenCV-2.4.5-android-sdk/sdk/native/jni/OpenCV.mk"``
 
    .. code-block:: make
       :linenos:
@@ -221,7 +221,7 @@ taken:
 
    .. code-block:: make
 
-      include C:\Work\OpenCV4Android\OpenCV-2.4.4-android-sdk\sdk\native\jni\OpenCV.mk
+      include C:\Work\OpenCV4Android\OpenCV-2.4.5-android-sdk\sdk\native\jni\OpenCV.mk
 
    Should be inserted into the :file:`jni/Android.mk` file **after** this line:
 
diff --git a/doc/tutorials/introduction/linux_eclipse/linux_eclipse.rst b/doc/tutorials/introduction/linux_eclipse/linux_eclipse.rst
index c1a6fac000..dc684451e7 100644
--- a/doc/tutorials/introduction/linux_eclipse/linux_eclipse.rst
+++ b/doc/tutorials/introduction/linux_eclipse/linux_eclipse.rst
@@ -201,8 +201,6 @@ Assuming that the image to use as the argument would be located in <DisplayImage
 V2: Using CMake+OpenCV with Eclipse (plugin CDT)
 ==================================================
 
-(See the `getting started <http://opencv.willowgarage.com/wiki/Getting_started>` section of the OpenCV Wiki)
-
 Say you have or create a new file, *helloworld.cpp* in a directory called *foo*:
 
 .. code-block:: cpp
diff --git a/modules/calib3d/test/test_solvepnp_ransac.cpp b/modules/calib3d/test/test_solvepnp_ransac.cpp
index 5cc39a0ea1..7e93369676 100644
--- a/modules/calib3d/test/test_solvepnp_ransac.cpp
+++ b/modules/calib3d/test/test_solvepnp_ransac.cpp
@@ -239,7 +239,7 @@ protected:
     }
 };
 
-TEST(Calib3d_SolvePnPRansac, accuracy) { CV_solvePnPRansac_Test test; test.safe_run(); }
+TEST(DISABLED_Calib3d_SolvePnPRansac, accuracy) { CV_solvePnPRansac_Test test; test.safe_run(); }
 TEST(Calib3d_SolvePnP, accuracy) { CV_solvePnP_Test test; test.safe_run(); }
 
 
diff --git a/modules/calib3d/test/test_stereomatching.cpp b/modules/calib3d/test/test_stereomatching.cpp
index a5817e95d5..e5f146d21d 100644
--- a/modules/calib3d/test/test_stereomatching.cpp
+++ b/modules/calib3d/test/test_stereomatching.cpp
@@ -460,14 +460,29 @@ void CV_StereoMatchingTest::run(int)
             continue;
         }
         int dispScaleFactor = datasetsParams[datasetName].dispScaleFactor;
-        Mat tmp; trueLeftDisp.convertTo( tmp, CV_32FC1, 1.f/dispScaleFactor ); trueLeftDisp = tmp; tmp.release();
+        Mat tmp;
+
+        trueLeftDisp.convertTo( tmp, CV_32FC1, 1.f/dispScaleFactor );
+        trueLeftDisp = tmp;
+        tmp.release();
+
         if( !trueRightDisp.empty() )
-            trueRightDisp.convertTo( tmp, CV_32FC1, 1.f/dispScaleFactor ); trueRightDisp = tmp; tmp.release();
+        {
+            trueRightDisp.convertTo( tmp, CV_32FC1, 1.f/dispScaleFactor );
+            trueRightDisp = tmp;
+            tmp.release();
+        }
 
         Mat leftDisp, rightDisp;
         int ignBorder = max(runStereoMatchingAlgorithm(leftImg, rightImg, leftDisp, rightDisp, ci), EVAL_IGNORE_BORDER);
-        leftDisp.convertTo( tmp, CV_32FC1 ); leftDisp = tmp; tmp.release();
-        rightDisp.convertTo( tmp, CV_32FC1 ); rightDisp = tmp; tmp.release();
+
+        leftDisp.convertTo( tmp, CV_32FC1 );
+        leftDisp = tmp;
+        tmp.release();
+
+        rightDisp.convertTo( tmp, CV_32FC1 );
+        rightDisp = tmp;
+        tmp.release();
 
         int tempCode = processStereoMatchingResults( resFS, ci, isWrite,
                    leftImg, rightImg, trueLeftDisp, trueRightDisp, leftDisp, rightDisp, QualityEvalParams(ignBorder));
@@ -531,7 +546,8 @@ int CV_StereoMatchingTest::processStereoMatchingResults( FileStorage& fs, int ca
     // rightDisp is not used in current test virsion
     int code = cvtest::TS::OK;
     assert( fs.isOpened() );
-    assert( trueLeftDisp.type() == CV_32FC1 && trueRightDisp.type() == CV_32FC1 );
+    assert( trueLeftDisp.type() == CV_32FC1 );
+    assert( trueRightDisp.empty() || trueRightDisp.type() == CV_32FC1 );
     assert( leftDisp.type() == CV_32FC1 && rightDisp.type() == CV_32FC1 );
 
     // get masks for unknown ground truth disparity values
diff --git a/modules/contrib/doc/facerec/facerec_tutorial.rst b/modules/contrib/doc/facerec/facerec_tutorial.rst
index 170da8ff24..16b425d7ee 100644
--- a/modules/contrib/doc/facerec/facerec_tutorial.rst
+++ b/modules/contrib/doc/facerec/facerec_tutorial.rst
@@ -7,7 +7,7 @@ Face Recognition with OpenCV
 Introduction
 ============
 
-`OpenCV (Open Source Computer Vision) <http://opencv.willowgarage.com>`_ is a popular computer vision library started by `Intel <http://www.intel.com>`_ in 1999. The cross-platform library sets its focus on real-time image processing and includes patent-free implementations of the latest computer vision algorithms. In 2008 `Willow Garage <http://www.willowgarage.com>`_ took over support and OpenCV 2.3.1 now comes with a programming interface to C, C++, `Python <http://www.python.org>`_ and `Android <http://www.android.com>`_. OpenCV is released under a BSD license so it is used in academic projects and commercial products alike.
+`OpenCV (Open Source Computer Vision) <http://opencv.org>`_ is a popular computer vision library started by `Intel <http://www.intel.com>`_ in 1999. The cross-platform library sets its focus on real-time image processing and includes patent-free implementations of the latest computer vision algorithms. In 2008 `Willow Garage <http://www.willowgarage.com>`_ took over support and OpenCV 2.3.1 now comes with a programming interface to C, C++, `Python <http://www.python.org>`_ and `Android <http://www.android.com>`_. OpenCV is released under a BSD license so it is used in academic projects and commercial products alike.
 
 OpenCV 2.4 now comes with the very new :ocv:class:`FaceRecognizer` class for face recognition, so you can start experimenting with face recognition right away. This document is the guide I've wished for, when I was working myself into face recognition. It shows you how to perform face recognition with :ocv:class:`FaceRecognizer` in OpenCV (with full source code listings) and gives you an introduction into the algorithms behind. I'll also show how to create the visualizations you can find in many publications, because a lot of people asked for.
 
diff --git a/modules/contrib/doc/facerec/src/CMakeLists.txt b/modules/contrib/doc/facerec/src/CMakeLists.txt
index 10720048c0..e56762ea49 100644
--- a/modules/contrib/doc/facerec/src/CMakeLists.txt
+++ b/modules/contrib/doc/facerec/src/CMakeLists.txt
@@ -6,7 +6,7 @@ project(facerec_cpp_samples)
 #SET(OpenCV_DIR /path/to/your/opencv/installation)
 
 # packages
-find_package(OpenCV REQUIRED) # http://opencv.willowgarage.com
+find_package(OpenCV REQUIRED) # http://opencv.org
 
 # probably you should loop through the sample files here
 add_executable(facerec_demo facerec_demo.cpp)
diff --git a/modules/contrib/include/opencv2/contrib/contrib.hpp b/modules/contrib/include/opencv2/contrib/contrib.hpp
index 7a43631fce..b9edf94f59 100644
--- a/modules/contrib/include/opencv2/contrib/contrib.hpp
+++ b/modules/contrib/include/opencv2/contrib/contrib.hpp
@@ -45,4 +45,4 @@
 #error this is a compatibility header which should not be used inside the OpenCV library
 #endif
 
-#include "opencv2/contrib.hpp"
\ No newline at end of file
+#include "opencv2/contrib.hpp"
diff --git a/modules/contrib/src/ba.cpp b/modules/contrib/src/ba.cpp
index bf37dbfe03..fb361d49c2 100644
--- a/modules/contrib/src/ba.cpp
+++ b/modules/contrib/src/ba.cpp
@@ -1106,7 +1106,7 @@ void LevMarqSparse::bundleAdjust( std::vector<Point3d>& points, //positions of p
     Mat rot_vec = levmarP.rowRange(i*num_cam_param, i*num_cam_param+3);
     Rodrigues( rot_vec, R[i] );
     //translation
-    T[i] = levmarP.rowRange(i*num_cam_param + 3, i*num_cam_param+6);
+    levmarP.rowRange(i*num_cam_param + 3, i*num_cam_param+6).copyTo(T[i]);
 
     //intrinsic camera matrix
     double* intr_data = (double*)cameraMatrix[i].data;
diff --git a/modules/contrib/src/fuzzymeanshifttracker.cpp b/modules/contrib/src/fuzzymeanshifttracker.cpp
index 443b961ed1..c83f915b03 100644
--- a/modules/contrib/src/fuzzymeanshifttracker.cpp
+++ b/modules/contrib/src/fuzzymeanshifttracker.cpp
@@ -380,6 +380,7 @@ void CvFuzzyMeanShiftTracker::SearchWindow::initDepthValues(IplImage *maskImage,
                 {
                     if (*depthData)
                     {
+                        d = *depthData;
                         m1 += d;
                         if (d < mind)
                             mind = d;
diff --git a/modules/core/doc/intro.rst b/modules/core/doc/intro.rst
index 806f434b26..5a170cb13f 100644
--- a/modules/core/doc/intro.rst
+++ b/modules/core/doc/intro.rst
@@ -4,7 +4,7 @@ Introduction
 
 .. highlight:: cpp
 
-OpenCV (Open Source Computer Vision Library: http://opencv.willowgarage.com/wiki/) is an open-source BSD-licensed library that includes several hundreds of computer vision algorithms. The document describes the so-called OpenCV 2.x API, which is essentially a C++ API, as opposite to the C-based OpenCV 1.x API. The latter is described in opencv1x.pdf.
+OpenCV (Open Source Computer Vision Library: http://opencv.org) is an open-source BSD-licensed library that includes several hundreds of computer vision algorithms. The document describes the so-called OpenCV 2.x API, which is essentially a C++ API, as opposite to the C-based OpenCV 1.x API. The latter is described in opencv1x.pdf.
 
 OpenCV has a modular structure, which means that the package includes several shared or static libraries. The following modules are available:
 
diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp
index 5ceb8ff96c..438918359b 100644
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@@ -45,4 +45,4 @@
 #error this is a compatibility header which should not be used inside the OpenCV library
 #endif
 
-#include "opencv2/core.hpp"
\ No newline at end of file
+#include "opencv2/core.hpp"
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 7ce9bb8e1f..ce33c15788 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -360,6 +360,8 @@ CV_INLINE int cvRound( double value )
         fistp t;
     }
     return t;
+#elif defined _MSC_VER && defined _M_ARM && defined HAVE_TEGRA_OPTIMIZATION
+    TEGRA_ROUND(value);
 #elif defined HAVE_LRINT || defined CV_ICC || defined __GNUC__
 #  ifdef HAVE_TEGRA_OPTIMIZATION
     TEGRA_ROUND(value);
@@ -367,8 +369,12 @@ CV_INLINE int cvRound( double value )
     return (int)lrint(value);
 #  endif
 #else
-    // while this is not IEEE754-compliant rounding, it's usually a good enough approximation
-    return (int)(value + (value >= 0 ? 0.5 : -0.5));
+    double intpart, fractpart;
+    fractpart = modf(value, &intpart);
+    if ((fabs(fractpart) != 0.5) || ((((int)intpart) % 2) != 0))
+        return (int)(value + (value >= 0 ? 0.5 : -0.5));
+    else
+        return (int)intpart;
 #endif
 }
 
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 3378e9dd14..8a54e7f8e4 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -1704,6 +1704,7 @@ public:
     SparseMatConstIterator_();
     //! the full constructor setting the iterator to the first sparse matrix element
     SparseMatConstIterator_(const SparseMat_<_Tp>* _m);
+    SparseMatConstIterator_(const SparseMat* _m);
     //! the copy constructor
     SparseMatConstIterator_(const SparseMatConstIterator_& it);
 
@@ -1740,6 +1741,7 @@ public:
     SparseMatIterator_();
     //! the full constructor setting the iterator to the first sparse matrix element
     SparseMatIterator_(SparseMat_<_Tp>* _m);
+    SparseMatIterator_(SparseMat* _m);
     //! the copy constructor
     SparseMatIterator_(const SparseMatIterator_& it);
 
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index a5ecea8dce..026ab695d5 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -2587,6 +2587,13 @@ SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMat_<_Tp>* _m)
     : SparseMatConstIterator(_m)
 {}
 
+template<typename _Tp> inline
+SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMat* _m)
+    : SparseMatConstIterator(_m)
+{
+    CV_Assert( _m->type() == DataType<_Tp>::type );
+}
+
 template<typename _Tp> inline
 SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMatConstIterator_<_Tp>& it)
     : SparseMatConstIterator(it)
@@ -2634,6 +2641,11 @@ SparseMatIterator_<_Tp>::SparseMatIterator_(SparseMat_<_Tp>* _m)
     : SparseMatConstIterator_<_Tp>(_m)
 {}
 
+template<typename _Tp> inline
+SparseMatIterator_<_Tp>::SparseMatIterator_(SparseMat* _m)
+    : SparseMatConstIterator_<_Tp>(_m)
+{}
+
 template<typename _Tp> inline
 SparseMatIterator_<_Tp>::SparseMatIterator_(const SparseMatIterator_<_Tp>& it)
     : SparseMatConstIterator_<_Tp>(it)
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index 687b75e4b8..35b08582c4 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -577,10 +577,10 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,
                     continue;
 
                 p *= 2;
-                double beta = a - b, gamma = hypot((double)p, beta), delta;
+                double beta = a - b, gamma = hypot((double)p, beta);
                 if( beta < 0 )
                 {
-                    delta = (gamma - beta)*0.5;
+                    double delta = (gamma - beta)*0.5;
                     s = (_Tp)std::sqrt(delta/gamma);
                     c = (_Tp)(p/(gamma*s*2));
                 }
@@ -588,36 +588,18 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,
                 {
                     c = (_Tp)std::sqrt((gamma + beta)/(gamma*2));
                     s = (_Tp)(p/(gamma*c*2));
-                    delta = p*p*0.5/(gamma + beta);
                 }
 
-                W[i] += delta;
-                W[j] -= delta;
-
-                if( iter % 2 != 0 && W[i] > 0 && W[j] > 0 )
-                {
-                    k = vblas.givens(Ai, Aj, m, c, s);
-
-                    for( ; k < m; k++ )
-                    {
-                        _Tp t0 = c*Ai[k] + s*Aj[k];
-                        _Tp t1 = -s*Ai[k] + c*Aj[k];
-                        Ai[k] = t0; Aj[k] = t1;
-                    }
-                }
-                else
+                a = b = 0;
+                for( k = 0; k < m; k++ )
                 {
-                    a = b = 0;
-                    for( k = 0; k < m; k++ )
-                    {
-                        _Tp t0 = c*Ai[k] + s*Aj[k];
-                        _Tp t1 = -s*Ai[k] + c*Aj[k];
-                        Ai[k] = t0; Aj[k] = t1;
+                    _Tp t0 = c*Ai[k] + s*Aj[k];
+                    _Tp t1 = -s*Ai[k] + c*Aj[k];
+                    Ai[k] = t0; Aj[k] = t1;
 
-                        a += (double)t0*t0; b += (double)t1*t1;
-                    }
-                    W[i] = a; W[j] = b;
+                    a += (double)t0*t0; b += (double)t1*t1;
                 }
+                W[i] = a; W[j] = b;
 
                 changed = true;
 
diff --git a/modules/core/src/matop.cpp b/modules/core/src/matop.cpp
index f089702695..016435650a 100644
--- a/modules/core/src/matop.cpp
+++ b/modules/core/src/matop.cpp
@@ -324,7 +324,7 @@ void MatOp::augAssignXor(const MatExpr& expr, Mat& m) const
 {
     Mat temp;
     expr.op->assign(expr, temp);
-    m /= temp;
+    m ^= temp;
 }
 
 
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index d7567b2d39..c48fbc6a66 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -183,7 +183,7 @@ static void finalizeHdr(Mat& m)
 void Mat::create(int d, const int* _sizes, int _type)
 {
     int i;
-    CV_Assert(0 <= d && _sizes && d <= CV_MAX_DIM && _sizes);
+    CV_Assert(0 <= d && d <= CV_MAX_DIM && _sizes);
     _type = CV_MAT_TYPE(_type);
 
     if( data && (d == dims || (d == 1 && dims <= 2)) && _type == type() )
diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp
index 9ea057d89a..721fd0e6bd 100644
--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -1551,3 +1551,16 @@ TEST(Core_Add, AddToColumnWhen4Rows)
 
     ASSERT_EQ(0, countNonZero(m1 - m2));
 }
+
+TEST(Core_round, CvRound)
+{
+    ASSERT_EQ(2, cvRound(2.0));
+    ASSERT_EQ(2, cvRound(2.1));
+    ASSERT_EQ(-2, cvRound(-2.1));
+    ASSERT_EQ(3, cvRound(2.8));
+    ASSERT_EQ(-3, cvRound(-2.8));
+    ASSERT_EQ(2, cvRound(2.5));
+    ASSERT_EQ(4, cvRound(3.5));
+    ASSERT_EQ(-2, cvRound(-2.5));
+    ASSERT_EQ(-4, cvRound(-3.5));
+}
diff --git a/modules/features2d/doc/feature_detection_and_description.rst b/modules/features2d/doc/feature_detection_and_description.rst
index 659fa4c3b2..f265ab3c4f 100644
--- a/modules/features2d/doc/feature_detection_and_description.rst
+++ b/modules/features2d/doc/feature_detection_and_description.rst
@@ -48,7 +48,7 @@ Maximally stable extremal region extractor. ::
     };
 
 The class encapsulates all the parameters of the MSER extraction algorithm (see
-http://en.wikipedia.org/wiki/Maximally_stable_extremal_regions). Also see http://opencv.willowgarage.com/wiki/documentation/cpp/features2d/MSER for useful comments and parameters description.
+http://en.wikipedia.org/wiki/Maximally_stable_extremal_regions). Also see http://code.opencv.org/projects/opencv/wiki/MSER for useful comments and parameters description.
 
 
 ORB
diff --git a/modules/features2d/src/keypoint.cpp b/modules/features2d/src/keypoint.cpp
index 118d3df5ea..0cf7ae0571 100644
--- a/modules/features2d/src/keypoint.cpp
+++ b/modules/features2d/src/keypoint.cpp
@@ -69,7 +69,7 @@ struct KeypointResponseGreater
 void KeyPointsFilter::retainBest(std::vector<KeyPoint>& keypoints, int n_points)
 {
     //this is only necessary if the keypoints size is greater than the number of desired points.
-    if( n_points > 0 && keypoints.size() > (size_t)n_points )
+    if( n_points >= 0 && keypoints.size() > (size_t)n_points )
     {
         if (n_points==0)
         {
diff --git a/modules/flann/include/opencv2/flann/dist.h b/modules/flann/include/opencv2/flann/dist.h
index bfd37a9e23..e001da7cde 100644
--- a/modules/flann/include/opencv2/flann/dist.h
+++ b/modules/flann/include/opencv2/flann/dist.h
@@ -421,7 +421,6 @@ struct Hamming
     ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
     {
         ResultType result = 0;
-#ifdef __GNUC__
 #ifdef __ARM_NEON__
         {
             uint32x4_t bits = vmovq_n_u32(0);
@@ -438,7 +437,7 @@ struct Hamming
             result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
             result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
         }
-#else
+#elif __GNUC__
         {
             //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
             typedef unsigned long long pop_t;
@@ -458,8 +457,8 @@ struct Hamming
                 result += __builtin_popcountll(a_final ^ b_final);
             }
         }
-#endif //NEON
-#else
+#else // NO NEON and NOT GNUC
+        typedef unsigned long long pop_t;
         HammingLUT lut;
         result = lut(reinterpret_cast<const unsigned char*> (a),
                      reinterpret_cast<const unsigned char*> (b), size * sizeof(pop_t));
diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index 3d613cd4dd..3472b2fa09 100644
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -29,8 +29,6 @@ if(HAVE_CUDA)
   source_group("Src\\NVidia" FILES ${ncv_files})
   ocv_include_directories("src/nvidia" "src/nvidia/core" "src/nvidia/NPP_staging" ${CUDA_INCLUDE_DIRS})
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter /wd4211 /wd4201 /wd4100 /wd4505 /wd4408)
-  string(REPLACE "-Wsign-promo" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-  #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
 
   if(MSVC)
     if(NOT ENABLE_NOISY_WARNINGS)
diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp
index 07014849ad..dd39aa822a 100644
--- a/modules/gpu/perf/perf_video.cpp
+++ b/modules/gpu/perf/perf_video.cpp
@@ -1007,7 +1007,7 @@ PERF_TEST_P(Video_Cn_MaxFeatures, Video_GMG,
 
 #if defined(HAVE_NVCUVID) && BUILD_WITH_VIDEO_INPUT_SUPPORT
 
-PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
+PERF_TEST_P(Video, DISABLED_Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
 {
     declare.time(20);
 
@@ -1044,7 +1044,7 @@ PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video
 
 #if defined(HAVE_NVCUVID) && defined(WIN32)
 
-PERF_TEST_P(Video, Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
+PERF_TEST_P(Video, DISABLED_Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
 {
     declare.time(30);
 
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index afce5bbc7f..a9b003937a 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -1793,10 +1793,10 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
 
 namespace arithm
 {
-    void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
-    void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
-    void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
-    void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
+    void cmpMatEq_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    void cmpMatNe_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    void cmpMatLt_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    void cmpMatLe_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
 
     template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1820,7 +1820,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         {cmpMatEq<double>        , cmpMatNe<double>        , cmpMatLt<double>        , cmpMatLe<double>        }
     };
 
-    typedef void (*func_v4_t)(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
+    typedef void (*func_v4_t)(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
     static const func_v4_t funcs_v4[] =
     {
         cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
diff --git a/modules/gpu/src/tvl1flow.cpp b/modules/gpu/src/tvl1flow.cpp
index 4fe67c097a..49dd8caf43 100644
--- a/modules/gpu/src/tvl1flow.cpp
+++ b/modules/gpu/src/tvl1flow.cpp
@@ -129,6 +129,17 @@ void cv::gpu::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat& I0, const GpuM
             gpu::multiply(u1s[s], Scalar::all(0.5), u1s[s]);
             gpu::multiply(u2s[s], Scalar::all(0.5), u2s[s]);
         }
+        else
+        {
+            u1s[s].create(I0s[s].size(), CV_32FC1);
+            u2s[s].create(I0s[s].size(), CV_32FC1);
+        }
+    }
+
+    if (!useInitialFlow)
+    {
+        u1s[nscales-1].setTo(Scalar::all(0));
+        u2s[nscales-1].setTo(Scalar::all(0));
     }
 
     // pyramidal structure for computing the optical flow
@@ -173,18 +184,9 @@ void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat& I0, const Gpu
 
     CV_DbgAssert( I1.size() == I0.size() );
     CV_DbgAssert( I1.type() == I0.type() );
-    CV_DbgAssert( u1.empty() || u1.size() == I0.size() );
+    CV_DbgAssert( u1.size() == I0.size() );
     CV_DbgAssert( u2.size() == u1.size() );
 
-    if (u1.empty())
-    {
-        u1.create(I0.size(), CV_32FC1);
-        u1.setTo(Scalar::all(0));
-
-        u2.create(I0.size(), CV_32FC1);
-        u2.setTo(Scalar::all(0));
-    }
-
     GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
     GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
     centeredGradient(I1, I1x, I1y);
diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt
index 8d3056742a..166a834692 100644
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -95,7 +95,7 @@ if(HAVE_QT)
   if(${_have_flag})
     set_source_files_properties(${_RCC_OUTFILES} PROPERTIES COMPILE_FLAGS -Wno-missing-declarations)
   endif()
-elseif(WIN32)
+elseif(HAVE_WIN32UI)
   list(APPEND highgui_srcs src/window_w32.cpp)
 elseif(HAVE_GTK)
   list(APPEND highgui_srcs src/window_gtk.cpp)
@@ -111,9 +111,21 @@ elseif(APPLE)
   endif()
 endif()
 
-if(WIN32)
-  list(APPEND highgui_srcs src/cap_vfw.cpp src/cap_cmu.cpp src/cap_dshow.cpp)
-endif(WIN32)
+if(WIN32 AND NOT ARM)
+  list(APPEND highgui_srcs src/cap_cmu.cpp)
+endif()
+
+if (WIN32 AND HAVE_DSHOW)
+  list(APPEND highgui_srcs src/cap_dshow.cpp)
+endif()
+
+if (WIN32 AND HAVE_MSMF)
+  list(APPEND highgui_srcs src/cap_msmf.cpp)
+endif()
+
+if (WIN32 AND HAVE_VFW)
+  list(APPEND highgui_srcs src/cap_vfw.cpp)
+endif()
 
 if(HAVE_XINE)
   list(APPEND highgui_srcs src/cap_xine.cpp)
diff --git a/modules/highgui/include/opencv2/highgui/highgui_c.h b/modules/highgui/include/opencv2/highgui/highgui_c.h
index 116e8c1f45..c29a1dca70 100644
--- a/modules/highgui/include/opencv2/highgui/highgui_c.h
+++ b/modules/highgui/include/opencv2/highgui/highgui_c.h
@@ -298,6 +298,7 @@ enum
     CV_CAP_UNICAP   =600,   // Unicap drivers
 
     CV_CAP_DSHOW    =700,   // DirectShow (via videoInput)
+    CV_CAP_MSMF     =1400,  // Microsoft Media Foundation (via videoInput)
 
     CV_CAP_PVAPI    =800,   // PvAPI, Prosilica GigE SDK
 
diff --git a/modules/highgui/perf/perf_precomp.hpp b/modules/highgui/perf/perf_precomp.hpp
index 30e9b7ff6d..16880d1e42 100644
--- a/modules/highgui/perf/perf_precomp.hpp
+++ b/modules/highgui/perf/perf_precomp.hpp
@@ -20,9 +20,9 @@
     defined(HAVE_GSTREAMER)    || \
     defined(HAVE_QUICKTIME)    || \
     defined(HAVE_AVFOUNDATION) || \
-    /*defined(HAVE_OPENNI)     || too specialized */ \
     defined(HAVE_FFMPEG)       || \
-    defined(WIN32) /* assume that we have ffmpeg */
+    defined(HAVE_VFW)
+    /*defined(HAVE_OPENNI) too specialized */ \
 
 #  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
 #else
@@ -34,7 +34,7 @@
     defined(HAVE_QUICKTIME)    || \
     defined(HAVE_AVFOUNDATION) || \
     defined(HAVE_FFMPEG)       || \
-    defined(WIN32) /* assume that we have ffmpeg */
+    defined(HAVE_VFW)
 #  define BUILD_WITH_VIDEO_OUTPUT_SUPPORT 1
 #else
 #  define BUILD_WITH_VIDEO_OUTPUT_SUPPORT 0
diff --git a/modules/highgui/src/cap.cpp b/modules/highgui/src/cap.cpp
index d1325a3b24..0e33a81a81 100644
--- a/modules/highgui/src/cap.cpp
+++ b/modules/highgui/src/cap.cpp
@@ -114,7 +114,7 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
 {
     int  domains[] =
     {
-#ifdef HAVE_VIDEOINPUT
+#ifdef HAVE_DSHOW
         CV_CAP_DSHOW,
 #endif
 #if 1
@@ -168,7 +168,8 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
     // try every possibly installed camera API
     for (int i = 0; domains[i] >= 0; i++)
     {
-#if defined(HAVE_VIDEOINPUT)   || \
+#if defined(HAVE_DSHOW)        || \
+    defined(HAVE_MSMF)         || \
     defined(HAVE_TYZX)         || \
     defined(HAVE_VFW)          || \
     defined(HAVE_LIBV4L)       || \
@@ -195,11 +196,18 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
 
         switch (domains[i])
         {
-#ifdef HAVE_VIDEOINPUT
+#ifdef HAVE_MSMF
+        case CV_CAP_MSMF:
+             capture = cvCreateCameraCapture_MSMF (index);
+             if (capture)
+                 return capture;
+            break;
+#endif
+#ifdef HAVE_DSHOW
         case CV_CAP_DSHOW:
-            capture = cvCreateCameraCapture_DShow (index);
-            if (capture)
-                return capture;
+             capture = cvCreateCameraCapture_DShow (index);
+             if (capture)
+                 return capture;
             break;
 #endif
 
diff --git a/modules/highgui/src/cap_dshow.cpp b/modules/highgui/src/cap_dshow.cpp
index 2cb0158216..42741e83b4 100644
--- a/modules/highgui/src/cap_dshow.cpp
+++ b/modules/highgui/src/cap_dshow.cpp
@@ -41,7 +41,7 @@
 
 #include "precomp.hpp"
 
-#if (defined WIN32 || defined _WIN32) && defined HAVE_VIDEOINPUT
+#if (defined WIN32 || defined _WIN32) && defined HAVE_DSHOW
 
 /*
    DirectShow-based Video Capturing module is based on
@@ -3098,6 +3098,7 @@ HRESULT videoInput::routeCrossbar(ICaptureGraphBuilder2 **ppBuild, IBaseFilter *
     return hr;
 }
 
+
 /********************* Capturing video from camera via DirectShow *********************/
 
 class CvCaptureCAM_DShow : public CvCapture
diff --git a/modules/highgui/src/cap_ffmpeg.cpp b/modules/highgui/src/cap_ffmpeg.cpp
index 6029fa2e0e..11bb80c8f1 100644
--- a/modules/highgui/src/cap_ffmpeg.cpp
+++ b/modules/highgui/src/cap_ffmpeg.cpp
@@ -209,7 +209,7 @@ CvCapture* cvCreateFileCapture_FFMPEG_proxy(const char * filename)
     if( result->open( filename ))
         return result;
     delete result;
-#if defined WIN32 || defined _WIN32
+#ifdef HAVE_VFW
     return cvCreateFileCapture_VFW(filename);
 #else
     return 0;
@@ -263,9 +263,9 @@ CvVideoWriter* cvCreateVideoWriter_FFMPEG_proxy( const char* filename, int fourc
     if( result->open( filename, fourcc, fps, frameSize, isColor != 0 ))
         return result;
     delete result;
-#if defined WIN32 || defined _WIN32
-    return cvCreateVideoWriter_VFW(filename, fourcc, fps, frameSize, isColor);
-#else
+#ifdef HAVE_VFW
+     return cvCreateVideoWriter_VFW(filename, fourcc, fps, frameSize, isColor);
+ #else
     return 0;
 #endif
 }
diff --git a/modules/highgui/src/cap_ffmpeg_impl.hpp b/modules/highgui/src/cap_ffmpeg_impl.hpp
index dd9c47c544..f5d6b48065 100644
--- a/modules/highgui/src/cap_ffmpeg_impl.hpp
+++ b/modules/highgui/src/cap_ffmpeg_impl.hpp
@@ -153,6 +153,14 @@ extern "C" {
 #define AVERROR_EOF (-MKTAG( 'E','O','F',' '))
 #endif
 
+#if LIBAVCODEC_BUILD >= CALC_FFMPEG_VERSION(54,25,0)
+#  define CV_CODEC_ID AVCodecID
+#  define CV_CODEC(name) AV_##name
+#else
+#  define CV_CODEC_ID CodecID
+#  define CV_CODEC(name) name
+#endif
+
 static int get_number_of_cpus(void)
 {
 #if LIBAVFORMAT_BUILD < CALC_FFMPEG_VERSION(52, 111, 0)
@@ -1026,7 +1034,7 @@ static const char * icvFFMPEGErrStr(int err)
 
 /* function internal to FFMPEG (libavformat/riff.c) to lookup codec id by fourcc tag*/
 extern "C" {
-    enum CodecID codec_get_bmp_id(unsigned int tag);
+    enum CV_CODEC_ID codec_get_bmp_id(unsigned int tag);
 }
 
 void CvVideoWriter_FFMPEG::init()
@@ -1078,7 +1086,7 @@ static AVFrame * icv_alloc_picture_FFMPEG(int pix_fmt, int width, int height, bo
 
 /* add a video output stream to the container */
 static AVStream *icv_add_video_stream_FFMPEG(AVFormatContext *oc,
-                                             CodecID codec_id,
+                                             CV_CODEC_ID codec_id,
                                              int w, int h, int bitrate,
                                              double fps, int pixel_format)
 {
@@ -1110,7 +1118,7 @@ static AVStream *icv_add_video_stream_FFMPEG(AVFormatContext *oc,
     c->codec_id = oc->oformat->video_codec;
 #endif
 
-    if(codec_id != CODEC_ID_NONE){
+    if(codec_id != CV_CODEC(CODEC_ID_NONE)){
         c->codec_id = codec_id;
     }
 
@@ -1179,10 +1187,10 @@ static AVStream *icv_add_video_stream_FFMPEG(AVFormatContext *oc,
     c->gop_size = 12; /* emit one intra frame every twelve frames at most */
     c->pix_fmt = (PixelFormat) pixel_format;
 
-    if (c->codec_id == CODEC_ID_MPEG2VIDEO) {
+    if (c->codec_id == CV_CODEC(CODEC_ID_MPEG2VIDEO)) {
         c->max_b_frames = 2;
     }
-    if (c->codec_id == CODEC_ID_MPEG1VIDEO || c->codec_id == CODEC_ID_MSMPEG4V3){
+    if (c->codec_id == CV_CODEC(CODEC_ID_MPEG1VIDEO) || c->codec_id == CV_CODEC(CODEC_ID_MSMPEG4V3)){
         /* needed to avoid using macroblocks in which some coeffs overflow
            this doesnt happen with normal video, it just happens here as the
            motion of the chroma plane doesnt match the luma plane */
@@ -1290,7 +1298,7 @@ bool CvVideoWriter_FFMPEG::writeFrame( const unsigned char* data, int step, int
 
 #if LIBAVFORMAT_BUILD < 5231
     // It is not needed in the latest versions of the ffmpeg
-    if( c->codec_id == CODEC_ID_RAWVIDEO && origin != 1 )
+    if( c->codec_id == CV_CODEC(CODEC_ID_RAWVIDEO) && origin != 1 )
     {
         if( !temp_image.data )
         {
@@ -1477,7 +1485,7 @@ void CvVideoWriter_FFMPEG::close()
 bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
                                  double fps, int width, int height, bool is_color )
 {
-    CodecID codec_id = CODEC_ID_NONE;
+    CV_CODEC_ID codec_id = CV_CODEC(CODEC_ID_NONE);
     int err, codec_pix_fmt;
     double bitrate_scale = 1;
 
@@ -1518,11 +1526,11 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
 
     /* Lookup codec_id for given fourcc */
 #if LIBAVCODEC_VERSION_INT<((51<<16)+(49<<8)+0)
-    if( (codec_id = codec_get_bmp_id( fourcc )) == CODEC_ID_NONE )
+    if( (codec_id = codec_get_bmp_id( fourcc )) == CV_CODEC(CODEC_ID_NONE) )
         return false;
 #else
     const struct AVCodecTag * tags[] = { codec_bmp_tags, NULL};
-    if( (codec_id = av_codec_get_id(tags, fourcc)) == CODEC_ID_NONE )
+    if( (codec_id = av_codec_get_id(tags, fourcc)) == CV_CODEC(CODEC_ID_NONE) )
         return false;
 #endif
 
@@ -1544,20 +1552,20 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
     // set a few optimal pixel formats for lossless codecs of interest..
     switch (codec_id) {
 #if LIBAVCODEC_VERSION_INT>((50<<16)+(1<<8)+0)
-    case CODEC_ID_JPEGLS:
+    case CV_CODEC(CODEC_ID_JPEGLS):
         // BGR24 or GRAY8 depending on is_color...
         codec_pix_fmt = input_pix_fmt;
         break;
 #endif
-    case CODEC_ID_HUFFYUV:
+    case CV_CODEC(CODEC_ID_HUFFYUV):
         codec_pix_fmt = PIX_FMT_YUV422P;
         break;
-    case CODEC_ID_MJPEG:
-    case CODEC_ID_LJPEG:
+    case CV_CODEC(CODEC_ID_MJPEG):
+    case CV_CODEC(CODEC_ID_LJPEG):
         codec_pix_fmt = PIX_FMT_YUVJ420P;
         bitrate_scale = 3;
         break;
-    case CODEC_ID_RAWVIDEO:
+    case CV_CODEC(CODEC_ID_RAWVIDEO):
         codec_pix_fmt = input_pix_fmt == PIX_FMT_GRAY8 ||
                         input_pix_fmt == PIX_FMT_GRAY16LE ||
                         input_pix_fmt == PIX_FMT_GRAY16BE ? input_pix_fmt : PIX_FMT_YUV420P;
@@ -1788,7 +1796,7 @@ struct OutputMediaStream_FFMPEG
     void write(unsigned char* data, int size, int keyFrame);
 
     // add a video output stream to the container
-    static AVStream* addVideoStream(AVFormatContext *oc, CodecID codec_id, int w, int h, int bitrate, double fps, PixelFormat pixel_format);
+    static AVStream* addVideoStream(AVFormatContext *oc, CV_CODEC_ID codec_id, int w, int h, int bitrate, double fps, PixelFormat pixel_format);
 
     AVOutputFormat* fmt_;
     AVFormatContext* oc_;
@@ -1835,7 +1843,7 @@ void OutputMediaStream_FFMPEG::close()
     }
 }
 
-AVStream* OutputMediaStream_FFMPEG::addVideoStream(AVFormatContext *oc, CodecID codec_id, int w, int h, int bitrate, double fps, PixelFormat pixel_format)
+AVStream* OutputMediaStream_FFMPEG::addVideoStream(AVFormatContext *oc, CV_CODEC_ID codec_id, int w, int h, int bitrate, double fps, PixelFormat pixel_format)
 {
     #if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(53, 10, 0)
         AVStream* st = avformat_new_stream(oc, 0);
@@ -1915,10 +1923,10 @@ AVStream* OutputMediaStream_FFMPEG::addVideoStream(AVFormatContext *oc, CodecID
     c->gop_size = 12; // emit one intra frame every twelve frames at most
     c->pix_fmt = pixel_format;
 
-    if (c->codec_id == CODEC_ID_MPEG2VIDEO)
+    if (c->codec_id == CV_CODEC(CODEC_ID_MPEG2VIDEO))
         c->max_b_frames = 2;
 
-    if (c->codec_id == CODEC_ID_MPEG1VIDEO || c->codec_id == CODEC_ID_MSMPEG4V3)
+    if (c->codec_id == CV_CODEC(CODEC_ID_MPEG1VIDEO) || c->codec_id == CV_CODEC(CODEC_ID_MSMPEG4V3))
     {
         // needed to avoid using macroblocks in which some coeffs overflow
         // this doesnt happen with normal video, it just happens here as the
@@ -1955,7 +1963,7 @@ bool OutputMediaStream_FFMPEG::open(const char* fileName, int width, int height,
     if (!fmt_)
         return false;
 
-    CodecID codec_id = CODEC_ID_H264;
+    CV_CODEC_ID codec_id = CV_CODEC(CODEC_ID_H264);
 
     // alloc memory for context
     #if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(53, 2, 0)
@@ -2156,23 +2164,23 @@ bool InputMediaStream_FFMPEG::open(const char* fileName, int* codec, int* chroma
 
             switch (enc->codec_id)
             {
-            case CODEC_ID_MPEG1VIDEO:
+            case CV_CODEC(CODEC_ID_MPEG1VIDEO):
                 *codec = ::VideoCodec_MPEG1;
                 break;
 
-            case CODEC_ID_MPEG2VIDEO:
+            case CV_CODEC(CODEC_ID_MPEG2VIDEO):
                 *codec = ::VideoCodec_MPEG2;
                 break;
 
-            case CODEC_ID_MPEG4:
+            case CV_CODEC(CODEC_ID_MPEG4):
                 *codec = ::VideoCodec_MPEG4;
                 break;
 
-            case CODEC_ID_VC1:
+            case CV_CODEC(CODEC_ID_VC1):
                 *codec = ::VideoCodec_VC1;
                 break;
 
-            case CODEC_ID_H264:
+            case CV_CODEC(CODEC_ID_H264):
                 *codec = ::VideoCodec_H264;
                 break;
 
diff --git a/modules/highgui/src/cap_libv4l.cpp b/modules/highgui/src/cap_libv4l.cpp
index 63a2ff96b0..ec048aff77 100644
--- a/modules/highgui/src/cap_libv4l.cpp
+++ b/modules/highgui/src/cap_libv4l.cpp
@@ -1714,6 +1714,7 @@ static void icvCloseCAM_V4L( CvCaptureCAM_V4L* capture ){
 #endif
 
      free(capture->deviceName);
+     capture->deviceName = NULL;
      //v4l2_free_ranges(capture);
      //cvFree((void **)capture);
    }
diff --git a/modules/highgui/src/cap_msmf.cpp b/modules/highgui/src/cap_msmf.cpp
new file mode 100644
index 0000000000..52b780463a
--- /dev/null
+++ b/modules/highgui/src/cap_msmf.cpp
@@ -0,0 +1,2810 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+#if (defined WIN32 || defined _WIN32) && defined HAVE_MSMF
+/*
+   Media Foundation-based Video Capturing module is based on
+   videoInput library by Evgeny Pereguda:
+   http://www.codeproject.com/Articles/559437/Capturing-of-video-from-web-camera-on-Windows-7-an
+   Originaly licensed under The Code Project Open License (CPOL) 1.02:
+   http://www.codeproject.com/info/cpol10.aspx
+*/
+#include <windows.h>
+#include <guiddef.h>
+#include <mfidl.h>
+#include <Mfapi.h>
+#include <mfplay.h>
+#include <mfobjects.h>
+#include "Strsafe.h"
+#include <new>
+#include <map>
+#include <vector>
+#include <string>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#pragma warning(disable:4503)
+#pragma comment(lib, "mfplat")
+#pragma comment(lib, "mf")
+#pragma comment(lib, "mfuuid")
+#pragma comment(lib, "Strmiids")
+#pragma comment(lib, "MinCore_Downlevel")
+struct IMFMediaType;
+struct IMFActivate;
+struct IMFMediaSource;
+struct IMFAttributes;
+namespace
+{
+template <class T> void SafeRelease(T **ppT)
+{
+    if (*ppT)
+    {
+        (*ppT)->Release();
+        *ppT = NULL;
+    }
+}
+ /// Class for printing info into consol
+class DebugPrintOut
+{
+public:
+    ~DebugPrintOut(void);
+    static DebugPrintOut& getInstance();
+    void printOut(const wchar_t *format, ...);
+    void setVerbose(bool state);
+    bool verbose;
+private:
+    DebugPrintOut(void);
+};
+// Structure for collecting info about types of video, which are supported by current video device
+struct MediaType
+{
+    unsigned int MF_MT_FRAME_SIZE;
+    unsigned int height;
+    unsigned int width;
+    unsigned int MF_MT_YUV_MATRIX;
+    unsigned int MF_MT_VIDEO_LIGHTING;
+    unsigned int MF_MT_DEFAULT_STRIDE;
+    unsigned int MF_MT_VIDEO_CHROMA_SITING;
+    GUID MF_MT_AM_FORMAT_TYPE;
+    wchar_t *pMF_MT_AM_FORMAT_TYPEName;
+    unsigned int MF_MT_FIXED_SIZE_SAMPLES;
+    unsigned int MF_MT_VIDEO_NOMINAL_RANGE;
+    unsigned int MF_MT_FRAME_RATE;
+    unsigned int MF_MT_FRAME_RATE_low;
+    unsigned int MF_MT_PIXEL_ASPECT_RATIO;
+    unsigned int MF_MT_PIXEL_ASPECT_RATIO_low;
+    unsigned int MF_MT_ALL_SAMPLES_INDEPENDENT;
+    unsigned int MF_MT_FRAME_RATE_RANGE_MIN;
+    unsigned int MF_MT_FRAME_RATE_RANGE_MIN_low;
+    unsigned int MF_MT_SAMPLE_SIZE;
+    unsigned int MF_MT_VIDEO_PRIMARIES;
+    unsigned int MF_MT_INTERLACE_MODE;
+    unsigned int MF_MT_FRAME_RATE_RANGE_MAX;
+    unsigned int MF_MT_FRAME_RATE_RANGE_MAX_low;
+    GUID MF_MT_MAJOR_TYPE;
+    GUID MF_MT_SUBTYPE;
+    wchar_t *pMF_MT_MAJOR_TYPEName;
+    wchar_t *pMF_MT_SUBTYPEName;
+    MediaType();
+    ~MediaType();
+    void Clear();
+};
+/// Class for parsing info from IMFMediaType into the local MediaType
+class FormatReader
+{
+public:
+    static MediaType Read(IMFMediaType *pType);
+    ~FormatReader(void);
+private:
+    FormatReader(void);
+};
+DWORD WINAPI MainThreadFunction( LPVOID lpParam );
+typedef void(*emergensyStopEventCallback)(int, void *);
+typedef unsigned char BYTE;
+class RawImage
+{
+public:
+    ~RawImage(void);
+    // Function of creation of the instance of the class
+    static long CreateInstance(RawImage **ppRImage,unsigned int size);
+    void setCopy(const BYTE * pSampleBuffer);
+    void fastCopy(const BYTE * pSampleBuffer);
+    unsigned char * getpPixels();
+    bool isNew();
+    unsigned int getSize();
+private:
+    bool ri_new;
+    unsigned int ri_size;
+    unsigned char *ri_pixels;
+    RawImage(unsigned int size);
+};
+// Class for grabbing image from video stream
+class ImageGrabber : public IMFSampleGrabberSinkCallback
+{
+public:
+    ~ImageGrabber(void);
+    HRESULT initImageGrabber(IMFMediaSource *pSource, GUID VideoFormat);
+    HRESULT startGrabbing(void);
+    void stopGrabbing();
+    RawImage *getRawImage();
+    // Function of creation of the instance of the class
+    static HRESULT CreateInstance(ImageGrabber **ppIG,unsigned int deviceID);
+private:
+    bool ig_RIE;
+    bool ig_Close;
+    long m_cRef;
+    unsigned int ig_DeviceID;
+    IMFMediaSource *ig_pSource;
+    IMFMediaSession *ig_pSession;
+    IMFTopology *ig_pTopology;
+    RawImage *ig_RIFirst;
+    RawImage *ig_RISecond;
+    RawImage *ig_RIOut;
+    ImageGrabber(unsigned int deviceID);
+    HRESULT CreateTopology(IMFMediaSource *pSource, IMFActivate *pSinkActivate, IMFTopology **ppTopo);
+    HRESULT AddSourceNode(
+    IMFTopology *pTopology,
+    IMFMediaSource *pSource,
+    IMFPresentationDescriptor *pPD,
+    IMFStreamDescriptor *pSD,
+    IMFTopologyNode **ppNode);
+    HRESULT AddOutputNode(
+    IMFTopology *pTopology,
+    IMFActivate *pActivate,
+    DWORD dwId,
+    IMFTopologyNode **ppNode);
+    // IUnknown methods
+    STDMETHODIMP QueryInterface(REFIID iid, void** ppv);
+    STDMETHODIMP_(ULONG) AddRef();
+    STDMETHODIMP_(ULONG) Release();
+    // IMFClockStateSink methods
+    STDMETHODIMP OnClockStart(MFTIME hnsSystemTime, LONGLONG llClockStartOffset);
+    STDMETHODIMP OnClockStop(MFTIME hnsSystemTime);
+    STDMETHODIMP OnClockPause(MFTIME hnsSystemTime);
+    STDMETHODIMP OnClockRestart(MFTIME hnsSystemTime);
+    STDMETHODIMP OnClockSetRate(MFTIME hnsSystemTime, float flRate);
+    // IMFSampleGrabberSinkCallback methods
+    STDMETHODIMP OnSetPresentationClock(IMFPresentationClock* pClock);
+    STDMETHODIMP OnProcessSample(REFGUID guidMajorMediaType, DWORD dwSampleFlags,
+        LONGLONG llSampleTime, LONGLONG llSampleDuration, const BYTE * pSampleBuffer,
+        DWORD dwSampleSize);
+    STDMETHODIMP OnShutdown();
+};
+/// Class for controlling of thread of the grabbing raw data from video device
+class ImageGrabberThread
+{
+    friend DWORD WINAPI MainThreadFunction( LPVOID lpParam );
+public:
+    ~ImageGrabberThread(void);
+    static HRESULT CreateInstance(ImageGrabberThread **ppIGT, IMFMediaSource *pSource, unsigned int deviceID);
+    void start();
+    void stop();
+    void setEmergencyStopEvent(void *userData, void(*func)(int, void *));
+    ImageGrabber *getImageGrabber();
+protected:
+    virtual void run();
+private:
+    ImageGrabberThread(IMFMediaSource *pSource, unsigned int deviceID);
+    HANDLE igt_Handle;
+    DWORD   igt_ThreadIdArray;
+    ImageGrabber *igt_pImageGrabber;
+    emergensyStopEventCallback igt_func;
+    void *igt_userData;
+    bool igt_stop;
+    unsigned int igt_DeviceID;
+};
+// Structure for collecting info about one parametr of current video device
+struct Parametr
+{
+    long CurrentValue;
+    long Min;
+    long Max;
+    long Step;
+    long Default;
+    long Flag;
+    Parametr();
+};
+// Structure for collecting info about 17 parametrs of current video device
+struct CamParametrs
+{
+        Parametr Brightness;
+        Parametr Contrast;
+        Parametr Hue;
+        Parametr Saturation;
+        Parametr Sharpness;
+        Parametr Gamma;
+        Parametr ColorEnable;
+        Parametr WhiteBalance;
+        Parametr BacklightCompensation;
+        Parametr Gain;
+        Parametr Pan;
+        Parametr Tilt;
+        Parametr Roll;
+        Parametr Zoom;
+        Parametr Exposure;
+        Parametr Iris;
+        Parametr Focus;
+};
+typedef std::wstring String;
+typedef std::vector<int> vectorNum;
+typedef std::map<String, vectorNum> SUBTYPEMap;
+typedef std::map<UINT64, SUBTYPEMap> FrameRateMap;
+typedef void(*emergensyStopEventCallback)(int, void *);
+/// Class for controlling of video device
+class videoDevice
+{
+public:
+    videoDevice(void);
+    ~videoDevice(void);
+    void closeDevice();
+    CamParametrs getParametrs();
+    void setParametrs(CamParametrs parametrs);
+    void setEmergencyStopEvent(void *userData, void(*func)(int, void *));
+    long readInfoOfDevice(IMFActivate *pActivate, unsigned int Num);
+    wchar_t *getName();
+    int getCountFormats();
+    unsigned int getWidth();
+    unsigned int getHeight();
+    MediaType getFormat(unsigned int id);
+    bool setupDevice(unsigned int w, unsigned int h, unsigned int idealFramerate = 0);
+    bool setupDevice(unsigned int id);
+    bool isDeviceSetup();
+    bool isDeviceMediaSource();
+    bool isDeviceRawDataSource();
+    bool isFrameNew();
+    IMFMediaSource *getMediaSource();
+    RawImage *getRawImageOut();
+private:
+    enum typeLock
+    {
+        MediaSourceLock,
+        RawDataLock,
+        OpenLock
+    } vd_LockOut;
+    wchar_t *vd_pFriendlyName;
+    ImageGrabberThread *vd_pImGrTh;
+    CamParametrs vd_PrevParametrs;
+    unsigned int vd_Width;
+    unsigned int vd_Height;
+    unsigned int vd_CurrentNumber;
+    bool vd_IsSetuped;
+    std::map<UINT64, FrameRateMap> vd_CaptureFormats;
+    std::vector<MediaType> vd_CurrentFormats;
+    IMFMediaSource *vd_pSource;
+    emergensyStopEventCallback vd_func;
+    void *vd_userData;
+    long enumerateCaptureFormats(IMFMediaSource *pSource);
+    long setDeviceFormat(IMFMediaSource *pSource, unsigned long dwFormatIndex);
+    void buildLibraryofTypes();
+    int findType(unsigned int size, unsigned int frameRate = 0);
+    long resetDevice(IMFActivate *pActivate);
+    long initDevice();
+    long checkDevice(IMFAttributes *pAttributes, IMFActivate **pDevice);
+};
+/// Class for managing of list of video devices
+class videoDevices
+{
+public:
+    ~videoDevices(void);
+    long initDevices(IMFAttributes *pAttributes);
+    static videoDevices& getInstance();
+    videoDevice *getDevice(unsigned int i);
+    unsigned int getCount();
+    void clearDevices();
+private:
+    UINT32 count;
+    std::vector<videoDevice *> vds_Devices;
+    videoDevices(void);
+};
+// Class for creating of Media Foundation context
+class Media_Foundation
+{
+public:
+    virtual ~Media_Foundation(void);
+    static Media_Foundation& getInstance();
+    bool buildListOfDevices();
+private:
+    Media_Foundation(void);
+};
+/// The only visiable class for controlling of video devices in format singelton
+class videoInput
+{
+public:
+    virtual ~videoInput(void);
+    // Getting of static instance of videoInput class
+    static videoInput& getInstance();
+    // Closing video device with deviceID
+    void closeDevice(int deviceID);
+    // Setting callback function for emergency events(for example: removing video device with deviceID) with userData
+    void setEmergencyStopEvent(int deviceID, void *userData, void(*func)(int, void *));
+    // Closing all devices
+    void closeAllDevices();
+    // Getting of parametrs of video device with deviceID
+    CamParametrs getParametrs(int deviceID);
+    // Setting of parametrs of video device with deviceID
+    void setParametrs(int deviceID, CamParametrs parametrs);
+    // Getting numbers of existence videodevices with listing in consol
+    unsigned int listDevices(bool silent = false);
+    // Getting numbers of formats, which are supported by videodevice with deviceID
+    unsigned int getCountFormats(int deviceID);
+    // Getting width of image, which is getting from videodevice with deviceID
+    unsigned int getWidth(int deviceID);
+    // Getting height of image, which is getting from videodevice with deviceID
+    unsigned int getHeight(int deviceID);
+    // Getting name of videodevice with deviceID
+    wchar_t *getNameVideoDevice(int deviceID);
+    // Getting interface MediaSource for Media Foundation from videodevice with deviceID
+    IMFMediaSource *getMediaSource(int deviceID);
+    // Getting format with id, which is supported by videodevice with deviceID
+    MediaType getFormat(int deviceID, int unsigned id);
+    // Checking of existence of the suitable video devices
+    bool isDevicesAcceable();
+    // Checking of using the videodevice with deviceID
+    bool isDeviceSetup(int deviceID);
+    // Checking of using MediaSource from videodevice with deviceID
+    bool isDeviceMediaSource(int deviceID);
+    // Checking of using Raw Data of pixels from videodevice with deviceID
+    bool isDeviceRawDataSource(int deviceID);
+    // Setting of the state of outprinting info in console
+    static void setVerbose(bool state);
+    // Initialization of video device with deviceID by media type with id
+    bool setupDevice(int deviceID, unsigned int id = 0);
+    // Initialization of video device with deviceID by wisth w, height h and fps idealFramerate
+    bool setupDevice(int deviceID, unsigned int w, unsigned int h, unsigned int idealFramerate = 30);
+    // Checking of recivig of new frame from video device with deviceID
+    bool isFrameNew(int deviceID);
+    // Writing of Raw Data pixels from video device with deviceID with correction of RedAndBlue flipping flipRedAndBlue and vertical flipping flipImage
+    bool getPixels(int deviceID, unsigned char * pixels, bool flipRedAndBlue = false, bool flipImage = false);
+private:
+    bool accessToDevices;
+    videoInput(void);
+    void processPixels(unsigned char * src, unsigned char * dst, unsigned int width, unsigned int height, unsigned int bpp, bool bRGB, bool bFlip);
+    void updateListOfDevices();
+};
+DebugPrintOut::DebugPrintOut(void):verbose(true)
+{
+}
+DebugPrintOut::~DebugPrintOut(void)
+{
+}
+DebugPrintOut& DebugPrintOut::getInstance()
+{
+    static DebugPrintOut instance;
+    return instance;
+}
+void DebugPrintOut::printOut(const wchar_t *format, ...)
+{
+    if(verbose)
+    {
+        int i = 0;
+        wchar_t *p = NULL;
+        va_list args;
+        va_start(args, format);
+        if(wcscmp(format, L"%i"))
+        {
+            i = va_arg (args, int);
+        }
+        if(wcscmp(format, L"%s"))
+        {
+            p = va_arg (args, wchar_t *);
+        }
+        wprintf(format, i,p);
+        va_end (args);
+    }
+}
+void DebugPrintOut::setVerbose(bool state)
+{
+    verbose = state;
+}
+LPCWSTR GetGUIDNameConstNew(const GUID& guid);
+HRESULT GetGUIDNameNew(const GUID& guid, WCHAR **ppwsz);
+HRESULT LogAttributeValueByIndexNew(IMFAttributes *pAttr, DWORD index);
+HRESULT SpecialCaseAttributeValueNew(GUID guid, const PROPVARIANT& var, MediaType &out);
+unsigned int *GetParametr(GUID guid, MediaType &out)
+{
+    if(guid == MF_MT_YUV_MATRIX)
+        return &(out.MF_MT_YUV_MATRIX);
+    if(guid == MF_MT_VIDEO_LIGHTING)
+        return &(out.MF_MT_VIDEO_LIGHTING);
+    if(guid == MF_MT_DEFAULT_STRIDE)
+        return &(out.MF_MT_DEFAULT_STRIDE);
+    if(guid == MF_MT_VIDEO_CHROMA_SITING)
+        return &(out.MF_MT_VIDEO_CHROMA_SITING);
+    if(guid == MF_MT_VIDEO_NOMINAL_RANGE)
+        return &(out.MF_MT_VIDEO_NOMINAL_RANGE);
+    if(guid == MF_MT_ALL_SAMPLES_INDEPENDENT)
+        return &(out.MF_MT_ALL_SAMPLES_INDEPENDENT);
+    if(guid == MF_MT_FIXED_SIZE_SAMPLES)
+        return &(out.MF_MT_FIXED_SIZE_SAMPLES);
+    if(guid == MF_MT_SAMPLE_SIZE)
+        return &(out.MF_MT_SAMPLE_SIZE);
+    if(guid == MF_MT_VIDEO_PRIMARIES)
+        return &(out.MF_MT_VIDEO_PRIMARIES);
+    if(guid == MF_MT_INTERLACE_MODE)
+        return &(out.MF_MT_INTERLACE_MODE);
+    return NULL;
+}
+HRESULT LogAttributeValueByIndexNew(IMFAttributes *pAttr, DWORD index, MediaType &out)
+{
+    WCHAR *pGuidName = NULL;
+    WCHAR *pGuidValName = NULL;
+    GUID guid = { 0 };
+    PROPVARIANT var;
+    PropVariantInit(&var);
+    HRESULT hr = pAttr->GetItemByIndex(index, &guid, &var);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    hr = GetGUIDNameNew(guid, &pGuidName);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    hr = SpecialCaseAttributeValueNew(guid, var, out);
+    unsigned int *p;
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    if (hr == S_FALSE)
+    {
+        switch (var.vt)
+        {
+        case VT_UI4:
+            p = GetParametr(guid, out);
+            if(p)
+            {
+                *p = var.ulVal;
+            }
+            break;
+        case VT_UI8:
+            break;
+        case VT_R8:
+            break;
+        case VT_CLSID:
+            if(guid == MF_MT_AM_FORMAT_TYPE)
+            {
+                hr = GetGUIDNameNew(*var.puuid, &pGuidValName);
+                if (SUCCEEDED(hr))
+                {
+                    out.MF_MT_AM_FORMAT_TYPE = MF_MT_AM_FORMAT_TYPE;
+                    out.pMF_MT_AM_FORMAT_TYPEName = pGuidValName;
+                    pGuidValName = NULL;
+                }
+            }
+            if(guid == MF_MT_MAJOR_TYPE)
+            {
+                hr = GetGUIDNameNew(*var.puuid, &pGuidValName);
+                if (SUCCEEDED(hr))
+                {
+                    out.MF_MT_MAJOR_TYPE = MF_MT_MAJOR_TYPE;
+                    out.pMF_MT_MAJOR_TYPEName = pGuidValName;
+                    pGuidValName = NULL;
+                }
+            }
+            if(guid == MF_MT_SUBTYPE)
+            {
+                hr = GetGUIDNameNew(*var.puuid, &pGuidValName);
+                if (SUCCEEDED(hr))
+                {
+                    out.MF_MT_SUBTYPE = MF_MT_SUBTYPE;
+                    out.pMF_MT_SUBTYPEName = pGuidValName;
+                    pGuidValName = NULL;
+                }
+            }
+            break;
+        case VT_LPWSTR:
+            break;
+        case VT_VECTOR | VT_UI1:
+            break;
+        case VT_UNKNOWN:
+            break;
+        default:
+            break;
+        }
+    }
+done:
+    CoTaskMemFree(pGuidName);
+    CoTaskMemFree(pGuidValName);
+    PropVariantClear(&var);
+    return hr;
+}
+HRESULT GetGUIDNameNew(const GUID& guid, WCHAR **ppwsz)
+{
+    HRESULT hr = S_OK;
+    WCHAR *pName = NULL;
+    LPCWSTR pcwsz = GetGUIDNameConstNew(guid);
+    if (pcwsz)
+    {
+        size_t cchLength = 0;
+        hr = StringCchLengthW(pcwsz, STRSAFE_MAX_CCH, &cchLength);
+        if (FAILED(hr))
+        {
+            goto done;
+        }
+        pName = (WCHAR*)CoTaskMemAlloc((cchLength + 1) * sizeof(WCHAR));
+        if (pName == NULL)
+        {
+            hr = E_OUTOFMEMORY;
+            goto done;
+        }
+        hr = StringCchCopyW(pName, cchLength + 1, pcwsz);
+        if (FAILED(hr))
+        {
+            goto done;
+        }
+    }
+    else
+    {
+        hr = StringFromCLSID(guid, &pName);
+    }
+done:
+    if (FAILED(hr))
+    {
+        *ppwsz = NULL;
+        CoTaskMemFree(pName);
+    }
+    else
+    {
+        *ppwsz = pName;
+    }
+    return hr;
+}
+void LogUINT32AsUINT64New(const PROPVARIANT& var, UINT32 &uHigh, UINT32 &uLow)
+{
+    Unpack2UINT32AsUINT64(var.uhVal.QuadPart, &uHigh, &uLow);
+}
+float OffsetToFloatNew(const MFOffset& offset)
+{
+    return offset.value + (static_cast<float>(offset.fract) / 65536.0f);
+}
+HRESULT LogVideoAreaNew(const PROPVARIANT& var)
+{
+    if (var.caub.cElems < sizeof(MFVideoArea))
+    {
+        return S_OK;
+    }
+    return S_OK;
+}
+HRESULT SpecialCaseAttributeValueNew(GUID guid, const PROPVARIANT& var, MediaType &out)
+{
+    if (guid == MF_MT_FRAME_SIZE)
+    {
+        UINT32 uHigh = 0, uLow = 0;
+        LogUINT32AsUINT64New(var, uHigh, uLow);
+        out.width = uHigh;
+        out.height = uLow;
+        out.MF_MT_FRAME_SIZE = out.width * out.height;
+    }
+    else
+    if (guid == MF_MT_FRAME_RATE)
+    {
+        UINT32 uHigh = 0, uLow = 0;
+        LogUINT32AsUINT64New(var, uHigh, uLow);
+        out.MF_MT_FRAME_RATE = uHigh;
+        out.MF_MT_FRAME_RATE_low = uLow;
+    }
+    else
+    if (guid == MF_MT_FRAME_RATE_RANGE_MAX)
+    {
+        UINT32 uHigh = 0, uLow = 0;
+        LogUINT32AsUINT64New(var, uHigh, uLow);
+        out.MF_MT_FRAME_RATE_RANGE_MAX = uHigh;
+        out.MF_MT_FRAME_RATE_RANGE_MAX_low = uLow;
+    }
+    else
+    if (guid == MF_MT_FRAME_RATE_RANGE_MIN)
+    {
+        UINT32 uHigh = 0, uLow = 0;
+        LogUINT32AsUINT64New(var, uHigh, uLow);
+        out.MF_MT_FRAME_RATE_RANGE_MIN = uHigh;
+        out.MF_MT_FRAME_RATE_RANGE_MIN_low = uLow;
+    }
+    else
+    if (guid == MF_MT_PIXEL_ASPECT_RATIO)
+    {
+        UINT32 uHigh = 0, uLow = 0;
+        LogUINT32AsUINT64New(var, uHigh, uLow);
+        out.MF_MT_PIXEL_ASPECT_RATIO = uHigh;
+        out.MF_MT_PIXEL_ASPECT_RATIO_low = uLow;
+    }
+    else
+    {
+        return S_FALSE;
+    }
+    return S_OK;
+}
+#ifndef IF_EQUAL_RETURN
+#define IF_EQUAL_RETURN(param, val) if(val == param) return L#val
+#endif
+LPCWSTR GetGUIDNameConstNew(const GUID& guid)
+{
+    IF_EQUAL_RETURN(guid, MF_MT_MAJOR_TYPE);
+    IF_EQUAL_RETURN(guid, MF_MT_MAJOR_TYPE);
+    IF_EQUAL_RETURN(guid, MF_MT_SUBTYPE);
+    IF_EQUAL_RETURN(guid, MF_MT_ALL_SAMPLES_INDEPENDENT);
+    IF_EQUAL_RETURN(guid, MF_MT_FIXED_SIZE_SAMPLES);
+    IF_EQUAL_RETURN(guid, MF_MT_COMPRESSED);
+    IF_EQUAL_RETURN(guid, MF_MT_SAMPLE_SIZE);
+    IF_EQUAL_RETURN(guid, MF_MT_WRAPPED_TYPE);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_NUM_CHANNELS);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_SAMPLES_PER_SECOND);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_FLOAT_SAMPLES_PER_SECOND);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_AVG_BYTES_PER_SECOND);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_BLOCK_ALIGNMENT);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_BITS_PER_SAMPLE);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_VALID_BITS_PER_SAMPLE);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_SAMPLES_PER_BLOCK);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_CHANNEL_MASK);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_FOLDDOWN_MATRIX);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_WMADRC_PEAKREF);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_WMADRC_PEAKTARGET);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_WMADRC_AVGREF);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_WMADRC_AVGTARGET);
+    IF_EQUAL_RETURN(guid, MF_MT_AUDIO_PREFER_WAVEFORMATEX);
+    IF_EQUAL_RETURN(guid, MF_MT_AAC_PAYLOAD_TYPE);
+    IF_EQUAL_RETURN(guid, MF_MT_AAC_AUDIO_PROFILE_LEVEL_INDICATION);
+    IF_EQUAL_RETURN(guid, MF_MT_FRAME_SIZE);
+    IF_EQUAL_RETURN(guid, MF_MT_FRAME_RATE);
+    IF_EQUAL_RETURN(guid, MF_MT_FRAME_RATE_RANGE_MAX);
+    IF_EQUAL_RETURN(guid, MF_MT_FRAME_RATE_RANGE_MIN);
+    IF_EQUAL_RETURN(guid, MF_MT_PIXEL_ASPECT_RATIO);
+    IF_EQUAL_RETURN(guid, MF_MT_DRM_FLAGS);
+    IF_EQUAL_RETURN(guid, MF_MT_PAD_CONTROL_FLAGS);
+    IF_EQUAL_RETURN(guid, MF_MT_SOURCE_CONTENT_HINT);
+    IF_EQUAL_RETURN(guid, MF_MT_VIDEO_CHROMA_SITING);
+    IF_EQUAL_RETURN(guid, MF_MT_INTERLACE_MODE);
+    IF_EQUAL_RETURN(guid, MF_MT_TRANSFER_FUNCTION);
+    IF_EQUAL_RETURN(guid, MF_MT_VIDEO_PRIMARIES);
+    IF_EQUAL_RETURN(guid, MF_MT_CUSTOM_VIDEO_PRIMARIES);
+    IF_EQUAL_RETURN(guid, MF_MT_YUV_MATRIX);
+    IF_EQUAL_RETURN(guid, MF_MT_VIDEO_LIGHTING);
+    IF_EQUAL_RETURN(guid, MF_MT_VIDEO_NOMINAL_RANGE);
+    IF_EQUAL_RETURN(guid, MF_MT_GEOMETRIC_APERTURE);
+    IF_EQUAL_RETURN(guid, MF_MT_MINIMUM_DISPLAY_APERTURE);
+    IF_EQUAL_RETURN(guid, MF_MT_PAN_SCAN_APERTURE);
+    IF_EQUAL_RETURN(guid, MF_MT_PAN_SCAN_ENABLED);
+    IF_EQUAL_RETURN(guid, MF_MT_AVG_BITRATE);
+    IF_EQUAL_RETURN(guid, MF_MT_AVG_BIT_ERROR_RATE);
+    IF_EQUAL_RETURN(guid, MF_MT_MAX_KEYFRAME_SPACING);
+    IF_EQUAL_RETURN(guid, MF_MT_DEFAULT_STRIDE);
+    IF_EQUAL_RETURN(guid, MF_MT_PALETTE);
+    IF_EQUAL_RETURN(guid, MF_MT_USER_DATA);
+    IF_EQUAL_RETURN(guid, MF_MT_AM_FORMAT_TYPE);
+    IF_EQUAL_RETURN(guid, MF_MT_MPEG_START_TIME_CODE);
+    IF_EQUAL_RETURN(guid, MF_MT_MPEG2_PROFILE);
+    IF_EQUAL_RETURN(guid, MF_MT_MPEG2_LEVEL);
+    IF_EQUAL_RETURN(guid, MF_MT_MPEG2_FLAGS);
+    IF_EQUAL_RETURN(guid, MF_MT_MPEG_SEQUENCE_HEADER);
+    IF_EQUAL_RETURN(guid, MF_MT_DV_AAUX_SRC_PACK_0);
+    IF_EQUAL_RETURN(guid, MF_MT_DV_AAUX_CTRL_PACK_0);
+    IF_EQUAL_RETURN(guid, MF_MT_DV_AAUX_SRC_PACK_1);
+    IF_EQUAL_RETURN(guid, MF_MT_DV_AAUX_CTRL_PACK_1);
+    IF_EQUAL_RETURN(guid, MF_MT_DV_VAUX_SRC_PACK);
+    IF_EQUAL_RETURN(guid, MF_MT_DV_VAUX_CTRL_PACK);
+    IF_EQUAL_RETURN(guid, MF_MT_ARBITRARY_HEADER);
+    IF_EQUAL_RETURN(guid, MF_MT_ARBITRARY_FORMAT);
+    IF_EQUAL_RETURN(guid, MF_MT_IMAGE_LOSS_TOLERANT);
+    IF_EQUAL_RETURN(guid, MF_MT_MPEG4_SAMPLE_DESCRIPTION);
+    IF_EQUAL_RETURN(guid, MF_MT_MPEG4_CURRENT_SAMPLE_ENTRY);
+    IF_EQUAL_RETURN(guid, MF_MT_ORIGINAL_4CC);
+    IF_EQUAL_RETURN(guid, MF_MT_ORIGINAL_WAVE_FORMAT_TAG);
+    // Media types
+    IF_EQUAL_RETURN(guid, MFMediaType_Audio);
+    IF_EQUAL_RETURN(guid, MFMediaType_Video);
+    IF_EQUAL_RETURN(guid, MFMediaType_Protected);
+    IF_EQUAL_RETURN(guid, MFMediaType_SAMI);
+    IF_EQUAL_RETURN(guid, MFMediaType_Script);
+    IF_EQUAL_RETURN(guid, MFMediaType_Image);
+    IF_EQUAL_RETURN(guid, MFMediaType_HTML);
+    IF_EQUAL_RETURN(guid, MFMediaType_Binary);
+    IF_EQUAL_RETURN(guid, MFMediaType_FileTransfer);
+    IF_EQUAL_RETURN(guid, MFVideoFormat_AI44); //     FCC('AI44')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_ARGB32); //   D3DFMT_A8R8G8B8
+    IF_EQUAL_RETURN(guid, MFVideoFormat_AYUV); //     FCC('AYUV')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_DV25); //     FCC('dv25')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_DV50); //     FCC('dv50')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_DVH1); //     FCC('dvh1')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_DVSD); //     FCC('dvsd')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_DVSL); //     FCC('dvsl')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_H264); //     FCC('H264')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_I420); //     FCC('I420')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_IYUV); //     FCC('IYUV')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_M4S2); //     FCC('M4S2')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_MJPG);
+    IF_EQUAL_RETURN(guid, MFVideoFormat_MP43); //     FCC('MP43')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_MP4S); //     FCC('MP4S')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_MP4V); //     FCC('MP4V')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_MPG1); //     FCC('MPG1')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_MSS1); //     FCC('MSS1')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_MSS2); //     FCC('MSS2')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_NV11); //     FCC('NV11')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_NV12); //     FCC('NV12')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_P010); //     FCC('P010')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_P016); //     FCC('P016')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_P210); //     FCC('P210')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_P216); //     FCC('P216')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_RGB24); //    D3DFMT_R8G8B8
+    IF_EQUAL_RETURN(guid, MFVideoFormat_RGB32); //    D3DFMT_X8R8G8B8
+    IF_EQUAL_RETURN(guid, MFVideoFormat_RGB555); //   D3DFMT_X1R5G5B5
+    IF_EQUAL_RETURN(guid, MFVideoFormat_RGB565); //   D3DFMT_R5G6B5
+    IF_EQUAL_RETURN(guid, MFVideoFormat_RGB8);
+    IF_EQUAL_RETURN(guid, MFVideoFormat_UYVY); //     FCC('UYVY')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_v210); //     FCC('v210')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_v410); //     FCC('v410')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_WMV1); //     FCC('WMV1')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_WMV2); //     FCC('WMV2')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_WMV3); //     FCC('WMV3')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_WVC1); //     FCC('WVC1')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_Y210); //     FCC('Y210')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_Y216); //     FCC('Y216')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_Y410); //     FCC('Y410')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_Y416); //     FCC('Y416')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_Y41P);
+    IF_EQUAL_RETURN(guid, MFVideoFormat_Y41T);
+    IF_EQUAL_RETURN(guid, MFVideoFormat_YUY2); //     FCC('YUY2')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_YV12); //     FCC('YV12')
+    IF_EQUAL_RETURN(guid, MFVideoFormat_YVYU);
+    IF_EQUAL_RETURN(guid, MFAudioFormat_PCM); //              WAVE_FORMAT_PCM
+    IF_EQUAL_RETURN(guid, MFAudioFormat_Float); //            WAVE_FORMAT_IEEE_FLOAT
+    IF_EQUAL_RETURN(guid, MFAudioFormat_DTS); //              WAVE_FORMAT_DTS
+    IF_EQUAL_RETURN(guid, MFAudioFormat_Dolby_AC3_SPDIF); //  WAVE_FORMAT_DOLBY_AC3_SPDIF
+    IF_EQUAL_RETURN(guid, MFAudioFormat_DRM); //              WAVE_FORMAT_DRM
+    IF_EQUAL_RETURN(guid, MFAudioFormat_WMAudioV8); //        WAVE_FORMAT_WMAUDIO2
+    IF_EQUAL_RETURN(guid, MFAudioFormat_WMAudioV9); //        WAVE_FORMAT_WMAUDIO3
+    IF_EQUAL_RETURN(guid, MFAudioFormat_WMAudio_Lossless); // WAVE_FORMAT_WMAUDIO_LOSSLESS
+    IF_EQUAL_RETURN(guid, MFAudioFormat_WMASPDIF); //         WAVE_FORMAT_WMASPDIF
+    IF_EQUAL_RETURN(guid, MFAudioFormat_MSP1); //             WAVE_FORMAT_WMAVOICE9
+    IF_EQUAL_RETURN(guid, MFAudioFormat_MP3); //              WAVE_FORMAT_MPEGLAYER3
+    IF_EQUAL_RETURN(guid, MFAudioFormat_MPEG); //             WAVE_FORMAT_MPEG
+    IF_EQUAL_RETURN(guid, MFAudioFormat_AAC); //              WAVE_FORMAT_MPEG_HEAAC
+    IF_EQUAL_RETURN(guid, MFAudioFormat_ADTS); //             WAVE_FORMAT_MPEG_ADTS_AAC
+    return NULL;
+}
+FormatReader::FormatReader(void)
+{
+}
+MediaType FormatReader::Read(IMFMediaType *pType)
+{
+    UINT32 count = 0;
+    HRESULT hr = S_OK;
+    MediaType out;
+    hr = pType->LockStore();
+    if (FAILED(hr))
+    {
+        return out;
+    }
+    hr = pType->GetCount(&count);
+    if (FAILED(hr))
+    {
+        return out;
+    }
+    for (UINT32 i = 0; i < count; i++)
+    {
+        hr = LogAttributeValueByIndexNew(pType, i, out);
+        if (FAILED(hr))
+        {
+            break;
+        }
+    }
+    hr = pType->UnlockStore();
+    if (FAILED(hr))
+    {
+        return out;
+    }
+    return out;
+}
+FormatReader::~FormatReader(void)
+{
+}
+#define CHECK_HR(x) if (FAILED(x)) { goto done; }
+ImageGrabber::ImageGrabber(unsigned int deviceID): m_cRef(1), ig_DeviceID(deviceID), ig_pSource(NULL), ig_pSession(NULL), ig_pTopology(NULL), ig_RIE(true), ig_Close(false)
+{
+}
+ImageGrabber::~ImageGrabber(void)
+{
+    if (ig_pSession)
+    {
+        ig_pSession->Shutdown();
+    }
+    //SafeRelease(&ig_pSession);
+    //SafeRelease(&ig_pTopology);
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: Destroing instance of the ImageGrabber class \n", ig_DeviceID);
+}
+HRESULT ImageGrabber::initImageGrabber(IMFMediaSource *pSource, GUID VideoFormat)
+{
+    IMFActivate *pSinkActivate = NULL;
+    IMFMediaType *pType = NULL;
+    IMFPresentationDescriptor *pPD = NULL;
+    IMFStreamDescriptor *pSD = NULL;
+    IMFMediaTypeHandler *pHandler = NULL;
+    IMFMediaType *pCurrentType = NULL;
+    HRESULT hr = S_OK;
+    MediaType MT;
+     // Clean up.
+    if (ig_pSession)
+    {
+        ig_pSession->Shutdown();
+    }
+    SafeRelease(&ig_pSession);
+    SafeRelease(&ig_pTopology);
+    ig_pSource = pSource;
+    hr = pSource->CreatePresentationDescriptor(&pPD);
+    if (FAILED(hr))
+        goto err;
+    BOOL fSelected;
+    hr = pPD->GetStreamDescriptorByIndex(0, &fSelected, &pSD);
+    if (FAILED(hr))
+        goto err;
+    hr = pSD->GetMediaTypeHandler(&pHandler);
+    if (FAILED(hr))
+        goto err;
+    DWORD cTypes = 0;
+    hr = pHandler->GetMediaTypeCount(&cTypes);
+    if (FAILED(hr))
+        goto err;
+    if(cTypes > 0)
+    {
+        hr = pHandler->GetCurrentMediaType(&pCurrentType);
+        if (FAILED(hr))
+            goto err;
+        MT = FormatReader::Read(pCurrentType);
+    }
+err:
+    SafeRelease(&pPD);
+    SafeRelease(&pSD);
+    SafeRelease(&pHandler);
+    SafeRelease(&pCurrentType);
+    unsigned int sizeRawImage = 0;
+    if(VideoFormat == MFVideoFormat_RGB24)
+    {
+        sizeRawImage = MT.MF_MT_FRAME_SIZE * 3;
+    }
+    else if(VideoFormat == MFVideoFormat_RGB32)
+    {
+        sizeRawImage = MT.MF_MT_FRAME_SIZE * 4;
+    }
+    CHECK_HR(hr = RawImage::CreateInstance(&ig_RIFirst, sizeRawImage));
+    CHECK_HR(hr = RawImage::CreateInstance(&ig_RISecond, sizeRawImage));
+    ig_RIOut = ig_RISecond;
+    // Configure the media type that the Sample Grabber will receive.
+    // Setting the major and subtype is usually enough for the topology loader
+    // to resolve the topology.
+    CHECK_HR(hr = MFCreateMediaType(&pType));
+    CHECK_HR(hr = pType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video));
+    CHECK_HR(hr = pType->SetGUID(MF_MT_SUBTYPE, VideoFormat));
+    // Create the sample grabber sink.
+    CHECK_HR(hr = MFCreateSampleGrabberSinkActivate(pType, this, &pSinkActivate));
+    // To run as fast as possible, set this attribute (requires Windows 7):
+    CHECK_HR(hr = pSinkActivate->SetUINT32(MF_SAMPLEGRABBERSINK_IGNORE_CLOCK, TRUE));
+    // Create the Media Session.
+    CHECK_HR(hr = MFCreateMediaSession(NULL, &ig_pSession));
+    // Create the topology.
+    CHECK_HR(hr = CreateTopology(pSource, pSinkActivate, &ig_pTopology));
+done:
+    // Clean up.
+    if (FAILED(hr))
+    {
+        if (ig_pSession)
+        {
+            ig_pSession->Shutdown();
+        }
+        SafeRelease(&ig_pSession);
+        SafeRelease(&ig_pTopology);
+    }
+    SafeRelease(&pSinkActivate);
+    SafeRelease(&pType);
+    return hr;
+}
+void ImageGrabber::stopGrabbing()
+{
+    if(ig_pSession)
+        ig_pSession->Stop();
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: Stopping of of grabbing of images\n", ig_DeviceID);
+}
+HRESULT ImageGrabber::startGrabbing(void)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    IMFMediaEvent *pEvent = NULL;
+    PROPVARIANT var;
+    PropVariantInit(&var);
+    HRESULT hr = S_OK;
+    CHECK_HR(hr = ig_pSession->SetTopology(0, ig_pTopology));
+    CHECK_HR(hr = ig_pSession->Start(&GUID_NULL, &var));
+    DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: Start Grabbing of the images\n", ig_DeviceID);
+    for(;;)
+    {
+        HRESULT hrStatus = S_OK;
+        MediaEventType met;
+        if(!ig_pSession) break;
+        hr = ig_pSession->GetEvent(0, &pEvent);
+        if(!SUCCEEDED(hr))
+        {
+            hr = S_OK;
+            goto done;
+        }
+        hr = pEvent->GetStatus(&hrStatus);
+        if(!SUCCEEDED(hr))
+        {
+            hr = S_OK;
+            goto done;
+        }
+        hr = pEvent->GetType(&met);
+        if(!SUCCEEDED(hr))
+        {
+            hr = S_OK;
+            goto done;
+        }
+        if (met == MESessionEnded)
+        {
+            DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: MESessionEnded \n", ig_DeviceID);
+            ig_pSession->Stop();
+            break;
+        }
+        if (met == MESessionStopped)
+        {
+            DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: MESessionStopped \n", ig_DeviceID);
+            break;
+        }
+        if (met == MEVideoCaptureDeviceRemoved)
+        {
+            DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: MEVideoCaptureDeviceRemoved \n", ig_DeviceID);
+            break;
+        }
+        SafeRelease(&pEvent);
+    }
+    DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: Finish startGrabbing \n", ig_DeviceID);
+done:
+    SafeRelease(&pEvent);
+    SafeRelease(&ig_pSession);
+    SafeRelease(&ig_pTopology);
+    return hr;
+}
+HRESULT ImageGrabber::CreateTopology(IMFMediaSource *pSource, IMFActivate *pSinkActivate, IMFTopology **ppTopo)
+{
+    IMFTopology *pTopology = NULL;
+    IMFPresentationDescriptor *pPD = NULL;
+    IMFStreamDescriptor *pSD = NULL;
+    IMFMediaTypeHandler *pHandler = NULL;
+    IMFTopologyNode *pNode1 = NULL;
+    IMFTopologyNode *pNode2 = NULL;
+    HRESULT hr = S_OK;
+    DWORD cStreams = 0;
+    CHECK_HR(hr = MFCreateTopology(&pTopology));
+    CHECK_HR(hr = pSource->CreatePresentationDescriptor(&pPD));
+    CHECK_HR(hr = pPD->GetStreamDescriptorCount(&cStreams));
+    for (DWORD i = 0; i < cStreams; i++)
+    {
+        // In this example, we look for audio streams and connect them to the sink.
+        BOOL fSelected = FALSE;
+        GUID majorType;
+        CHECK_HR(hr = pPD->GetStreamDescriptorByIndex(i, &fSelected, &pSD));
+        CHECK_HR(hr = pSD->GetMediaTypeHandler(&pHandler));
+        CHECK_HR(hr = pHandler->GetMajorType(&majorType));
+        if (majorType == MFMediaType_Video && fSelected)
+        {
+            CHECK_HR(hr = AddSourceNode(pTopology, pSource, pPD, pSD, &pNode1));
+            CHECK_HR(hr = AddOutputNode(pTopology, pSinkActivate, 0, &pNode2));
+            CHECK_HR(hr = pNode1->ConnectOutput(0, pNode2, 0));
+            break;
+        }
+        else
+        {
+            CHECK_HR(hr = pPD->DeselectStream(i));
+        }
+        SafeRelease(&pSD);
+        SafeRelease(&pHandler);
+    }
+    *ppTopo = pTopology;
+    (*ppTopo)->AddRef();
+done:
+    SafeRelease(&pTopology);
+    SafeRelease(&pNode1);
+    SafeRelease(&pNode2);
+    SafeRelease(&pPD);
+    SafeRelease(&pSD);
+    SafeRelease(&pHandler);
+    return hr;
+}
+HRESULT ImageGrabber::AddSourceNode(
+    IMFTopology *pTopology,           // Topology.
+    IMFMediaSource *pSource,          // Media source.
+    IMFPresentationDescriptor *pPD,   // Presentation descriptor.
+    IMFStreamDescriptor *pSD,         // Stream descriptor.
+    IMFTopologyNode **ppNode)         // Receives the node pointer.
+{
+    IMFTopologyNode *pNode = NULL;
+    HRESULT hr = S_OK;
+    CHECK_HR(hr = MFCreateTopologyNode(MF_TOPOLOGY_SOURCESTREAM_NODE, &pNode));
+    CHECK_HR(hr = pNode->SetUnknown(MF_TOPONODE_SOURCE, pSource));
+    CHECK_HR(hr = pNode->SetUnknown(MF_TOPONODE_PRESENTATION_DESCRIPTOR, pPD));
+    CHECK_HR(hr = pNode->SetUnknown(MF_TOPONODE_STREAM_DESCRIPTOR, pSD));
+    CHECK_HR(hr = pTopology->AddNode(pNode));
+    // Return the pointer to the caller.
+    *ppNode = pNode;
+    (*ppNode)->AddRef();
+done:
+    SafeRelease(&pNode);
+    return hr;
+}
+HRESULT ImageGrabber::AddOutputNode(
+    IMFTopology *pTopology,     // Topology.
+    IMFActivate *pActivate,     // Media sink activation object.
+    DWORD dwId,                 // Identifier of the stream sink.
+    IMFTopologyNode **ppNode)   // Receives the node pointer.
+{
+    IMFTopologyNode *pNode = NULL;
+    HRESULT hr = S_OK;
+    CHECK_HR(hr = MFCreateTopologyNode(MF_TOPOLOGY_OUTPUT_NODE, &pNode));
+    CHECK_HR(hr = pNode->SetObject(pActivate));
+    CHECK_HR(hr = pNode->SetUINT32(MF_TOPONODE_STREAMID, dwId));
+    CHECK_HR(hr = pNode->SetUINT32(MF_TOPONODE_NOSHUTDOWN_ON_REMOVE, FALSE));
+    CHECK_HR(hr = pTopology->AddNode(pNode));
+    // Return the pointer to the caller.
+    *ppNode = pNode;
+    (*ppNode)->AddRef();
+done:
+    SafeRelease(&pNode);
+    return hr;
+}
+HRESULT ImageGrabber::CreateInstance(ImageGrabber **ppIG, unsigned int deviceID)
+{
+    *ppIG = new (std::nothrow) ImageGrabber(deviceID);
+    if (ppIG == NULL)
+    {
+        return E_OUTOFMEMORY;
+    }
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: Creating instance of ImageGrabber\n", deviceID);
+    return S_OK;
+}
+STDMETHODIMP ImageGrabber::QueryInterface(REFIID riid, void** ppv)
+{
+    HRESULT hr = E_NOINTERFACE;
+    *ppv = NULL;
+    if(riid == IID_IUnknown || riid == IID_IMFSampleGrabberSinkCallback)
+    {
+        *ppv = static_cast<IMFSampleGrabberSinkCallback *>(this);
+        hr = S_OK;
+    }
+    if(riid == IID_IMFClockStateSink)
+    {
+        *ppv = static_cast<IMFClockStateSink *>(this);
+        hr = S_OK;
+    }
+    if(SUCCEEDED(hr))
+    {
+        reinterpret_cast<IUnknown *>(*ppv)->AddRef();
+    }
+    return hr;
+}
+STDMETHODIMP_(ULONG) ImageGrabber::AddRef()
+{
+    return InterlockedIncrement(&m_cRef);
+}
+STDMETHODIMP_(ULONG) ImageGrabber::Release()
+{
+    ULONG cRef = InterlockedDecrement(&m_cRef);
+    if (cRef == 0)
+    {
+        delete this;
+    }
+    return cRef;
+}
+STDMETHODIMP ImageGrabber::OnClockStart(MFTIME hnsSystemTime, LONGLONG llClockStartOffset)
+{
+    (void)hnsSystemTime;
+    (void)llClockStartOffset;
+    return S_OK;
+}
+STDMETHODIMP ImageGrabber::OnClockStop(MFTIME hnsSystemTime)
+{
+    (void)hnsSystemTime;
+    return S_OK;
+}
+STDMETHODIMP ImageGrabber::OnClockPause(MFTIME hnsSystemTime)
+{
+    (void)hnsSystemTime;
+    return S_OK;
+}
+STDMETHODIMP ImageGrabber::OnClockRestart(MFTIME hnsSystemTime)
+{
+    (void)hnsSystemTime;
+    return S_OK;
+}
+STDMETHODIMP ImageGrabber::OnClockSetRate(MFTIME hnsSystemTime, float flRate)
+{
+    (void)flRate;
+    (void)hnsSystemTime;
+    return S_OK;
+}
+STDMETHODIMP ImageGrabber::OnSetPresentationClock(IMFPresentationClock* pClock)
+{
+    (void)pClock;
+    return S_OK;
+}
+STDMETHODIMP ImageGrabber::OnProcessSample(REFGUID guidMajorMediaType, DWORD dwSampleFlags,
+    LONGLONG llSampleTime, LONGLONG llSampleDuration, const BYTE * pSampleBuffer,
+    DWORD dwSampleSize)
+{
+    (void)guidMajorMediaType;
+    (void)llSampleTime;
+    (void)dwSampleFlags;
+    (void)llSampleDuration;
+    (void)dwSampleSize;
+    if(ig_RIE)
+    {
+        ig_RIFirst->fastCopy(pSampleBuffer);
+        ig_RIOut = ig_RIFirst;
+    }
+    else
+    {
+        ig_RISecond->fastCopy(pSampleBuffer);
+        ig_RIOut = ig_RISecond;
+    }
+    ig_RIE = !ig_RIE;
+    return S_OK;
+}
+STDMETHODIMP ImageGrabber::OnShutdown()
+{
+    return S_OK;
+}
+RawImage *ImageGrabber::getRawImage()
+{
+    return ig_RIOut;
+}
+DWORD WINAPI MainThreadFunction( LPVOID lpParam )
+{
+    ImageGrabberThread *pIGT = (ImageGrabberThread *)lpParam;
+    pIGT->run();
+    return 0;
+}
+HRESULT ImageGrabberThread::CreateInstance(ImageGrabberThread **ppIGT, IMFMediaSource *pSource, unsigned int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    *ppIGT = new (std::nothrow) ImageGrabberThread(pSource, deviceID);
+    if (ppIGT == NULL)
+    {
+        DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: Memory cannot be allocated\n", deviceID);
+        return E_OUTOFMEMORY;
+    }
+    else
+        DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: Creating of the instance of ImageGrabberThread\n", deviceID);
+    return S_OK;
+}
+ImageGrabberThread::ImageGrabberThread(IMFMediaSource *pSource, unsigned int deviceID): igt_Handle(NULL), igt_stop(false)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    HRESULT hr = ImageGrabber::CreateInstance(&igt_pImageGrabber, deviceID);
+    igt_DeviceID = deviceID;
+    if(SUCCEEDED(hr))
+    {
+        hr = igt_pImageGrabber->initImageGrabber(pSource, MFVideoFormat_RGB24);
+        if(!SUCCEEDED(hr))
+        {
+            DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: There is a problem with initialization of the instance of the ImageGrabber class\n", deviceID);
+        }
+        else
+        {
+            DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: Initialization of instance of the ImageGrabber class\n", deviceID);
+        }
+    }
+    else
+    {
+        DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i There is a problem with creation of the instance of the ImageGrabber class\n", deviceID);
+    }
+}
+void ImageGrabberThread::setEmergencyStopEvent(void *userData, void(*func)(int, void *))
+{
+    if(func)
+    {
+        igt_func = func;
+        igt_userData = userData;
+    }
+}
+ImageGrabberThread::~ImageGrabberThread(void)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: Destroing ImageGrabberThread\n", igt_DeviceID);
+    delete igt_pImageGrabber;
+}
+void ImageGrabberThread::stop()
+{
+    igt_stop = true;
+    if(igt_pImageGrabber)
+    {
+        igt_pImageGrabber->stopGrabbing();
+    }
+}
+void ImageGrabberThread::start()
+{
+    igt_Handle = CreateThread(
+            NULL,                   // default security attributes
+            0,                      // use default stack size
+            MainThreadFunction,       // thread function name
+            this,          // argument to thread function
+            0,                      // use default creation flags
+            &igt_ThreadIdArray);   // returns the thread identifier
+}
+void ImageGrabberThread::run()
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if(igt_pImageGrabber)
+    {
+        DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: Thread for grabbing images is started\n", igt_DeviceID);
+        HRESULT hr = igt_pImageGrabber->startGrabbing();
+        if(!SUCCEEDED(hr))
+        {
+            DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: There is a problem with starting the process of grabbing\n", igt_DeviceID);
+        }
+    }
+    else
+    {
+        DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i The thread is finished without execution of grabbing\n", igt_DeviceID);
+    }
+    if(!igt_stop)
+    {
+        DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: Emergency Stop thread\n", igt_DeviceID);
+        if(igt_func)
+        {
+            igt_func(igt_DeviceID, igt_userData);
+        }
+    }
+    else
+        DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: Finish thread\n", igt_DeviceID);
+}
+ImageGrabber *ImageGrabberThread::getImageGrabber()
+{
+    return igt_pImageGrabber;
+}
+Media_Foundation::Media_Foundation(void)
+{
+    HRESULT hr = MFStartup(MF_VERSION);
+    if(!SUCCEEDED(hr))
+    {
+        DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+        DPO->printOut(L"MEDIA FOUNDATION: It cannot be created!!!\n");
+    }
+}
+Media_Foundation::~Media_Foundation(void)
+{
+    HRESULT hr = MFShutdown();
+    if(!SUCCEEDED(hr))
+    {
+        DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+        DPO->printOut(L"MEDIA FOUNDATION: Resources cannot be released\n");
+    }
+}
+bool Media_Foundation::buildListOfDevices()
+{
+    HRESULT hr = S_OK;
+    IMFAttributes *pAttributes = NULL;
+    CoInitialize(NULL);
+    hr = MFCreateAttributes(&pAttributes, 1);
+    if (SUCCEEDED(hr))
+    {
+        hr = pAttributes->SetGUID(
+            MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE,
+            MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_VIDCAP_GUID
+            );
+    }
+    if (SUCCEEDED(hr))
+    {
+        videoDevices *vDs = &videoDevices::getInstance();
+        hr = vDs->initDevices(pAttributes);
+    }
+    else
+    {
+       DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+       DPO->printOut(L"MEDIA FOUNDATION: The access to the video cameras denied\n");
+    }
+    SafeRelease(&pAttributes);
+    return (SUCCEEDED(hr));
+}
+Media_Foundation& Media_Foundation::getInstance()
+{
+    static Media_Foundation instance;
+    return instance;
+}
+RawImage::RawImage(unsigned int size): ri_new(false), ri_pixels(NULL)
+{
+    ri_size = size;
+    ri_pixels = new unsigned char[size];
+    memset((void *)ri_pixels,0,ri_size);
+}
+bool RawImage::isNew()
+{
+    return ri_new;
+}
+unsigned int RawImage::getSize()
+{
+    return ri_size;
+}
+RawImage::~RawImage(void)
+{
+    delete []ri_pixels;
+    ri_pixels = NULL;
+}
+long RawImage::CreateInstance(RawImage **ppRImage,unsigned int size)
+{
+    *ppRImage = new (std::nothrow) RawImage(size);
+    if (ppRImage == NULL)
+    {
+        return E_OUTOFMEMORY;
+    }
+    return S_OK;
+}
+void RawImage::setCopy(const BYTE * pSampleBuffer)
+{
+    memcpy(ri_pixels, pSampleBuffer, ri_size);
+    ri_new = true;
+}
+void RawImage::fastCopy(const BYTE * pSampleBuffer)
+{
+    memcpy(ri_pixels, pSampleBuffer, ri_size);
+    ri_new = true;
+}
+unsigned char * RawImage::getpPixels()
+{
+    ri_new = false;
+    return ri_pixels;
+}
+videoDevice::videoDevice(void): vd_IsSetuped(false), vd_LockOut(OpenLock), vd_pFriendlyName(NULL),
+    vd_Width(0), vd_Height(0), vd_pSource(NULL), vd_func(NULL), vd_userData(NULL)
+{
+}
+void videoDevice::setParametrs(CamParametrs parametrs)
+{
+    if(vd_IsSetuped)
+    {
+        if(vd_pSource)
+        {
+            Parametr *pParametr = (Parametr *)(&parametrs);
+            Parametr *pPrevParametr = (Parametr *)(&vd_PrevParametrs);
+            IAMVideoProcAmp *pProcAmp = NULL;
+            HRESULT hr = vd_pSource->QueryInterface(IID_PPV_ARGS(&pProcAmp));
+            if (SUCCEEDED(hr))
+            {
+                for(unsigned int i = 0; i < 10; i++)
+                {
+                    if(pPrevParametr[i].CurrentValue != pParametr[i].CurrentValue || pPrevParametr[i].Flag != pParametr[i].Flag)
+                        hr = pProcAmp->Set(VideoProcAmp_Brightness + i, pParametr[i].CurrentValue, pParametr[i].Flag);
+                }
+                pProcAmp->Release();
+            }
+            IAMCameraControl *pProcControl = NULL;
+            hr = vd_pSource->QueryInterface(IID_PPV_ARGS(&pProcControl));
+            if (SUCCEEDED(hr))
+            {
+                for(unsigned int i = 0; i < 7; i++)
+                {
+                    if(pPrevParametr[10 + i].CurrentValue != pParametr[10 + i].CurrentValue || pPrevParametr[10 + i].Flag != pParametr[10 + i].Flag)
+                    hr = pProcControl->Set(CameraControl_Pan+i, pParametr[10 + i].CurrentValue, pParametr[10 + i].Flag);
+                }
+                pProcControl->Release();
+            }
+            vd_PrevParametrs = parametrs;
+        }
+    }
+}
+CamParametrs videoDevice::getParametrs()
+{
+    CamParametrs out;
+    if(vd_IsSetuped)
+    {
+        if(vd_pSource)
+        {
+            Parametr *pParametr = (Parametr *)(&out);
+            IAMVideoProcAmp *pProcAmp = NULL;
+            HRESULT hr = vd_pSource->QueryInterface(IID_PPV_ARGS(&pProcAmp));
+            if (SUCCEEDED(hr))
+            {
+                for(unsigned int i = 0; i < 10; i++)
+                {
+                    Parametr temp;
+                    hr = pProcAmp->GetRange(VideoProcAmp_Brightness+i, &temp.Min, &temp.Max, &temp.Step, &temp.Default, &temp.Flag);
+                    if (SUCCEEDED(hr))
+                    {
+                        temp.CurrentValue = temp.Default;
+                        pParametr[i] = temp;
+                    }
+                }
+                pProcAmp->Release();
+            }
+            IAMCameraControl *pProcControl = NULL;
+            hr = vd_pSource->QueryInterface(IID_PPV_ARGS(&pProcControl));
+            if (SUCCEEDED(hr))
+            {
+                for(unsigned int i = 0; i < 7; i++)
+                {
+                    Parametr temp;
+                    hr = pProcControl->GetRange(CameraControl_Pan+i, &temp.Min, &temp.Max, &temp.Step, &temp.Default, &temp.Flag);
+                    if (SUCCEEDED(hr))
+                    {
+                        temp.CurrentValue = temp.Default;
+                        pParametr[10 + i] = temp;
+                    }
+                }
+                pProcControl->Release();
+            }
+        }
+    }
+    return out;
+}
+long videoDevice::resetDevice(IMFActivate *pActivate)
+{
+    HRESULT hr = -1;
+    vd_CurrentFormats.clear();
+    if(vd_pFriendlyName)
+        CoTaskMemFree(vd_pFriendlyName);
+    vd_pFriendlyName = NULL;
+    if(pActivate)
+    {
+        IMFMediaSource *pSource = NULL;
+        hr = pActivate->GetAllocatedString(
+                MF_DEVSOURCE_ATTRIBUTE_FRIENDLY_NAME,
+                &vd_pFriendlyName,
+                NULL
+                );
+        hr = pActivate->ActivateObject(
+            __uuidof(IMFMediaSource),
+            (void**)&pSource
+            );
+        enumerateCaptureFormats(pSource);
+        buildLibraryofTypes();
+        SafeRelease(&pSource);
+        if(FAILED(hr))
+        {
+            vd_pFriendlyName = NULL;
+            DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+            DPO->printOut(L"VIDEODEVICE %i: IMFMediaSource interface cannot be created \n", vd_CurrentNumber);
+        }
+    }
+    return hr;
+}
+long videoDevice::readInfoOfDevice(IMFActivate *pActivate, unsigned int Num)
+{
+    HRESULT hr = -1;
+    vd_CurrentNumber = Num;
+    hr = resetDevice(pActivate);
+    return hr;
+}
+long videoDevice::checkDevice(IMFAttributes *pAttributes, IMFActivate **pDevice)
+{
+    HRESULT hr = S_OK;
+    IMFActivate **ppDevices = NULL;
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    UINT32 count;
+    wchar_t *newFriendlyName = NULL;
+    hr = MFEnumDeviceSources(pAttributes, &ppDevices, &count);
+    if (SUCCEEDED(hr))
+    {
+        if(count > 0)
+        {
+            if(count > vd_CurrentNumber)
+            {
+                hr = ppDevices[vd_CurrentNumber]->GetAllocatedString(
+                MF_DEVSOURCE_ATTRIBUTE_FRIENDLY_NAME,
+                &newFriendlyName,
+                NULL
+                );
+                if (SUCCEEDED(hr))
+                {
+                    if(wcscmp(newFriendlyName, vd_pFriendlyName) != 0)
+                    {
+                        DPO->printOut(L"VIDEODEVICE %i: Chosen device cannot be found \n", vd_CurrentNumber);
+                        hr = -1;
+                        pDevice = NULL;
+                    }
+                    else
+                    {
+                        *pDevice = ppDevices[vd_CurrentNumber];
+                        (*pDevice)->AddRef();
+                    }
+                }
+                else
+                {
+                    DPO->printOut(L"VIDEODEVICE %i: Name of device cannot be gotten \n", vd_CurrentNumber);
+                }
+            }
+            else
+            {
+                DPO->printOut(L"VIDEODEVICE %i: Number of devices more than corrent number of the device \n", vd_CurrentNumber);
+                hr = -1;
+            }
+            for(UINT32 i = 0; i < count; i++)
+            {
+                SafeRelease(&ppDevices[i]);
+            }
+            SafeRelease(ppDevices);
+        }
+        else
+            hr = -1;
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE %i: List of DeviceSources cannot be enumerated \n", vd_CurrentNumber);
+    }
+    return hr;
+}
+long videoDevice::initDevice()
+{
+    HRESULT hr = -1;
+    IMFAttributes *pAttributes = NULL;
+    IMFActivate * vd_pActivate= NULL;
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    CoInitialize(NULL);
+    hr = MFCreateAttributes(&pAttributes, 1);
+    if (SUCCEEDED(hr))
+    {
+        hr = pAttributes->SetGUID(
+            MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE,
+            MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_VIDCAP_GUID
+            );
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = checkDevice(pAttributes, &vd_pActivate);
+        if (SUCCEEDED(hr) && vd_pActivate)
+        {
+            SafeRelease(&vd_pSource);
+            hr = vd_pActivate->ActivateObject(
+                __uuidof(IMFMediaSource),
+                (void**)&vd_pSource
+                );
+            if (SUCCEEDED(hr))
+            {
+            }
+            SafeRelease(&vd_pActivate);
+        }
+        else
+        {
+            DPO->printOut(L"VIDEODEVICE %i: Device there is not \n", vd_CurrentNumber);
+        }
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE %i: The attribute of video cameras cannot be getting \n", vd_CurrentNumber);
+    }
+    SafeRelease(&pAttributes);
+    return hr;
+}
+MediaType videoDevice::getFormat(unsigned int id)
+{
+    if(id < vd_CurrentFormats.size())
+    {
+        return vd_CurrentFormats[id];
+    }
+    else return MediaType();
+}
+int videoDevice::getCountFormats()
+{
+    return vd_CurrentFormats.size();
+}
+void videoDevice::setEmergencyStopEvent(void *userData, void(*func)(int, void *))
+{
+    vd_func = func;
+    vd_userData = userData;
+}
+void videoDevice::closeDevice()
+{
+    if(vd_IsSetuped)
+    {
+        vd_IsSetuped = false;
+        vd_pSource->Stop();
+        SafeRelease(&vd_pSource);
+        if(vd_LockOut == RawDataLock)
+        {
+            vd_pImGrTh->stop();
+            Sleep(500);
+            delete vd_pImGrTh;
+        }
+        vd_pImGrTh = NULL;
+        vd_LockOut = OpenLock;
+        DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+        DPO->printOut(L"VIDEODEVICE %i: Device is stopped \n", vd_CurrentNumber);
+    }
+}
+unsigned int videoDevice::getWidth()
+{
+    if(vd_IsSetuped)
+        return vd_Width;
+    else
+        return 0;
+}
+unsigned int videoDevice::getHeight()
+{
+    if(vd_IsSetuped)
+        return vd_Height;
+    else
+        return 0;
+}
+IMFMediaSource *videoDevice::getMediaSource()
+{
+    IMFMediaSource *out = NULL;
+    if(vd_LockOut == OpenLock)
+    {
+        vd_LockOut = MediaSourceLock;
+        out = vd_pSource;
+    }
+    return out;
+}
+int videoDevice::findType(unsigned int size, unsigned int frameRate)
+{
+    if(vd_CaptureFormats.size() == 0)
+        return 0;
+    FrameRateMap FRM = vd_CaptureFormats[size];
+    if(FRM.size() == 0)
+        return 0;
+    UINT64 frameRateMax = 0;  SUBTYPEMap STMMax;
+    if(frameRate == 0)
+    {
+        std::map<UINT64, SUBTYPEMap>::iterator f = FRM.begin();
+        for(; f != FRM.end(); f++)
+        {
+             if((*f).first >= frameRateMax)
+             {
+                 frameRateMax = (*f).first;
+                 STMMax = (*f).second;
+             }
+        }
+    }
+    else
+    {
+        std::map<UINT64, SUBTYPEMap>::iterator f = FRM.begin();
+        for(; f != FRM.end(); f++)
+        {
+             if((*f).first >= frameRateMax)
+             {
+                 if(frameRate > (*f).first)
+                 {
+                     frameRateMax = (*f).first;
+                     STMMax = (*f).second;
+                 }
+             }
+        }
+    }
+    if(STMMax.size() == 0)
+        return 0;
+    std::map<String, vectorNum>::iterator S = STMMax.begin();
+    vectorNum VN = (*S).second;
+    if(VN.size() == 0)
+        return 0;
+    return VN[0];
+}
+void videoDevice::buildLibraryofTypes()
+{
+    unsigned int size;
+    unsigned int framerate;
+    std::vector<MediaType>::iterator i = vd_CurrentFormats.begin();
+    int count = 0;
+    for(; i != vd_CurrentFormats.end(); i++)
+    {
+        size = (*i).MF_MT_FRAME_SIZE;
+        framerate = (*i).MF_MT_FRAME_RATE;
+        FrameRateMap FRM = vd_CaptureFormats[size];
+        SUBTYPEMap STM = FRM[framerate];
+        String subType((*i).pMF_MT_SUBTYPEName);
+        vectorNum VN = STM[subType];
+        VN.push_back(count);
+        STM[subType] = VN;
+        FRM[framerate] = STM;
+        vd_CaptureFormats[size] = FRM;
+        count++;
+    }
+}
+long videoDevice::setDeviceFormat(IMFMediaSource *pSource, unsigned long  dwFormatIndex)
+{
+    IMFPresentationDescriptor *pPD = NULL;
+    IMFStreamDescriptor *pSD = NULL;
+    IMFMediaTypeHandler *pHandler = NULL;
+    IMFMediaType *pType = NULL;
+    HRESULT hr = pSource->CreatePresentationDescriptor(&pPD);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    BOOL fSelected;
+    hr = pPD->GetStreamDescriptorByIndex(0, &fSelected, &pSD);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    hr = pSD->GetMediaTypeHandler(&pHandler);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    hr = pHandler->GetMediaTypeByIndex((DWORD)dwFormatIndex, &pType);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    hr = pHandler->SetCurrentMediaType(pType);
+done:
+    SafeRelease(&pPD);
+    SafeRelease(&pSD);
+    SafeRelease(&pHandler);
+    SafeRelease(&pType);
+    return hr;
+}
+bool videoDevice::isDeviceSetup()
+{
+    return vd_IsSetuped;
+}
+RawImage * videoDevice::getRawImageOut()
+{
+    if(!vd_IsSetuped) return NULL;
+    if(vd_pImGrTh)
+            return vd_pImGrTh->getImageGrabber()->getRawImage();
+    else
+    {
+        DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+        DPO->printOut(L"VIDEODEVICE %i: The instance of ImageGrabberThread class does not exist  \n", vd_CurrentNumber);
+    }
+    return NULL;
+}
+bool videoDevice::isFrameNew()
+{
+    if(!vd_IsSetuped) return false;
+    if(vd_LockOut == RawDataLock || vd_LockOut == OpenLock)
+    {
+        if(vd_LockOut == OpenLock)
+        {
+            vd_LockOut = RawDataLock;
+            HRESULT hr = ImageGrabberThread::CreateInstance(&vd_pImGrTh, vd_pSource, vd_CurrentNumber);
+            if(FAILED(hr))
+            {
+                DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+                DPO->printOut(L"VIDEODEVICE %i: The instance of ImageGrabberThread class cannot be created.\n", vd_CurrentNumber);
+                return false;
+            }
+            vd_pImGrTh->setEmergencyStopEvent(vd_userData, vd_func);
+            vd_pImGrTh->start();
+            return true;
+        }
+        if(vd_pImGrTh)
+            return vd_pImGrTh->getImageGrabber()->getRawImage()->isNew();
+    }
+    return false;
+}
+bool videoDevice::isDeviceMediaSource()
+{
+    if(vd_LockOut == MediaSourceLock) return true;
+    return false;
+}
+bool videoDevice::isDeviceRawDataSource()
+{
+    if(vd_LockOut == RawDataLock) return true;
+    return false;
+}
+bool videoDevice::setupDevice(unsigned int id)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if(!vd_IsSetuped)
+    {
+        HRESULT hr = -1;
+        hr = initDevice();
+        if(SUCCEEDED(hr))
+        {
+            vd_Width = vd_CurrentFormats[id].width;
+            vd_Height = vd_CurrentFormats[id].height;
+            hr = setDeviceFormat(vd_pSource, (DWORD) id);
+            vd_IsSetuped = (SUCCEEDED(hr));
+            if(vd_IsSetuped)
+                DPO->printOut(L"\n\nVIDEODEVICE %i: Device is setuped \n", vd_CurrentNumber);
+            vd_PrevParametrs = getParametrs();
+            return vd_IsSetuped;
+        }
+        else
+        {
+            DPO->printOut(L"VIDEODEVICE %i: Interface IMFMediaSource cannot be got \n", vd_CurrentNumber);
+            return false;
+        }
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Device is setuped already \n", vd_CurrentNumber);
+        return false;
+    }
+}
+bool videoDevice::setupDevice(unsigned int w, unsigned int h, unsigned int idealFramerate)
+{
+    unsigned int id = findType(w * h, idealFramerate);
+    return setupDevice(id);
+}
+wchar_t *videoDevice::getName()
+{
+    return vd_pFriendlyName;
+}
+videoDevice::~videoDevice(void)
+{
+    closeDevice();
+    SafeRelease(&vd_pSource);
+    if(vd_pFriendlyName)
+        CoTaskMemFree(vd_pFriendlyName);
+}
+long videoDevice::enumerateCaptureFormats(IMFMediaSource *pSource)
+{
+    IMFPresentationDescriptor *pPD = NULL;
+    IMFStreamDescriptor *pSD = NULL;
+    IMFMediaTypeHandler *pHandler = NULL;
+    IMFMediaType *pType = NULL;
+    HRESULT hr = pSource->CreatePresentationDescriptor(&pPD);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    BOOL fSelected;
+    hr = pPD->GetStreamDescriptorByIndex(0, &fSelected, &pSD);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    hr = pSD->GetMediaTypeHandler(&pHandler);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    DWORD cTypes = 0;
+    hr = pHandler->GetMediaTypeCount(&cTypes);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    for (DWORD i = 0; i < cTypes; i++)
+    {
+        hr = pHandler->GetMediaTypeByIndex(i, &pType);
+        if (FAILED(hr))
+        {
+            goto done;
+        }
+        MediaType MT = FormatReader::Read(pType);
+        vd_CurrentFormats.push_back(MT);
+        SafeRelease(&pType);
+    }
+done:
+    SafeRelease(&pPD);
+    SafeRelease(&pSD);
+    SafeRelease(&pHandler);
+    SafeRelease(&pType);
+    return hr;
+}
+videoDevices::videoDevices(void): count(0)
+{}
+void videoDevices::clearDevices()
+{
+    std::vector<videoDevice *>::iterator i = vds_Devices.begin();
+    for(; i != vds_Devices.end(); ++i)
+        delete (*i);
+    vds_Devices.clear();
+}
+videoDevices::~videoDevices(void)
+{
+    clearDevices();
+}
+videoDevice * videoDevices::getDevice(unsigned int i)
+{
+    if(i >= vds_Devices.size())
+    {
+        return NULL;
+    }
+    if(i < 0)
+    {
+        return NULL;
+    }
+    return vds_Devices[i];
+}
+long videoDevices::initDevices(IMFAttributes *pAttributes)
+{
+    HRESULT hr = S_OK;
+    IMFActivate **ppDevices = NULL;
+    clearDevices();
+    hr = MFEnumDeviceSources(pAttributes, &ppDevices, &count);
+    if (SUCCEEDED(hr))
+    {
+        if(count > 0)
+        {
+            for(UINT32 i = 0; i < count; i++)
+            {
+                videoDevice *vd = new videoDevice;
+                vd->readInfoOfDevice(ppDevices[i], i);
+                vds_Devices.push_back(vd);
+                SafeRelease(&ppDevices[i]);
+            }
+            SafeRelease(ppDevices);
+        }
+        else
+            hr = -1;
+    }
+    else
+    {
+        DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+        DPO->printOut(L"VIDEODEVICES: The instances of the videoDevice class cannot be created\n");
+    }
+    return hr;
+}
+size_t videoDevices::getCount()
+{
+    return vds_Devices.size();
+}
+videoDevices& videoDevices::getInstance()
+{
+    static videoDevices instance;
+    return instance;
+}
+Parametr::Parametr()
+{
+    CurrentValue = 0;
+    Min = 0;
+    Max = 0;
+    Step = 0;
+    Default = 0;
+    Flag = 0;
+}
+MediaType::MediaType()
+{
+    pMF_MT_AM_FORMAT_TYPEName = NULL;
+    pMF_MT_MAJOR_TYPEName = NULL;
+    pMF_MT_SUBTYPEName = NULL;
+    Clear();
+}
+MediaType::~MediaType()
+{
+    Clear();
+}
+void MediaType::Clear()
+{
+    MF_MT_FRAME_SIZE = 0;
+    height = 0;
+    width = 0;
+    MF_MT_YUV_MATRIX = 0;
+    MF_MT_VIDEO_LIGHTING = 0;
+    MF_MT_DEFAULT_STRIDE = 0;
+    MF_MT_VIDEO_CHROMA_SITING = 0;
+    MF_MT_FIXED_SIZE_SAMPLES = 0;
+    MF_MT_VIDEO_NOMINAL_RANGE = 0;
+    MF_MT_FRAME_RATE = 0;
+    MF_MT_FRAME_RATE_low = 0;
+    MF_MT_PIXEL_ASPECT_RATIO = 0;
+    MF_MT_PIXEL_ASPECT_RATIO_low = 0;
+    MF_MT_ALL_SAMPLES_INDEPENDENT = 0;
+    MF_MT_FRAME_RATE_RANGE_MIN = 0;
+    MF_MT_FRAME_RATE_RANGE_MIN_low = 0;
+    MF_MT_SAMPLE_SIZE = 0;
+    MF_MT_VIDEO_PRIMARIES = 0;
+    MF_MT_INTERLACE_MODE = 0;
+    MF_MT_FRAME_RATE_RANGE_MAX = 0;
+    MF_MT_FRAME_RATE_RANGE_MAX_low = 0;
+    memset(&MF_MT_MAJOR_TYPE, 0, sizeof(GUID));
+    memset(&MF_MT_AM_FORMAT_TYPE, 0, sizeof(GUID));
+    memset(&MF_MT_SUBTYPE, 0, sizeof(GUID));
+}
+videoInput::videoInput(void): accessToDevices(false)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    DPO->printOut(L"\n***** VIDEOINPUT LIBRARY - 2013 (Author: Evgeny Pereguda) *****\n\n");
+    updateListOfDevices();
+    if(!accessToDevices)
+        DPO->printOut(L"INITIALIZATION: Ther is not any suitable video device\n");
+}
+void videoInput::updateListOfDevices()
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    Media_Foundation *MF = &Media_Foundation::getInstance();
+    accessToDevices = MF->buildListOfDevices();
+    if(!accessToDevices)
+        DPO->printOut(L"UPDATING: Ther is not any suitable video device\n");
+}
+videoInput::~videoInput(void)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    DPO->printOut(L"\n***** CLOSE VIDEOINPUT LIBRARY - 2013 *****\n\n");
+}
+IMFMediaSource *videoInput::getMediaSource(int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+        {
+            IMFMediaSource *out = VD->getMediaSource();
+            if(!out)
+                DPO->printOut(L"VideoDevice %i: There is not any suitable IMFMediaSource interface\n", deviceID);
+            return out;
+        }
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return NULL;
+}
+bool videoInput::setupDevice(int deviceID, unsigned int id)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0 )
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return false;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+        {
+            bool out = VD->setupDevice(id);
+            if(!out)
+                DPO->printOut(L"VIDEODEVICE %i: This device cannot be started\n", deviceID);
+            return out;
+        }
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return false;
+}
+bool videoInput::setupDevice(int deviceID, unsigned int w, unsigned int h, unsigned int idealFramerate)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0 )
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return false;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+        {
+            bool out = VD->setupDevice(w, h, idealFramerate);
+            if(!out)
+                DPO->printOut(L"VIDEODEVICE %i: this device cannot be started\n", deviceID);
+            return out;
+        }
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n", deviceID);
+    }
+    return false;
+}
+MediaType videoInput::getFormat(int deviceID, unsigned int id)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return MediaType();
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+            return VD->getFormat(id);
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return MediaType();
+}
+bool videoInput::isDeviceSetup(int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return false;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+            return VD->isDeviceSetup();
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return false;
+}
+bool videoInput::isDeviceMediaSource(int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return false;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+            return VD->isDeviceMediaSource();
+    }
+    else
+    {
+        DPO->printOut(L"Device(s): There is not any suitable video device\n");
+    }
+    return false;
+}
+bool videoInput::isDeviceRawDataSource(int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return false;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+        {
+            bool isRaw = VD->isDeviceRawDataSource();
+            return isRaw;
+        }
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return false;
+}
+bool videoInput::isFrameNew(int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return false;
+    }
+    if(accessToDevices)
+    {
+        if(!isDeviceSetup(deviceID))
+        {
+            if(isDeviceMediaSource(deviceID))
+                return false;
+        }
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+        {
+            return VD->isFrameNew();
+        }
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return false;
+}
+unsigned int videoInput::getCountFormats(int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return 0;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+            return VD->getCountFormats();
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return 0;
+}
+void videoInput::closeAllDevices()
+{
+    videoDevices *VDS = &videoDevices::getInstance();
+    for(unsigned int i = 0; i < VDS->getCount(); i++)
+        closeDevice(i);
+}
+void videoInput::setParametrs(int deviceID, CamParametrs parametrs)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice *VD = VDS->getDevice(deviceID);
+        if(VD)
+            VD->setParametrs(parametrs);
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+}
+CamParametrs videoInput::getParametrs(int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    CamParametrs out;
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return out;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice *VD = VDS->getDevice(deviceID);
+        if(VD)
+            out = VD->getParametrs();
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return out;
+}
+void videoInput::closeDevice(int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice *VD = VDS->getDevice(deviceID);
+        if(VD)
+            VD->closeDevice();
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+}
+unsigned int videoInput::getWidth(int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return 0;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+            return VD->getWidth();
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return 0;
+}
+unsigned int videoInput::getHeight(int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return 0;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+            return VD->getHeight();
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return 0;
+}
+wchar_t *videoInput::getNameVideoDevice(int deviceID)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return NULL;
+    }
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        videoDevice * VD = VDS->getDevice(deviceID);
+        if(VD)
+            return VD->getName();
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return L"Empty";
+}
+unsigned int videoInput::listDevices(bool silent)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    int out = 0;
+    if(accessToDevices)
+    {
+        videoDevices *VDS = &videoDevices::getInstance();
+        out = VDS->getCount();
+        DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+        if(!silent)DPO->printOut(L"\nVIDEOINPUT SPY MODE!\n\n");
+        if(!silent)DPO->printOut(L"SETUP: Looking For Capture Devices\n");
+        for(int i = 0; i < out; i++)
+        {
+            if(!silent)DPO->printOut(L"SETUP: %i) %s \n",i, getNameVideoDevice(i));
+        }
+        if(!silent)DPO->printOut(L"SETUP: %i Device(s) found\n\n", out);
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return out;
+}
+videoInput& videoInput::getInstance()
+{
+    static videoInput instance;
+    return instance;
+}
+bool videoInput::isDevicesAcceable()
+{
+    return accessToDevices;
+}
+void videoInput::setVerbose(bool state)
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    DPO->setVerbose(state);
+}
+void videoInput::setEmergencyStopEvent(int deviceID, void *userData, void(*func)(int, void *))
+{
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return;
+    }
+    if(accessToDevices)
+    {
+        if(func)
+        {
+            videoDevices *VDS = &videoDevices::getInstance();
+            videoDevice * VD = VDS->getDevice(deviceID);
+            if(VD)
+                VD->setEmergencyStopEvent(userData, func);
+        }
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+}
+bool videoInput::getPixels(int deviceID, unsigned char * dstBuffer, bool flipRedAndBlue, bool flipImage)
+{
+    bool success = false;
+    unsigned int bytes = 3;
+    DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+    if (deviceID < 0)
+    {
+        DPO->printOut(L"VIDEODEVICE %i: Invalid device ID\n", deviceID);
+        return success;
+    }
+    if(accessToDevices)
+    {
+        bool isRaw = isDeviceRawDataSource(deviceID);
+        if(isRaw)
+        {
+            videoDevices *VDS = &videoDevices::getInstance();
+            DebugPrintOut *DPO = &DebugPrintOut::getInstance();
+            RawImage *RIOut = VDS->getDevice(deviceID)->getRawImageOut();
+            if(RIOut)
+            {
+                unsigned int height = VDS->getDevice(deviceID)->getHeight();
+                unsigned int width  = VDS->getDevice(deviceID)->getWidth();
+                unsigned int size = bytes * width * height;
+                if(size == RIOut->getSize())
+                {
+                    processPixels(RIOut->getpPixels(), dstBuffer, width, height, bytes, flipRedAndBlue, flipImage);
+                    success = true;
+                }
+                else
+                {
+                    DPO->printOut(L"ERROR: GetPixels() - bufferSizes do not match!\n");
+                }
+            }
+            else
+            {
+                DPO->printOut(L"ERROR: GetPixels() - Unable to grab frame for device %i\n", deviceID);
+            }
+        }
+        else
+        {
+            DPO->printOut(L"ERROR: GetPixels() - Not raw data source device %i\n", deviceID);
+        }
+    }
+    else
+    {
+        DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
+    }
+    return success;
+}
+void videoInput::processPixels(unsigned char * src, unsigned char * dst, unsigned int width,
+                                unsigned int height, unsigned int bpp, bool bRGB, bool bFlip)
+{
+    unsigned int widthInBytes = width * bpp;
+    unsigned int numBytes = widthInBytes * height;
+    int *dstInt, *srcInt;
+    if(!bRGB)
+    {
+        if(bFlip)
+        {
+            for(unsigned int y = 0; y < height; y++)
+            {
+                dstInt = (int *)(dst + (y * widthInBytes));
+                srcInt = (int *)(src + ( (height -y -1) * widthInBytes));
+                memcpy(dstInt, srcInt, widthInBytes);
+            }
+        }
+        else
+        {
+            memcpy(dst, src, numBytes);
+        }
+    }
+    else
+    {
+        if(bFlip)
+        {
+            unsigned int x = 0;
+            unsigned int y = (height - 1) * widthInBytes;
+            src += y;
+            for(unsigned int i = 0; i < numBytes; i+=3)
+            {
+                if(x >= width)
+                {
+                    x = 0;
+                    src -= widthInBytes*2;
+                }
+                *dst = *(src+2);
+                dst++;
+                *dst = *(src+1);
+                dst++;
+                *dst = *src;
+                dst++;
+                src+=3;
+                x++;
+            }
+        }
+        else
+        {
+            for(unsigned int i = 0; i < numBytes; i+=3)
+            {
+                *dst = *(src+2);
+                dst++;
+                *dst = *(src+1);
+                dst++;
+                *dst = *src;
+                dst++;
+                src+=3;
+            }
+        }
+    }
+}
+}
+/******* Capturing video from camera via Microsoft Media Foundation **********/
+class CvCaptureCAM_MSMF : public CvCapture
+{
+public:
+    CvCaptureCAM_MSMF();
+    virtual ~CvCaptureCAM_MSMF();
+    virtual bool open( int index );
+    virtual void close();
+    virtual double getProperty(int);
+    virtual bool setProperty(int, double);
+    virtual bool grabFrame();
+    virtual IplImage* retrieveFrame(int);
+    virtual int getCaptureDomain() { return CV_CAP_MSMF; } // Return the type of the capture object: CV_CAP_VFW, etc...
+protected:
+    void init();
+    int index, width, height,fourcc;
+    int widthSet, heightSet;
+    IplImage* frame;
+    videoInput VI;
+};
+struct SuppressVideoInputMessages
+{
+    SuppressVideoInputMessages() { videoInput::setVerbose(true); }
+};
+static SuppressVideoInputMessages do_it;
+CvCaptureCAM_MSMF::CvCaptureCAM_MSMF():
+    index(-1),
+    width(-1),
+    height(-1),
+    fourcc(-1),
+    widthSet(-1),
+    heightSet(-1),
+    frame(0),
+    VI(videoInput::getInstance())
+{
+    CoInitialize(0);
+}
+CvCaptureCAM_MSMF::~CvCaptureCAM_MSMF()
+{
+    close();
+    CoUninitialize();
+}
+void CvCaptureCAM_MSMF::close()
+{
+    if( index >= 0 )
+    {
+        VI.closeDevice(index);
+        index = -1;
+        cvReleaseImage(&frame);
+    }
+    widthSet = heightSet = width = height = -1;
+}
+// Initialize camera input
+bool CvCaptureCAM_MSMF::open( int _index )
+{
+    int try_index = _index;
+    int devices = 0;
+    close();
+    devices = VI.listDevices(true);
+    if (devices == 0)
+        return false;
+    try_index = try_index < 0 ? 0 : (try_index > devices-1 ? devices-1 : try_index);
+    VI.setupDevice(try_index);
+    if( !VI.isFrameNew(try_index) )
+        return false;
+    index = try_index;
+    return true;
+}
+bool CvCaptureCAM_MSMF::grabFrame()
+{
+    return true;
+}
+IplImage* CvCaptureCAM_MSMF::retrieveFrame(int)
+{
+    if( !frame || (int)VI.getWidth(index) != frame->width || (int)VI.getHeight(index) != frame->height )
+    {
+        if (frame)
+            cvReleaseImage( &frame );
+        unsigned int w = VI.getWidth(index), h = VI.getHeight(index);
+        frame = cvCreateImage( cvSize(w,h), 8, 3 );
+    }
+    VI.getPixels( index, (uchar*)frame->imageData, false, true );
+    return frame;
+}
+double CvCaptureCAM_MSMF::getProperty( int property_id )
+{
+    // image format proprrties
+    switch( property_id )
+    {
+    case CV_CAP_PROP_FRAME_WIDTH:
+        return VI.getWidth(index);
+    case CV_CAP_PROP_FRAME_HEIGHT:
+        return VI.getHeight(index);
+    case CV_CAP_PROP_FOURCC:
+        // FIXME: implement method in VideoInput back end
+        //return VI.getFourcc(index);
+        ;
+    case CV_CAP_PROP_FPS:
+        // FIXME: implement method in VideoInput back end
+        //return VI.getFPS(index);
+        ;
+    }
+    // video filter properties
+    switch( property_id )
+    {
+    case CV_CAP_PROP_BRIGHTNESS:
+    case CV_CAP_PROP_CONTRAST:
+    case CV_CAP_PROP_HUE:
+    case CV_CAP_PROP_SATURATION:
+    case CV_CAP_PROP_SHARPNESS:
+    case CV_CAP_PROP_GAMMA:
+    case CV_CAP_PROP_MONOCROME:
+    case CV_CAP_PROP_WHITE_BALANCE_BLUE_U:
+    case CV_CAP_PROP_BACKLIGHT:
+    case CV_CAP_PROP_GAIN:
+        // FIXME: implement method in VideoInput back end
+        // if ( VI.getVideoSettingFilter(index, VI.getVideoPropertyFromCV(property_id), min_value,
+        //                               max_value, stepping_delta, current_value, flags,defaultValue) )
+        //     return (double)current_value;
+        return 0.;
+    }
+    // camera properties
+    switch( property_id )
+    {
+    case CV_CAP_PROP_PAN:
+    case CV_CAP_PROP_TILT:
+    case CV_CAP_PROP_ROLL:
+    case CV_CAP_PROP_ZOOM:
+    case CV_CAP_PROP_EXPOSURE:
+    case CV_CAP_PROP_IRIS:
+    case CV_CAP_PROP_FOCUS:
+    // FIXME: implement method in VideoInput back end
+    //     if (VI.getVideoSettingCamera(index,VI.getCameraPropertyFromCV(property_id),min_value,
+    //          max_value,stepping_delta,current_value,flags,defaultValue) ) return (double)current_value;
+        return 0.;
+    }
+    // unknown parameter or value not available
+    return -1;
+}
+bool CvCaptureCAM_MSMF::setProperty( int property_id, double value )
+{
+    // image capture properties
+    bool handled = false;
+    switch( property_id )
+    {
+    case CV_CAP_PROP_FRAME_WIDTH:
+        width = cvRound(value);
+        handled = true;
+        break;
+    case CV_CAP_PROP_FRAME_HEIGHT:
+        height = cvRound(value);
+        handled = true;
+        break;
+    case CV_CAP_PROP_FOURCC:
+        fourcc = (int)(unsigned long)(value);
+        if ( fourcc == -1 ) {
+            // following cvCreateVideo usage will pop up caprturepindialog here if fourcc=-1
+            // TODO - how to create a capture pin dialog
+        }
+        handled = true;
+        break;
+    case CV_CAP_PROP_FPS:
+        // FIXME: implement method in VideoInput back end
+        // int fps = cvRound(value);
+        // if (fps != VI.getFPS(index))
+        // {
+        //     VI.stopDevice(index);
+        //     VI.setIdealFramerate(index,fps);
+        //     if (widthSet > 0 && heightSet > 0)
+        //         VI.setupDevice(index, widthSet, heightSet);
+        //     else
+        //         VI.setupDevice(index);
+        // }
+        // return VI.isDeviceSetup(index);
+        ;
+    }
+    if ( handled ) {
+        // a stream setting
+        if( width > 0 && height > 0 )
+        {
+            if( width != (int)VI.getWidth(index) || height != (int)VI.getHeight(index) )//|| fourcc != VI.getFourcc(index) )
+            {
+                // FIXME: implement method in VideoInput back end
+                // int fps = static_cast<int>(VI.getFPS(index));
+                // VI.stopDevice(index);
+                // VI.setIdealFramerate(index, fps);
+                // VI.setupDeviceFourcc(index, width, height, fourcc);
+            }
+            bool success = VI.isDeviceSetup(index);
+            if (success)
+            {
+                widthSet = width;
+                heightSet = height;
+                width = height = fourcc = -1;
+            }
+            return success;
+        }
+        return true;
+    }
+    // show video/camera filter dialog
+    // FIXME: implement method in VideoInput back end
+    // if ( property_id == CV_CAP_PROP_SETTINGS ) {
+    //     VI.showSettingsWindow(index);
+    //     return true;
+    // }
+    //video Filter properties
+    switch( property_id )
+    {
+    case CV_CAP_PROP_BRIGHTNESS:
+    case CV_CAP_PROP_CONTRAST:
+    case CV_CAP_PROP_HUE:
+    case CV_CAP_PROP_SATURATION:
+    case CV_CAP_PROP_SHARPNESS:
+    case CV_CAP_PROP_GAMMA:
+    case CV_CAP_PROP_MONOCROME:
+    case CV_CAP_PROP_WHITE_BALANCE_BLUE_U:
+    case CV_CAP_PROP_BACKLIGHT:
+    case CV_CAP_PROP_GAIN:
+        // FIXME: implement method in VideoInput back end
+        //return VI.setVideoSettingFilter(index,VI.getVideoPropertyFromCV(property_id),(long)value);
+        ;
+    }
+    //camera properties
+    switch( property_id )
+    {
+    case CV_CAP_PROP_PAN:
+    case CV_CAP_PROP_TILT:
+    case CV_CAP_PROP_ROLL:
+    case CV_CAP_PROP_ZOOM:
+    case CV_CAP_PROP_EXPOSURE:
+    case CV_CAP_PROP_IRIS:
+    case CV_CAP_PROP_FOCUS:
+        // FIXME: implement method in VideoInput back end
+        //return VI.setVideoSettingCamera(index,VI.getCameraPropertyFromCV(property_id),(long)value);
+        ;
+    }
+    return false;
+}
+CvCapture* cvCreateCameraCapture_MSMF( int index )
+{
+    CvCaptureCAM_MSMF* capture = new CvCaptureCAM_MSMF;
+    try
+    {
+        if( capture->open( index ))
+            return capture;
+    }
+    catch(...)
+    {
+        delete capture;
+        throw;
+    }
+    delete capture;
+    return 0;
+}
+#endif
\ No newline at end of file
diff --git a/modules/highgui/src/cap_vfw.cpp b/modules/highgui/src/cap_vfw.cpp
index 4e6ff5e13a..d419a48912 100644
--- a/modules/highgui/src/cap_vfw.cpp
+++ b/modules/highgui/src/cap_vfw.cpp
@@ -406,7 +406,7 @@ bool CvCaptureCAM_VFW::open( int wIndex )
         fourcc = (DWORD)-1;
 
         memset( &caps, 0, sizeof(caps));
-        capDriverGetCaps( hWndC, &caps, sizeof(&caps));
+        capDriverGetCaps( hWndC, &caps, sizeof(caps));
         ::MoveWindow( hWndC, 0, 0, 320, 240, TRUE );
         capSetUserData( hWndC, (size_t)this );
         capSetCallbackOnFrame( hWndC, frameCallback );
diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp
index 463cd1685c..5062be73c5 100644
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@@ -103,14 +103,6 @@ struct CvVideoWriter
     virtual bool writeFrame(const IplImage*) { return false; }
 };
 
-#if defined WIN32 || defined _WIN32
-#define HAVE_VFW 1
-
-/* uncomment to enable CMUCamera1394 fireware camera module */
-//#define HAVE_CMU1394 1
-#endif
-
-
 CvCapture * cvCreateCameraCapture_V4L( int index );
 CvCapture * cvCreateCameraCapture_DC1394( int index );
 CvCapture * cvCreateCameraCapture_DC1394_2( int index );
@@ -126,6 +118,7 @@ CvVideoWriter* cvCreateVideoWriter_Win32( const char* filename, int fourcc,
 CvVideoWriter* cvCreateVideoWriter_VFW( const char* filename, int fourcc,
                                         double fps, CvSize frameSize, int is_color );
 CvCapture* cvCreateCameraCapture_DShow( int index );
+CvCapture* cvCreateCameraCapture_MSMF( int index );
 CvCapture* cvCreateCameraCapture_OpenNI( int index );
 CvCapture* cvCreateFileCapture_OpenNI( const char* filename );
 CvCapture* cvCreateCameraCapture_Android( int index );
diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp
index 5e499c28a2..8de6dbfdef 100644
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
@@ -57,7 +57,7 @@ CV_IMPL void cvSetWindowProperty(const char* name, int prop_id, double prop_valu
 
         #if defined (HAVE_QT)
             cvSetModeWindow_QT(name,prop_value);
-        #elif defined WIN32 || defined _WIN32
+        #elif defined(HAVE_WIN32UI)
             cvSetModeWindow_W32(name,prop_value);
         #elif defined (HAVE_GTK)
             cvSetModeWindow_GTK(name,prop_value);
@@ -96,7 +96,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
 
         #if defined (HAVE_QT)
             return cvGetModeWindow_QT(name);
-        #elif defined WIN32 || defined _WIN32
+        #elif defined(HAVE_WIN32UI)
             return cvGetModeWindow_W32(name);
         #elif defined (HAVE_GTK)
             return cvGetModeWindow_GTK(name);
@@ -113,7 +113,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
 
         #if defined (HAVE_QT)
             return cvGetPropWindow_QT(name);
-        #elif defined WIN32 || defined _WIN32
+        #elif defined(HAVE_WIN32UI)
             return cvGetPropWindowAutoSize_W32(name);
         #elif defined (HAVE_GTK)
             return cvGetPropWindowAutoSize_GTK(name);
@@ -126,7 +126,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
 
         #if defined (HAVE_QT)
             return cvGetRatioWindow_QT(name);
-        #elif defined WIN32 || defined _WIN32
+        #elif defined(HAVE_WIN32UI)
             return cvGetRatioWindow_W32(name);
         #elif defined (HAVE_GTK)
             return cvGetRatioWindow_GTK(name);
@@ -139,7 +139,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
 
         #if defined (HAVE_QT)
             return cvGetOpenGlProp_QT(name);
-        #elif defined WIN32 || defined _WIN32
+        #elif defined(HAVE_WIN32UI)
             return cvGetOpenGlProp_W32(name);
         #elif defined (HAVE_GTK)
             return cvGetOpenGlProp_GTK(name);
@@ -440,11 +440,11 @@ int cv::createButton(const String&, ButtonCallback, void*, int , bool )
 
 #endif
 
-#if   defined WIN32 || defined _WIN32         // see window_w32.cpp
+#if   defined(HAVE_WIN32UI)   // see window_w32.cpp
 #elif defined (HAVE_GTK)      // see window_gtk.cpp
-#elif defined (HAVE_COCOA)   // see window_carbon.cpp
+#elif defined (HAVE_COCOA)    // see window_carbon.cpp
 #elif defined (HAVE_CARBON)
-#elif defined (HAVE_QT) //YV see window_QT.cpp
+#elif defined (HAVE_QT)       //YV see window_QT.cpp
 
 #else
 
diff --git a/modules/highgui/test/test_ffmpeg.cpp b/modules/highgui/test/test_ffmpeg.cpp
index ca22ca0ddb..cd8356c517 100644
--- a/modules/highgui/test/test_ffmpeg.cpp
+++ b/modules/highgui/test/test_ffmpeg.cpp
@@ -176,7 +176,7 @@ TEST(Highgui_Video, ffmpeg_image) { CV_FFmpegReadImageTest test; test.safe_run()
 
 #endif
 
-#if defined(HAVE_FFMPEG) || defined(WIN32) || defined(_WIN32)
+#if defined(HAVE_FFMPEG)
 
 //////////////////////////////// Parallel VideoWriters and VideoCaptures ////////////////////////////////////
 
diff --git a/modules/highgui/test/test_gui.cpp b/modules/highgui/test/test_gui.cpp
index 9ca0aaf7cb..106a64b873 100644
--- a/modules/highgui/test/test_gui.cpp
+++ b/modules/highgui/test/test_gui.cpp
@@ -43,7 +43,7 @@
 #include "test_precomp.hpp"
 #include "opencv2/highgui.hpp"
 
-#if defined HAVE_GTK  || defined HAVE_QT || defined WIN32 || defined _WIN32 || defined HAVE_CARBON || defined HAVE_COCOA
+#if defined HAVE_GTK || defined HAVE_QT || defined HAVE_WIN32UI || defined HAVE_CARBON || defined HAVE_COCOA
 
 using namespace cv;
 using namespace std;
diff --git a/modules/highgui/test/test_precomp.hpp b/modules/highgui/test/test_precomp.hpp
index 92b76076c3..863cddcaf9 100644
--- a/modules/highgui/test/test_precomp.hpp
+++ b/modules/highgui/test/test_precomp.hpp
@@ -16,7 +16,7 @@
 
 #include "opencv2/core/private.hpp"
 
-#if defined(HAVE_VIDEOINPUT)   || \
+#if defined(HAVE_DSHOW)        || \
     defined(HAVE_TYZX)         || \
     defined(HAVE_VFW)          || \
     defined(HAVE_LIBV4L)       || \
@@ -32,7 +32,7 @@
     defined(HAVE_OPENNI)       || \
     defined(HAVE_XIMEA)        || \
     defined(HAVE_AVFOUNDATION) || \
-    defined(HAVE_GIGE_API) || \
+    defined(HAVE_GIGE_API)     || \
     (0)
     //defined(HAVE_ANDROID_NATIVE_CAMERA) ||   - enable after #1193
 #  define BUILD_WITH_CAMERA_SUPPORT 1
@@ -45,9 +45,7 @@
     defined(HAVE_QUICKTIME)    || \
     defined(HAVE_AVFOUNDATION) || \
     /*defined(HAVE_OPENNI)     || too specialized */ \
-    defined(HAVE_FFMPEG)       || \
-    defined(WIN32) /* assume that we have ffmpeg */
-
+    defined(HAVE_FFMPEG)
 #  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
 #else
 #  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
@@ -57,8 +55,7 @@
     defined(HAVE_GSTREAMER)    || \
     defined(HAVE_QUICKTIME)    || \
     defined(HAVE_AVFOUNDATION) || \
-    defined(HAVE_FFMPEG)       || \
-    defined(WIN32) /* assume that we have ffmpeg */
+    defined(HAVE_FFMPEG)
 #  define BUILD_WITH_VIDEO_OUTPUT_SUPPORT 1
 #else
 #  define BUILD_WITH_VIDEO_OUTPUT_SUPPORT 0
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index f344f8b1aa..ba9cd6d7ba 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1931,7 +1931,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
 
 
 #ifdef HAVE_TEGRA_OPTIMIZATION
-    if (tegra::resize(src, dst, inv_scale_x, inv_scale_y, interpolation))
+    if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
         return;
 #endif
 
@@ -3858,7 +3858,7 @@ cv2DRotationMatrix( CvPoint2D32f center, double angle,
                     double scale, CvMat* matrix )
 {
     cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
-    CV_Assert( M.size() == M.size() );
+    CV_Assert( M.size() == M0.size() );
     M.convertTo(M0, M0.type());
     return matrix;
 }
@@ -3871,7 +3871,7 @@ cvGetPerspectiveTransform( const CvPoint2D32f* src,
 {
     cv::Mat M0 = cv::cvarrToMat(matrix),
         M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
-    CV_Assert( M.size() == M.size() );
+    CV_Assert( M.size() == M0.size() );
     M.convertTo(M0, M0.type());
     return matrix;
 }
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index 694e960d21..30aa9efe6a 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -283,7 +283,14 @@ if(BUILD_FAT_JAVA_LIB)
   if(__extradeps)
     list(REMOVE_ITEM __deps ${__extradeps})
   endif()
-  target_link_libraries(${the_module} -Wl,-whole-archive ${__deps} -Wl,-no-whole-archive ${__extradeps} ${OPENCV_LINKER_LIBS})
+  if(APPLE)
+    foreach(_dep ${__deps})
+      target_link_libraries(${the_module} -Wl,-force_load "${_dep}")
+    endforeach()
+  else()
+    target_link_libraries(${the_module} -Wl,-whole-archive ${__deps} -Wl,-no-whole-archive)
+  endif()
+  target_link_libraries(${the_module} ${__extradeps} ${OPENCV_LINKER_LIBS})
 else()
   target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
 endif()
diff --git a/modules/java/android_test/CMakeLists.txt b/modules/java/android_test/CMakeLists.txt
index 06ebf4aa10..41f69e6ca7 100644
--- a/modules/java/android_test/CMakeLists.txt
+++ b/modules/java/android_test/CMakeLists.txt
@@ -14,7 +14,7 @@ ocv_list_filterout(opencv_test_java_files ".svn")
 
 # copy sources out from the build tree
 set(opencv_test_java_file_deps "")
-foreach(f ${opencv_test_java_files} ${ANDROID_MANIFEST_FILE})
+foreach(f ${opencv_test_java_files} ${ANDROID_MANIFEST_FILE} ".classpath" ".project")
   add_custom_command(
       OUTPUT "${opencv_test_java_bin_dir}/${f}"
       COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${f}" "${opencv_test_java_bin_dir}/${f}"
diff --git a/modules/java/android_test/src/org/opencv/test/features2d/BruteForceHammingDescriptorMatcherTest.java b/modules/java/android_test/src/org/opencv/test/features2d/BruteForceHammingDescriptorMatcherTest.java
index 63cb71ad8c..5ed99df824 100644
--- a/modules/java/android_test/src/org/opencv/test/features2d/BruteForceHammingDescriptorMatcherTest.java
+++ b/modules/java/android_test/src/org/opencv/test/features2d/BruteForceHammingDescriptorMatcherTest.java
@@ -1,5 +1,6 @@
 package org.opencv.test.features2d;
 
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
@@ -204,7 +205,17 @@ public class BruteForceHammingDescriptorMatcherTest extends OpenCVTestCase {
     }
 
     public void testRadiusMatchMatListOfListOfDMatchFloat() {
-        fail("Not yet implemented");
+        Mat train = getTrainDescriptors();
+        Mat query = getQueryDescriptors();
+        ArrayList<MatOfDMatch> matches = new ArrayList<MatOfDMatch>();
+
+        matcher.radiusMatch(query, train, matches, 50.f);
+
+        assertEquals(matches.size(), 4);
+        assertTrue(matches.get(0).empty());
+        assertMatEqual(matches.get(1), new MatOfDMatch(truth[1]), EPS);
+        assertMatEqual(matches.get(2), new MatOfDMatch(truth[2]), EPS);
+        assertTrue(matches.get(3).empty());
     }
 
     public void testRadiusMatchMatListOfListOfDMatchFloatListOfMat() {
diff --git a/modules/java/generator/src/java/android+CameraBridgeViewBase.java b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
index e76ac26c56..36417c5829 100644
--- a/modules/java/generator/src/java/android+CameraBridgeViewBase.java
+++ b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
@@ -54,6 +54,9 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
     public CameraBridgeViewBase(Context context, int cameraId) {
         super(context);
         mCameraIndex = cameraId;
+        getHolder().addCallback(this);
+        mMaxWidth = MAX_UNSPECIFIED;
+        mMaxHeight = MAX_UNSPECIFIED;
     }
 
     public CameraBridgeViewBase(Context context, AttributeSet attrs) {
diff --git a/modules/java/generator/src/java/android+JavaCameraView.java b/modules/java/generator/src/java/android+JavaCameraView.java
index 34fe6091ae..f07b7d2ca8 100644
--- a/modules/java/generator/src/java/android+JavaCameraView.java
+++ b/modules/java/generator/src/java/android+JavaCameraView.java
@@ -60,7 +60,6 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
 
     public JavaCameraView(Context context, AttributeSet attrs) {
         super(context, attrs);
-        Log.d(TAG, "Java camera view ctor");
     }
 
     protected boolean initializeCamera(int width, int height) {
@@ -237,10 +236,8 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
     }
 
     public void onPreviewFrame(byte[] frame, Camera arg1) {
-        Log.i(TAG, "Preview Frame received. Need to create MAT and deliver it to clients");
-        Log.i(TAG, "Frame size  is " + frame.length);
-        synchronized (this)
-        {
+        Log.d(TAG, "Preview Frame received. Frame size: " + frame.length);
+        synchronized (this) {
             mFrameChain[1 - mChainIdx].put(0, 0, frame);
             this.notify();
         }
@@ -248,8 +245,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
             mCamera.addCallbackBuffer(mBuffer);
     }
 
-    private class JavaCameraFrame implements CvCameraViewFrame
-    {
+    private class JavaCameraFrame implements CvCameraViewFrame {
         public Mat gray() {
             return mYuvFrameData.submat(0, mHeight, 0, mWidth);
         }
diff --git a/modules/java/generator/src/java/android+OpenCVLoader.java b/modules/java/generator/src/java/android+OpenCVLoader.java
index 70e94944dd..fb05b826ca 100644
--- a/modules/java/generator/src/java/android+OpenCVLoader.java
+++ b/modules/java/generator/src/java/android+OpenCVLoader.java
@@ -22,6 +22,12 @@ public class OpenCVLoader
      */
     public static final String OPENCV_VERSION_2_4_4 = "2.4.4";
 
+    /**
+     * OpenCV Library version 2.4.5.
+     */
+    public static final String OPENCV_VERSION_2_4_5 = "2.4.5";
+
+
     /**
      * Loads and initializes OpenCV library from current application package. Roughly, it's an analog of system.loadLibrary("opencv_java").
      * @return Returns true is initialization of OpenCV was successful.
diff --git a/modules/java/generator/src/java/core+MatOfDMatch.java b/modules/java/generator/src/java/core+MatOfDMatch.java
index 2f90f32840..aec3b99e48 100644
--- a/modules/java/generator/src/java/core+MatOfDMatch.java
+++ b/modules/java/generator/src/java/core+MatOfDMatch.java
@@ -16,8 +16,8 @@ public class MatOfDMatch extends Mat {
 
     protected MatOfDMatch(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
-            throw new IllegalArgumentException("Incomatible Mat");
+        if( !empty() && checkVector(_channels, _depth) < 0 )
+            throw new IllegalArgumentException("Incomatible Mat: " + toString());
         //FIXME: do we need release() here?
     }
 
@@ -27,8 +27,8 @@ public class MatOfDMatch extends Mat {
 
     public MatOfDMatch(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
-            throw new IllegalArgumentException("Incomatible Mat");
+        if( !empty() && checkVector(_channels, _depth) < 0 )
+            throw new IllegalArgumentException("Incomatible Mat: " + toString());
         //FIXME: do we need release() here?
     }
 
diff --git a/modules/legacy/src/blobtrackingauto.cpp b/modules/legacy/src/blobtrackingauto.cpp
index fab0503bd0..59e0ee60fb 100644
--- a/modules/legacy/src/blobtrackingauto.cpp
+++ b/modules/legacy/src/blobtrackingauto.cpp
@@ -429,10 +429,11 @@ void CvBlobTrackerAuto1::Process(IplImage* pImg, IplImage* pMask)
             for(i=0; i<NewBlobList.GetBlobNum(); ++i)
             {
                 CvBlob* pBN = NewBlobList.GetBlob(i);
-                pBN->ID = m_NextBlobID;
 
                 if(pBN && pBN->w >= CV_BLOB_MINW && pBN->h >= CV_BLOB_MINH)
                 {
+                    pBN->ID = m_NextBlobID;
+
                     CvBlob* pB = m_pBT->AddBlob(pBN, pImg, pmask );
                     if(pB)
                     {
diff --git a/modules/legacy/src/calibfilter.cpp b/modules/legacy/src/calibfilter.cpp
index e532e2a37f..7db27a1628 100644
--- a/modules/legacy/src/calibfilter.cpp
+++ b/modules/legacy/src/calibfilter.cpp
@@ -235,7 +235,7 @@ void CvCalibFilter::SetCameraCount( int count )
             cvReleaseMat( &rectMap[i][1] );
         }
 
-        memset( latestCounts, 0, sizeof(latestPoints) );
+        memset( latestCounts, 0, sizeof(latestCounts) );
         maxPoints = 0;
         cameraCount = count;
     }
diff --git a/modules/legacy/src/epilines.cpp b/modules/legacy/src/epilines.cpp
index 19d929df98..e4f3eda4f1 100644
--- a/modules/legacy/src/epilines.cpp
+++ b/modules/legacy/src/epilines.cpp
@@ -2115,7 +2115,7 @@ CV_IMPL IplImage* icvCreateIsometricImage( IplImage* src, IplImage* dst,
     if( !dst || dst->depth != desired_depth ||
         dst->nChannels != desired_num_channels ||
         dst_size.width != src_size.width ||
-        dst_size.height != dst_size.height )
+        dst_size.height != src_size.height )
     {
         cvReleaseImage( &dst );
         dst = cvCreateImage( src_size, desired_depth, desired_num_channels );
diff --git a/modules/nonfree/test/test_gpu.cpp b/modules/nonfree/test/test_gpu.cpp
index ece82f316e..30aec352cd 100644
--- a/modules/nonfree/test/test_gpu.cpp
+++ b/modules/nonfree/test/test_gpu.cpp
@@ -58,9 +58,8 @@ namespace
     IMPLEMENT_PARAM_CLASS(SURF_Upright, bool)
 }
 
-PARAM_TEST_CASE(SURF, cv::gpu::DeviceInfo, SURF_HessianThreshold, SURF_Octaves, SURF_OctaveLayers, SURF_Extended, SURF_Upright)
+PARAM_TEST_CASE(SURF, SURF_HessianThreshold, SURF_Octaves, SURF_OctaveLayers, SURF_Extended, SURF_Upright)
 {
-    cv::gpu::DeviceInfo devInfo;
     double hessianThreshold;
     int nOctaves;
     int nOctaveLayers;
@@ -69,14 +68,11 @@ PARAM_TEST_CASE(SURF, cv::gpu::DeviceInfo, SURF_HessianThreshold, SURF_Octaves,
 
     virtual void SetUp()
     {
-        devInfo = GET_PARAM(0);
-        hessianThreshold = GET_PARAM(1);
-        nOctaves = GET_PARAM(2);
-        nOctaveLayers = GET_PARAM(3);
-        extended = GET_PARAM(4);
-        upright = GET_PARAM(5);
-
-        cv::gpu::setDevice(devInfo.deviceID());
+        hessianThreshold = GET_PARAM(0);
+        nOctaves = GET_PARAM(1);
+        nOctaveLayers = GET_PARAM(2);
+        extended = GET_PARAM(3);
+        upright = GET_PARAM(4);
     }
 };
 
@@ -93,39 +89,24 @@ GPU_TEST_P(SURF, Detector)
     surf.upright = upright;
     surf.keypointsRatio = 0.05f;
 
-    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
-    {
-        try
-        {
-            std::vector<cv::KeyPoint> keypoints;
-            surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(CV_StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        std::vector<cv::KeyPoint> keypoints;
-        surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
+    std::vector<cv::KeyPoint> keypoints;
+    surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
 
-        cv::SURF surf_gold;
-        surf_gold.hessianThreshold = hessianThreshold;
-        surf_gold.nOctaves = nOctaves;
-        surf_gold.nOctaveLayers = nOctaveLayers;
-        surf_gold.extended = extended;
-        surf_gold.upright = upright;
+    cv::SURF surf_gold;
+    surf_gold.hessianThreshold = hessianThreshold;
+    surf_gold.nOctaves = nOctaves;
+    surf_gold.nOctaveLayers = nOctaveLayers;
+    surf_gold.extended = extended;
+    surf_gold.upright = upright;
 
-        std::vector<cv::KeyPoint> keypoints_gold;
-        surf_gold(image, cv::noArray(), keypoints_gold);
+    std::vector<cv::KeyPoint> keypoints_gold;
+    surf_gold(image, cv::noArray(), keypoints_gold);
 
-        ASSERT_EQ(keypoints_gold.size(), keypoints.size());
-        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
-        double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
+    ASSERT_EQ(keypoints_gold.size(), keypoints.size());
+    int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
+    double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
 
-        EXPECT_GT(matchedRatio, 0.95);
-    }
+    EXPECT_GT(matchedRatio, 0.95);
 }
 
 GPU_TEST_P(SURF, Detector_Masked)
@@ -144,39 +125,24 @@ GPU_TEST_P(SURF, Detector_Masked)
     surf.upright = upright;
     surf.keypointsRatio = 0.05f;
 
-    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
-    {
-        try
-        {
-            std::vector<cv::KeyPoint> keypoints;
-            surf(loadMat(image), loadMat(mask), keypoints);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(CV_StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        std::vector<cv::KeyPoint> keypoints;
-        surf(loadMat(image), loadMat(mask), keypoints);
+    std::vector<cv::KeyPoint> keypoints;
+    surf(loadMat(image), loadMat(mask), keypoints);
 
-        cv::SURF surf_gold;
-        surf_gold.hessianThreshold = hessianThreshold;
-        surf_gold.nOctaves = nOctaves;
-        surf_gold.nOctaveLayers = nOctaveLayers;
-        surf_gold.extended = extended;
-        surf_gold.upright = upright;
+    cv::SURF surf_gold;
+    surf_gold.hessianThreshold = hessianThreshold;
+    surf_gold.nOctaves = nOctaves;
+    surf_gold.nOctaveLayers = nOctaveLayers;
+    surf_gold.extended = extended;
+    surf_gold.upright = upright;
 
-        std::vector<cv::KeyPoint> keypoints_gold;
-        surf_gold(image, mask, keypoints_gold);
+    std::vector<cv::KeyPoint> keypoints_gold;
+    surf_gold(image, mask, keypoints_gold);
 
-        ASSERT_EQ(keypoints_gold.size(), keypoints.size());
-        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
-        double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
+    ASSERT_EQ(keypoints_gold.size(), keypoints.size());
+    int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
+    double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
 
-        EXPECT_GT(matchedRatio, 0.95);
-    }
+    EXPECT_GT(matchedRatio, 0.95);
 }
 
 GPU_TEST_P(SURF, Descriptor)
@@ -199,43 +165,26 @@ GPU_TEST_P(SURF, Descriptor)
     surf_gold.extended = extended;
     surf_gold.upright = upright;
 
-    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
-    {
-        try
-        {
-            std::vector<cv::KeyPoint> keypoints;
-            cv::gpu::GpuMat descriptors;
-            surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(CV_StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        std::vector<cv::KeyPoint> keypoints;
-        surf_gold(image, cv::noArray(), keypoints);
+    std::vector<cv::KeyPoint> keypoints;
+    surf_gold(image, cv::noArray(), keypoints);
 
-        cv::gpu::GpuMat descriptors;
-        surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors, true);
+    cv::gpu::GpuMat descriptors;
+    surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors, true);
 
-        cv::Mat descriptors_gold;
-        surf_gold(image, cv::noArray(), keypoints, descriptors_gold, true);
+    cv::Mat descriptors_gold;
+    surf_gold(image, cv::noArray(), keypoints, descriptors_gold, true);
 
-        cv::BFMatcher matcher(cv::NORM_L2);
-        std::vector<cv::DMatch> matches;
-        matcher.match(descriptors_gold, cv::Mat(descriptors), matches);
+    cv::BFMatcher matcher(cv::NORM_L2);
+    std::vector<cv::DMatch> matches;
+    matcher.match(descriptors_gold, cv::Mat(descriptors), matches);
 
-        int matchedCount = getMatchedPointsCount(keypoints, keypoints, matches);
-        double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();
+    int matchedCount = getMatchedPointsCount(keypoints, keypoints, matches);
+    double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();
 
-        EXPECT_GT(matchedRatio, 0.6);
-    }
+    EXPECT_GT(matchedRatio, 0.6);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Features2D, SURF, testing::Combine(
-    ALL_DEVICES,
     testing::Values(SURF_HessianThreshold(100.0), SURF_HessianThreshold(500.0), SURF_HessianThreshold(1000.0)),
     testing::Values(SURF_Octaves(3), SURF_Octaves(4)),
     testing::Values(SURF_OctaveLayers(2), SURF_OctaveLayers(3)),
@@ -245,17 +194,15 @@ INSTANTIATE_TEST_CASE_P(GPU_Features2D, SURF, testing::Combine(
 //////////////////////////////////////////////////////
 // VIBE
 
-PARAM_TEST_CASE(VIBE, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
+PARAM_TEST_CASE(VIBE, cv::Size, MatType, UseRoi)
 {
 };
 
 GPU_TEST_P(VIBE, Accuracy)
 {
-    const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
-    const cv::Size size = GET_PARAM(1);
-    const int type = GET_PARAM(2);
-    const bool useRoi = GET_PARAM(3);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const bool useRoi = GET_PARAM(2);
 
     const cv::Mat fullfg(size, CV_8UC1, cv::Scalar::all(255));
 
@@ -278,7 +225,6 @@ GPU_TEST_P(VIBE, Accuracy)
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Video, VIBE, testing::Combine(
-    ALL_DEVICES,
     DIFFERENT_SIZES,
     testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4)),
     WHOLE_SUBMAT));
diff --git a/modules/nonfree/test/test_main.cpp b/modules/nonfree/test/test_main.cpp
index 757d0a095a..6b24993447 100644
--- a/modules/nonfree/test/test_main.cpp
+++ b/modules/nonfree/test/test_main.cpp
@@ -1,73 +1,3 @@
 #include "test_precomp.hpp"
 
-#if defined(HAVE_OPENCV_GPU) && defined(HAVE_CUDA)
-
-using namespace cv;
-using namespace cv::gpu;
-using namespace cvtest;
-using namespace testing;
-
-int main(int argc, char **argv)
-{
-    try
-    {
-         const char*  keys =
-                "{ h | help ?            | false | Print help}"
-                "{ i | info              | false | Print information about system and exit }"
-                "{ d | device            | -1   | Device on which tests will be executed (-1 means all devices) }"
-                ;
-
-        CommandLineParser cmd(argc, (const char**)argv, keys);
-
-        if (cmd.get<bool>("help"))
-        {
-            cmd.printMessage();
-            return 0;
-    }
-
-        printCudaInfo();
-
-        if (cmd.get<bool>("info"))
-    {
-            return 0;
-    }
-
-        int device = cmd.get<int>("device");
-        if (device < 0)
-    {
-            DeviceManager::instance().loadAll();
-
-            std::cout << "Run tests on all supported devices \n" << std::endl;
-    }
-        else
-    {
-            DeviceManager::instance().load(device);
-
-            DeviceInfo info(device);
-            std::cout << "Run tests on device " << device << " [" << info.name() << "] \n" << std::endl;
-}
-
-        TS::ptr()->init("cv");
-        InitGoogleTest(&argc, argv);
-
-    return RUN_ALL_TESTS();
-}
-    catch (const std::exception& e)
-    {
-        std::cerr << e.what() << std::endl;
-        return -1;
-    }
-    catch (...)
-{
-        std::cerr << "Unknown error" << std::endl;
-        return -1;
-    }
-
-    return 0;
-}
-
-#else // HAVE_CUDA
-
 CV_TEST_MAIN("cv")
-
-#endif // HAVE_CUDA
diff --git a/modules/nonfree/test/test_precomp.hpp b/modules/nonfree/test/test_precomp.hpp
index 3346fdc788..230ea9118a 100644
--- a/modules/nonfree/test/test_precomp.hpp
+++ b/modules/nonfree/test/test_precomp.hpp
@@ -15,14 +15,16 @@
 #include "opencv2/highgui.hpp"
 #include "opencv2/nonfree.hpp"
 
+#include "opencv2/ts/gpu_test.hpp"
+
 #include "opencv2/opencv_modules.hpp"
+
 #ifdef HAVE_OPENCV_OCL
 #  include "opencv2/nonfree/ocl.hpp"
 #endif
 
-#if defined(HAVE_OPENCV_GPU) && defined(HAVE_CUDA)
-    #include "opencv2/ts/gpu_test.hpp"
-    #include "opencv2/nonfree/gpu.hpp"
+#ifdef HAVE_OPENCV_GPU
+#  include "opencv2/nonfree/gpu.hpp"
 #endif
 
 #endif
diff --git a/modules/nonfree/test/test_surf.ocl.cpp b/modules/nonfree/test/test_surf.ocl.cpp
index 76ed37de45..d6a877bc80 100644
--- a/modules/nonfree/test/test_surf.ocl.cpp
+++ b/modules/nonfree/test/test_surf.ocl.cpp
@@ -109,17 +109,6 @@ static int getMatchedPointsCount(const std::vector<cv::KeyPoint>& keypoints1, co
     return validCount;
 }
 
-#define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-    namespace { class name { \
-    public: \
-        name ( type arg = type ()) : val_(arg) {} \
-        operator type () const {return val_;} \
-    private: \
-        type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) {*os << #name <<  "=" << testing::PrintToString(static_cast< type >(param));}}
-
 IMPLEMENT_PARAM_CLASS(HessianThreshold, double)
 IMPLEMENT_PARAM_CLASS(Octaves, int)
 IMPLEMENT_PARAM_CLASS(OctaveLayers, int)
diff --git a/modules/ocl/CMakeLists.txt b/modules/ocl/CMakeLists.txt
index a46aa5f967..a76b9e4bc2 100644
--- a/modules/ocl/CMakeLists.txt
+++ b/modules/ocl/CMakeLists.txt
@@ -3,5 +3,5 @@ if(NOT HAVE_OPENCL)
 endif()
 
 set(the_description "OpenCL-accelerated Computer Vision")
-ocv_define_module(ocl opencv_core opencv_imgproc opencv_objdetect opencv_video)
+ocv_define_module(ocl opencv_core opencv_imgproc opencv_objdetect opencv_video opencv_features2d)
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
diff --git a/modules/ocl/include/opencv2/ocl.hpp b/modules/ocl/include/opencv2/ocl.hpp
index 2d8f5dfec1..f79e6b8180 100644
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@ -50,7 +50,6 @@
 #include "opencv2/core.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/objdetect.hpp"
-//#include "opencv2/features2d.hpp"
 
 namespace cv
 {
@@ -125,6 +124,9 @@ namespace cv
 
         CV_EXPORTS void* getoclCommandQueue();
 
+        //explicit call clFinish. The global command queue will be used.
+        CV_EXPORTS void finish();
+
         //this function enable ocl module to use customized cl_context and cl_command_queue
         //getDevice also need to be called before this function
         CV_EXPORTS void setDeviceEx(Info &oclinfo, void *ctx, void *qu, int devnum = 0);
@@ -1714,6 +1716,36 @@ namespace cv
         private:
             oclMat minSSD, leBuf, riBuf;
         };
+        class CV_EXPORTS StereoBeliefPropagation
+        {
+        public:
+            enum { DEFAULT_NDISP  = 64 };
+            enum { DEFAULT_ITERS  = 5  };
+            enum { DEFAULT_LEVELS = 5  };
+            static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels);
+            explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
+                                             int iters  = DEFAULT_ITERS,
+                                             int levels = DEFAULT_LEVELS,
+                                             int msg_type = CV_16S);
+            StereoBeliefPropagation(int ndisp, int iters, int levels,
+                                    float max_data_term, float data_weight,
+                                    float max_disc_term, float disc_single_jump,
+                                    int msg_type = CV_32F);
+            void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
+            void operator()(const oclMat &data, oclMat &disparity);
+            int ndisp;
+            int iters;
+            int levels;
+            float max_data_term;
+            float data_weight;
+            float max_disc_term;
+            float disc_single_jump;
+            int msg_type;
+        private:
+            oclMat u, d, l, r, u2, d2, l2, r2;
+            std::vector<oclMat> datas;
+            oclMat out;
+        };
     }
 }
 #if defined _MSC_VER && _MSC_VER >= 1200
diff --git a/modules/ocl/perf/interpolation.hpp b/modules/ocl/perf/interpolation.hpp
deleted file mode 100644
index fb89e701d7..0000000000
--- a/modules/ocl/perf/interpolation.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_TEST_INTERPOLATION_HPP__
-#define __OPENCV_TEST_INTERPOLATION_HPP__
-
-template <typename T> T readVal(const cv::Mat &src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-{
-    if (border_type == cv::BORDER_CONSTANT)
-        return (y >= 0 && y < src.rows && x >= 0 && x < src.cols) ? src.at<T>(y, x * src.channels() + c) : cv::saturate_cast<T>(borderVal.val[c]);
-
-    return src.at<T>(cv::borderInterpolate(y, src.rows, border_type), cv::borderInterpolate(x, src.cols, border_type) * src.channels() + c);
-}
-
-template <typename T> struct NearestInterpolator
-{
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        return readVal<T>(src, cvFloor(y), cvFloor(x), c, border_type, borderVal);
-    }
-};
-
-template <typename T> struct LinearInterpolator
-{
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        x -= 0.5f;
-        y -= 0.5f;
-
-        int x1 = cvFloor(x);
-        int y1 = cvFloor(y);
-        int x2 = x1 + 1;
-        int y2 = y1 + 1;
-
-        float res = 0;
-
-        res += readVal<T>(src, y1, x1, c, border_type, borderVal) * ((x2 - x) * (y2 - y));
-        res += readVal<T>(src, y1, x2, c, border_type, borderVal) * ((x - x1) * (y2 - y));
-        res += readVal<T>(src, y2, x1, c, border_type, borderVal) * ((x2 - x) * (y - y1));
-        res += readVal<T>(src, y2, x2, c, border_type, borderVal) * ((x - x1) * (y - y1));
-
-        return cv::saturate_cast<T>(res);
-    }
-};
-
-template <typename T> struct CubicInterpolator
-{
-    static float getValue(float p[4], float x)
-    {
-        return p[1] + 0.5 * x * (p[2] - p[0] + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
-    }
-
-    static float getValue(float p[4][4], float x, float y)
-    {
-        float arr[4];
-
-        arr[0] = getValue(p[0], x);
-        arr[1] = getValue(p[1], x);
-        arr[2] = getValue(p[2], x);
-        arr[3] = getValue(p[3], x);
-
-        return getValue(arr, y);
-    }
-
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        int ix = cvRound(x);
-        int iy = cvRound(y);
-
-        float vals[4][4] =
-        {
-            {readVal<T>(src, iy - 2, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 2, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 2, ix, c, border_type, borderVal), readVal<T>(src, iy - 2, ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy - 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 1, ix, c, border_type, borderVal), readVal<T>(src, iy - 1, ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy    , ix - 2, c, border_type, borderVal), readVal<T>(src, iy    , ix - 1, c, border_type, borderVal), readVal<T>(src, iy    , ix, c, border_type, borderVal), readVal<T>(src, iy    , ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy + 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy + 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy + 1, ix, c, border_type, borderVal), readVal<T>(src, iy + 1, ix + 1, c, border_type, borderVal)},
-        };
-
-        return cv::saturate_cast<T>(getValue(vals, (x - ix + 2.0) / 4.0, (y - iy + 2.0) / 4.0));
-    }
-};
-
-#endif // __OPENCV_TEST_INTERPOLATION_HPP__
diff --git a/modules/ocl/perf/main.cpp b/modules/ocl/perf/main.cpp
index f1b6884279..4cc70aa7b7 100644
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@@ -7,12 +7,13 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +22,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other oclMaterials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -41,99 +42,118 @@
 
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
+int main(int argc, const char *argv[])
+{
+    vector<ocl::Info> oclinfo;
+    int num_devices = getDevice(oclinfo);
 
-using namespace std;
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
+    if (num_devices < 1)
+    {
+        cerr << "no device found\n";
+        return -1;
+    }
+
+    int devidx = 0;
+
+    for (size_t i = 0; i < oclinfo.size(); i++)
+    {
+        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
+        {
+            printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str());
+        }
+    }
+
+    redirectError(cvErrorCallback);
 
-void print_info()
-{
-    printf("\n");
-#if defined _WIN32
-#   if defined _WIN64
-    puts("OS: Windows 64");
-#   else
-    puts("OS: Windows 32");
-#   endif
-#elif defined linux
-#   if defined _LP64
-    puts("OS: Linux 64");
-#   else
-    puts("OS: Linux 32");
-#   endif
-#elif defined __APPLE__
-#   if defined _LP64
-    puts("OS: Apple 64");
-#   else
-    puts("OS: Apple 32");
-#   endif
-#endif
-
-}
-std::string workdir;
-int main(int argc, char **argv)
-{
-    TS::ptr()->init("ocl");
-    InitGoogleTest(&argc, argv);
     const char *keys =
-        "{ h | false              | print help message }"
-		"{ w | ../../../samples/c/| set working directory i.e. -w=C:\\}"
-        "{ t | gpu                | set device type:i.e. -t=cpu or gpu}"
-        "{ p | 0                  | set platform id i.e. -p=0}"
-        "{ d | 0                  | set device id i.e. -d=0}";
+        "{ h help    | false | print help message }"
+        "{ f filter  |       | filter for test }"
+        "{ w workdir |       | set working directory }"
+        "{ l list    | false | show all tests }"
+        "{ d device  | 0     | device id }"
+        "{ i iters   | 10    | iteration count }"
+        "{ m warmup  | 1     | gpu warm up iteration count}"
+        "{ t xtop    | 1.1	  | xfactor top boundary}"
+        "{ b xbottom | 0.9	  | xfactor bottom boundary}"
+        "{ v verify  | false | only run gpu once to verify if problems occur}";
 
     CommandLineParser cmd(argc, argv, keys);
-    if (cmd.get<string>("h")=="true")
+
+    if (cmd.has("help"))
     {
-        cout << "Avaible options besides goole test option:" << endl;
+        cout << "Avaible options:" << endl;
         cmd.printMessage();
         return 0;
     }
-    workdir = cmd.get<string>("w");
-    string type = cmd.get<string>("t");
-    unsigned int pid = cmd.get<unsigned int>("p");
-    int device = cmd.get<int>("d");
-    print_info();
-    // int flag = CVCL_DEVICE_TYPE_GPU;
-
-    // if(type == "cpu")
-    // {
-    //     flag = CVCL_DEVICE_TYPE_CPU;
-    // }
-    std::vector<cv::ocl::Info> oclinfo;
-    int devnums = getDevice(oclinfo);
-    if(devnums <= device || device < 0)
+
+    int device = cmd.get<int>("device");
+
+    if (device < 0 || device >= num_devices)
     {
-        std::cout << "device invalid\n";
+        cerr << "Invalid device ID" << endl;
         return -1;
     }
 
-    if(pid >= oclinfo.size())
+    if (cmd.get<bool>("verify"))
     {
-        std::cout << "platform invalid\n";
-        return -1;
+        TestSystem::instance().setNumIters(1);
+        TestSystem::instance().setGPUWarmupIters(0);
+        TestSystem::instance().setCPUIters(0);
     }
 
-    if(pid != 0 || device != 0)
+    devidx = 0;
+
+    for (size_t i = 0; i < oclinfo.size(); i++)
     {
-        setDevice(oclinfo[pid], device);
+        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
+        {
+            if (device == devidx)
+            {
+                ocl::setDevice(oclinfo[i], (int)j);
+                TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
+                printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
+                goto END_DEV;
+            }
+        }
     }
 
-    cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
-    setBinpath(CLBINPATH);
-    return RUN_ALL_TESTS();
-}
+END_DEV:
 
-#else // DON'T HAVE_OPENCL
+    string filter = cmd.get<string>("filter");
+    string workdir = cmd.get<string>("workdir");
+    bool list = cmd.has("list");
+    int iters = cmd.get<int>("iters");
+    int wu_iters = cmd.get<int>("warmup");
+    double x_top = cmd.get<double>("xtop");
+    double x_bottom = cmd.get<double>("xbottom");
 
-int main()
-{
-    printf("OpenCV was built without OpenCL support\n");
-    return 0;
-}
+    TestSystem::instance().setTopThreshold(x_top);
+    TestSystem::instance().setBottomThreshold(x_bottom);
+
+    if (!filter.empty())
+    {
+        TestSystem::instance().setTestFilter(filter);
+    }
+
+    if (!workdir.empty())
+    {
+        if (workdir[workdir.size() - 1] != '/' && workdir[workdir.size() - 1] != '\\')
+        {
+            workdir += '/';
+        }
+
+        TestSystem::instance().setWorkingDir(workdir);
+    }
+
+    if (list)
+    {
+        TestSystem::instance().setListMode(true);
+    }
 
+    TestSystem::instance().setNumIters(iters);
+    TestSystem::instance().setGPUWarmupIters(wu_iters);
 
-#endif // HAVE_OPENCL
+    TestSystem::instance().run();
+
+    return 0;
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_arithm.cpp b/modules/ocl/perf/perf_arithm.cpp
index b7f82b685d..e6e957641b 100644
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
@@ -1,4 +1,4 @@
-///////////////////////////////////////////////////////////////////////////////////////
+/*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@@ -10,17 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Shengen Yan, yanshengen@gmail.com
-//    Jiang Liyuan,jlyuan001.good@163.com
-//    Rock Li, Rock.Li@amd.com
-//    Zailong Wu, bullet@yeah.net
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -35,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -48,4371 +43,1165 @@
 //
 //M*/
 
-
 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-PARAM_TEST_CASE(ArithmTestBase, MatType, bool)
+///////////// Lut ////////////////////////
+TEST(lut)
 {
-    int type;
-    cv::Scalar val;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mask;
-    cv::Mat dst;
-    cv::Mat dst1; //bak, for two outputs
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    cv::Mat dst1_roi; //bak
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-    cv::ocl::oclMat gdst1_whole; //bak
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gdst1;   //bak
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-
-        cv::Size size(MWIDTH, MHEIGHT);
+    Mat src, lut, dst;
+    ocl::oclMat d_src, d_lut, d_dst;
 
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        //mat2 = randomMat(rng, cv::Size(512,3), type, 5, 16, false);
-        mat2 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        dst1  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+    int all_type[] = {CV_8UC1, CV_8UC3};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC3"};
 
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums>0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src2x   = 1;
-            src1y   = 1;
-            src2y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-            maskx	 = 1;
-            masky	= 1;
-        }
-        else
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src2x = 0;
-            src1y = 0;
-            src2y = 0;
-            dstx = 0;
-            dsty = 0;
-            maskx	 = 0;
-            masky	= 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        //mat2_roi = mat2(Rect(src2x,src2y,256,1));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
-
-        //gdst_whole = dst;
-        //gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gdst1_whole = dst1;
-        //gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gmat1 = mat1_roi;
-        //gmat2 = mat2_roi;
-        //gmask = mask_roi;
-    }
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-};
-////////////////////////////////lut/////////////////////////////////////////////////
+            gen(src, size, size, all_type[j], 0, 256);
+            gen(lut, 1, 256, CV_8UC1, 0, 1);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-struct Lut : ArithmTestBase {};
+            LUT(src, lut, dst);
 
-TEST_P(Lut, Mat)
-{
+            CPU_ON;
+            LUT(src, lut, dst);
+            CPU_OFF;
 
-    cv::Mat mat2(3, 512, CV_8UC1);
-    cv::RNG &rng = TS::ptr()->get_rng();
-    rng.fill(mat2, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(256));
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            mat2 = randomMat(rng, cv::Size(512, 3), type, 5, 16, false);
-            mat2_roi = mat2(Rect(src2x, src2y, 256, 1));
-
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::LUT(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::LUT(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            d_src.upload(src);
+            d_lut.upload(lut);
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        // s=GetParam();
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        //  src2x = rng.uniform( 0,mat2.cols - 256);
-        // src2y = rng.uniform (0,mat2.rows - 1);
-
-        // cv::Mat mat2_roi = mat2(Rect(src2x,src2y,256,1));
-        mat2 = randomMat(rng, cv::Size(512, 3), type, 5, 16, false);
-        mat2_roi = mat2(Rect(src2x, src2y, 256, 1));
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        //   gdst1_whole = dst1;
-        //     gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        //     gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::LUT(gmat1, gmat2, gdst);
-    };
-#endif
-
-}
+            WARMUP_ON;
+            ocl::LUT(d_src, d_lut, d_dst);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::LUT(d_src, d_lut, d_dst);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_lut.upload(lut);
+            ocl::LUT(d_src, d_lut, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
 
-////////////////////////////////exp/////////////////////////////////////////////////
-
-struct Exp : ArithmTestBase {};
-
-TEST_P(Exp, Mat)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::exp(mat1_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
-
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::exp(gmat1, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download(cpu_dst);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-            //EXPECT_MAT_NEAR(dst, cpu_dst, 0,"");
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
 
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::exp(gmat1, gdst);
-    };
-#endif
-
 }
 
-
-////////////////////////////////log/////////////////////////////////////////////////
-
-struct Log : ArithmTestBase {};
-
-TEST_P(Log, Mat)
+///////////// Exp ////////////////////////
+TEST(Exp)
 {
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::log(mat1_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::log(gmat1, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::log(gmat1, gdst);
-    };
-#endif
-
-}
-
-
-
-
-////////////////////////////////add/////////////////////////////////////////////////
-
-struct Add : ArithmTestBase {};
-
-TEST_P(Add, Mat)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::add(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::add(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::add(gmat1, gmat2, gdst);
-    };
-#endif
-}
-
-TEST_P(Add, Mat_Mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::add(mat1_roi, mat2_roi, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::add(gmat1, gmat2, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::add(gmat1, gmat2, gdst, gmask);
-    };
-#endif
-}
-TEST_P(Add, Scalar)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::add(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::add(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::add(gmat1, val, gdst);
-    };
-#endif
-}
-
-TEST_P(Add, Scalar_Mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::add(mat1_roi, val, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::add(gmat1, val, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::add(gmat1, val, gdst, gmask);
-    };
-#endif
-}
-
+        SUBTEST << size << 'x' << size << "; CV_32FC1";
 
-////////////////////////////////sub/////////////////////////////////////////////////
-struct Sub : ArithmTestBase {};
+        gen(src, size, size, CV_32FC1, 0, 256);
+        gen(dst, size, size, CV_32FC1, 0, 256);
 
-TEST_P(Sub, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::subtract(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::subtract(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+        exp(src, dst);
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::subtract(gmat1, gmat2, gdst);
-    };
-#endif
-}
+        CPU_ON;
+        exp(src, dst);
+        CPU_OFF;
+        d_src.upload(src);
 
-TEST_P(Sub, Mat_Mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::subtract(mat1_roi, mat2_roi, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::subtract(gmat1, gmat2, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+        WARMUP_ON;
+        ocl::exp(d_src, d_dst);
+        WARMUP_OFF;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::subtract(gmat1, gmat2, gdst, gmask);
-    };
-#endif
-}
-TEST_P(Sub, Scalar)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::subtract(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::subtract(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+        GPU_ON;
+        ocl::exp(d_src, d_dst);
+         ;
+        GPU_OFF;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::exp(d_src, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::subtract(gmat1, val, gdst);
-    };
-#endif
 }
 
-TEST_P(Sub, Scalar_Mask)
+///////////// LOG ////////////////////////
+TEST(Log)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::subtract(mat1_roi, val, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::subtract(gmat1, val, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::subtract(gmat1, val, gdst, gmask);
-    };
-#endif
-}
+        SUBTEST << size << 'x' << size << "; 32F";
 
+        gen(src, size, size, CV_32F, 1, 10);
 
-////////////////////////////////Mul/////////////////////////////////////////////////
-struct Mul : ArithmTestBase {};
+        log(src, dst);
 
-TEST_P(Mul, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::multiply(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::multiply(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+        CPU_ON;
+        log(src, dst);
+        CPU_OFF;
+        d_src.upload(src);
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::multiply(gmat1, gmat2, gdst);
-    };
-#endif
-}
+        WARMUP_ON;
+        ocl::log(d_src, d_dst);
+        WARMUP_OFF;
 
-TEST_P(Mul, Mat_Scalar)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            cv::RNG &rng = TS::ptr()->get_rng();
-            double s = rng.uniform(-10.0, 10.0);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::multiply(mat1_roi, mat2_roi, dst_roi, s);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::multiply(gmat1, gmat2, gdst, s);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+        GPU_ON;
+        ocl::log(d_src, d_dst);
+         ;
+        GPU_OFF;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::log(d_src, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        double s = rng.uniform(-10.0, 10.0);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::multiply(gmat1, gmat2, gdst, s);
-    };
-#endif
 }
 
-
-struct Div : ArithmTestBase {};
-
-TEST_P(Div, Mat)
+///////////// Add ////////////////////////
+TEST(Add)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::divide(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::divide(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::divide(gmat1, gmat2, gdst);
-    };
-#endif
-}
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
 
-TEST_P(Div, Mat_Scalar)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-            cv::RNG &rng = TS::ptr()->get_rng();
-            double s = rng.uniform(-10.0, 10.0);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::divide(mat1_roi, mat2_roi, dst_roi, s);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::divide(gmat1, gmat2, gdst, s);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        double s = rng.uniform(-10.0, 10.0);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::divide(gmat1, gmat2, gdst, s);
-    };
-#endif
-}
+            gen(src1, size, size, all_type[j], 0, 1);
+            gen(src2, size, size, all_type[j], 0, 1);
 
+            add(src1, src2, dst);
 
-struct Absdiff : ArithmTestBase {};
+            CPU_ON;
+            add(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
 
-TEST_P(Absdiff, Mat)
-{
+            WARMUP_ON;
+            ocl::add(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::absdiff(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::absdiff(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::add(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::add(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::absdiff(gmat1, gmat2, gdst);
-    };
-#endif
-}
-
-TEST_P(Absdiff, Mat_Scalar)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::absdiff(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::absdiff(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::absdiff(gmat1, val, gdst);
-    };
-#endif
 }
 
-
-
-struct CartToPolar : ArithmTestBase {};
-
-TEST_P(CartToPolar, angleInDegree)
+///////////// Mul ////////////////////////
+TEST(Mul)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gdst1_whole = dst1;
-            gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            cv::Mat cpu_dst1;
-            gdst1_whole.download(cpu_dst1);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 1);
-    };
-#endif
-}
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-TEST_P(CartToPolar, angleInRadians)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gdst1_whole = dst1;
-            gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 0);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            cv::Mat cpu_dst1;
-            gdst1_whole.download(cpu_dst1);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 0);
-    };
-#endif
-}
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
 
-struct PolarToCart : ArithmTestBase {};
+            multiply(src1, src2, dst);
 
-TEST_P(PolarToCart, angleInDegree)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gdst1_whole = dst1;
-            gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            cv::Mat cpu_dst1;
-            gdst1_whole.download(cpu_dst1);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
+            multiply(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 1);
-    };
-#endif
-}
+            WARMUP_ON;
+            ocl::multiply(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
 
-TEST_P(PolarToCart, angleInRadians)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gdst1_whole = dst1;
-            gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 0);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            cv::Mat cpu_dst1;
-            gdst1_whole.download(cpu_dst1);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::multiply(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::multiply(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 0);
-    };
-#endif
-}
-
-
-
-struct Magnitude : ArithmTestBase {};
 
-TEST_P(Magnitude, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::magnitude(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::magnitude(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::magnitude(gmat1, gmat2, gdst);
-    };
-#endif
 }
 
-struct Transpose : ArithmTestBase {};
-
-TEST_P(Transpose, Mat)
+///////////// Div ////////////////////////
+TEST(Div)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::transpose(mat1_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::transpose(gmat1, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::transpose(gmat1, gdst);
-    };
-#endif
-}
-
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-struct Flip : ArithmTestBase {};
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-TEST_P(Flip, X)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::flip(mat1_roi, dst_roi, 0);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::flip(gmat1, gdst, 0);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::flip(gmat1, gdst, 0);
-    };
-#endif
-}
+            divide(src1, src2, dst);
 
-TEST_P(Flip, Y)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::flip(mat1_roi, dst_roi, 1);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::flip(gmat1, gdst, 1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
+            divide(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::flip(gmat1, gdst, 1);
-    };
-#endif
-}
+            WARMUP_ON;
+            ocl::divide(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
 
-TEST_P(Flip, BOTH)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::flip(mat1_roi, dst_roi, -1);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::flip(gmat1, gdst, -1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::divide(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::divide(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::flip(gmat1, gdst, -1);
-    };
-#endif
-}
-
-
-
-struct MinMax : ArithmTestBase {};
-
-TEST_P(MinMax, MAT)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            double minVal, maxVal;
-            cv::Point minLoc, maxLoc;
-            t0 = (double)cvGetTickCount();//cpu start
-            if (mat1.depth() != CV_8S)
-            {
-                cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc);
-            }
-            else
-            {
-                minVal = std::numeric_limits<double>::max();
-                maxVal = -std::numeric_limits<double>::max();
-                for (int i = 0; i < mat1_roi.rows; ++i)
-                    for (int j = 0; j < mat1_roi.cols; ++j)
-                    {
-                        signed char val = mat1_roi.at<signed char>(i, j);
-                        if (val < minVal) minVal = val;
-                        if (val > maxVal) maxVal = val;
-                    }
-            }
-
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            double minVal_, maxVal_;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::minMax(gmat1, &minVal_, &maxVal_);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        double minVal_, maxVal_;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::minMax(gmat1, &minVal_, &maxVal_);
-    };
-#endif
 }
 
-TEST_P(MinMax, MASK)
+///////////// Absdiff ////////////////////////
+TEST(Absdiff)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            double minVal, maxVal;
-            cv::Point minLoc, maxLoc;
-            t0 = (double)cvGetTickCount();//cpu start
-            if (mat1.depth() != CV_8S)
-            {
-                cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi);
-            }
-            else
-            {
-                minVal = std::numeric_limits<double>::max();
-                maxVal = -std::numeric_limits<double>::max();
-                for (int i = 0; i < mat1_roi.rows; ++i)
-                    for (int j = 0; j < mat1_roi.cols; ++j)
-                    {
-                        signed char val = mat1_roi.at<signed char>(i, j);
-                        unsigned char m = mask_roi.at<unsigned char>(i, j);
-                        if (val < minVal && m) minVal = val;
-                        if (val > maxVal && m) maxVal = val;
-                    }
-            }
-
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            double minVal_, maxVal_;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::minMax(gmat1, &minVal_, &maxVal_, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
-        double minVal_, maxVal_;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::minMax(gmat1, &minVal_, &maxVal_, gmask);
-    };
-#endif
-}
-
-
-struct MinMaxLoc : ArithmTestBase {};
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-TEST_P(MinMaxLoc, MAT)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-            double minVal, maxVal;
-            cv::Point minLoc, maxLoc;
-            int depth = mat1.depth();
-            t0 = (double)cvGetTickCount();//cpu start
-            if (depth != CV_8S)
-            {
-                cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc);
-            }
-            else
-            {
-                minVal = std::numeric_limits<double>::max();
-                maxVal = -std::numeric_limits<double>::max();
-                for (int i = 0; i < mat1_roi.rows; ++i)
-                    for (int j = 0; j < mat1_roi.cols; ++j)
-                    {
-                        signed char val = mat1_roi.at<signed char>(i, j);
-                        if (val < minVal)
-                        {
-                            minVal = val;
-                            minLoc.x = j;
-                            minLoc.y = i;
-                        }
-                        if (val > maxVal)
-                        {
-                            maxVal = val;
-                            maxLoc.x = j;
-                            maxLoc.y = i;
-                        }
-                    }
-            }
-
-
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            double minVal_, maxVal_;
-            cv::Point minLoc_, maxLoc_;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat());
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        double minVal_, maxVal_;
-        cv::Point minLoc_, maxLoc_;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat());
-    };
-#endif
-
-}
-
-
-TEST_P(MinMaxLoc, MASK)
-{
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            double minVal, maxVal;
-            cv::Point minLoc, maxLoc;
-            int depth = mat1.depth();
-            t0 = (double)cvGetTickCount();//cpu start
-            if (depth != CV_8S)
-            {
-                cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi);
-            }
-            else
-            {
-                minVal = std::numeric_limits<double>::max();
-                maxVal = -std::numeric_limits<double>::max();
-                for (int i = 0; i < mat1_roi.rows; ++i)
-                    for (int j = 0; j < mat1_roi.cols; ++j)
-                    {
-                        signed char val = mat1_roi.at<signed char>(i, j);
-                        unsigned char m = mask_roi.at<unsigned char>(i , j);
-                        if (val < minVal && m)
-                        {
-                            minVal = val;
-                            minLoc.x = j;
-                            minLoc.y = i;
-                        }
-                        if (val > maxVal && m)
-                        {
-                            maxVal = val;
-                            maxLoc.x = j;
-                            maxLoc.y = i;
-                        }
-                    }
-            }
-
-
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            double minVal_, maxVal_;
-            cv::Point minLoc_, maxLoc_;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
-        double minVal_, maxVal_;
-        cv::Point minLoc_, maxLoc_;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask);
-    };
-#endif
-}
 
+            absdiff(src1, src2, dst);
 
-struct Sum : ArithmTestBase {};
+            CPU_ON;
+            absdiff(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
 
-TEST_P(Sum, MAT)
-{
+            WARMUP_ON;
+            ocl::absdiff(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::sum(mat1_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::sum(gmat1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::absdiff(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::absdiff(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        Scalar gpures = cv::ocl::sum(gmat1);
-    };
-#endif
-}
-
-//TEST_P(Sum, MASK)
-//{
-//    for(int j=0; j<LOOP_TIMES; j++)
-//    {
-//
-//    }
-//}
-
-struct CountNonZero : ArithmTestBase {};
 
-TEST_P(CountNonZero, MAT)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::countNonZero(mat1_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::countNonZero(gmat1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::countNonZero(gmat1);
-    };
-#endif
-
 }
 
-
-
-////////////////////////////////phase/////////////////////////////////////////////////
-struct Phase : ArithmTestBase {};
-
-TEST_P(Phase, Mat)
+///////////// CartToPolar ////////////////////////
+TEST(CartToPolar)
 {
-    if(mat1.depth() != CV_32F && mat1.depth() != CV_64F)
-    {
-        cout << "\tUnsupported type\t\n";
-    }
+    Mat src1, src2, dst, dst1;
+    ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::phase(mat1_roi, mat2_roi, dst_roi, 0);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::phase(gmat1, gmat2, gdst, 0);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+    int all_type[] = {CV_32FC1};
+    std::string type_name[] = {"CV_32FC1"};
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::phase(gmat1, gmat2, gdst, 0);
-    };
-#endif
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-}
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
+            gen(dst1, size, size, all_type[j], 0, 256);
 
 
-////////////////////////////////bitwise_and/////////////////////////////////////////////////
-struct Bitwise_and : ArithmTestBase {};
+            cartToPolar(src1, src2, dst, dst1, 1);
 
-TEST_P(Bitwise_and, Mat)
-{
+            CPU_ON;
+            cartToPolar(src1, src2, dst, dst1, 1);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_and(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_and(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
+            WARMUP_OFF;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
+            GPU_ON;
+            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
+             ;
+            GPU_OFF;
 
-        if(j == 0)
-        {
-            cout << "no roi:";
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
+            d_dst.download(dst);
+            d_dst1.download(dst1);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_and(gmat1, gmat2, gdst);
-    };
-#endif
 
+    }
 }
 
-TEST_P(Bitwise_and, Mat_Mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_and(mat1_roi, mat2_roi, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_and(gmat1, gmat2, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_and(gmat1, gmat2, gdst, gmask);
-    };
-#endif
-}
-
-TEST_P(Bitwise_and, Scalar)
+///////////// PolarToCart ////////////////////////
+TEST(PolarToCart)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst, dst1;
+    ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
+
+    int all_type[] = {CV_32FC1};
+    std::string type_name[] = {"CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_and(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_and(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
+            gen(dst1, size, size, all_type[j], 0, 256);
 
-        if(j == 0)
-        {
-            cout << "no roi:";
+
+            polarToCart(src1, src2, dst, dst1, 1);
+
+            CPU_ON;
+            polarToCart(src1, src2, dst, dst1, 1);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
+            d_dst.download(dst);
+            d_dst1.download(dst1);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_and(gmat1, val, gdst);
-    };
-#endif
+
+    }
 }
 
-TEST_P(Bitwise_and, Scalar_Mask)
+///////////// Magnitude ////////////////////////
+TEST(magnitude)
 {
+    Mat x, y, mag;
+    ocl::oclMat d_x, d_y, d_mag;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    int all_type[] = {CV_32FC1};
+    std::string type_name[] = {"CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_and(mat1_roi, val, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_and(gmat1, val, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
+            gen(x, size, size, all_type[j], 0, 1);
+            gen(y, size, size, all_type[j], 0, 1);
 
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_and(gmat1, val, gdst, gmask);
-    };
-#endif
-}
+            magnitude(x, y, mag);
+
+            CPU_ON;
+            magnitude(x, y, mag);
+            CPU_OFF;
+            d_x.upload(x);
+            d_y.upload(y);
 
+            WARMUP_ON;
+            ocl::magnitude(d_x, d_y, d_mag);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::magnitude(d_x, d_y, d_mag);
+             ;
+            GPU_OFF;
 
-////////////////////////////////bitwise_or/////////////////////////////////////////////////
+            GPU_FULL_ON;
+            d_x.upload(x);
+            d_y.upload(y);
+            ocl::magnitude(d_x, d_y, d_mag);
+            d_mag.download(mag);
+            GPU_FULL_OFF;
+        }
 
-struct Bitwise_or : ArithmTestBase {};
+    }
+}
 
-TEST_P(Bitwise_or, Mat)
+///////////// Transpose ////////////////////////
+TEST(Transpose)
 {
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_or(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_or(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
+            gen(src, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-        if(j == 0)
-        {
-            cout << "no roi:";
+            transpose(src, dst);
+
+            CPU_ON;
+            transpose(src, dst);
+            CPU_OFF;
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::transpose(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::transpose(d_src, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::transpose(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_or(gmat1, gmat2, gdst);
-    };
-#endif
+
+    }
 }
 
-TEST_P(Bitwise_or, Mat_Mask)
+///////////// Flip ////////////////////////
+TEST(Flip)
 {
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_or(mat1_roi, mat2_roi, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_or(gmat1, gmat2, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; FLIP_BOTH";
 
+            gen(src, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
+
+            flip(src, dst, 0);
+
+            CPU_ON;
+            flip(src, dst, 0);
+            CPU_OFF;
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::flip(d_src, d_dst, 0);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::flip(d_src, d_dst, 0);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::flip(d_src, d_dst, 0);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_or(gmat1, gmat2, gdst, gmask);
-    };
-#endif
 }
-TEST_P(Bitwise_or, Scalar)
+
+///////////// minMax ////////////////////////
+TEST(minMax)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src;
+    ocl::oclMat d_src;
+
+    double min_val, max_val;
+    Point min_loc, max_loc;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_or(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_or(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
+            gen(src, size, size, all_type[j], 0, 256);
+
+            CPU_ON;
+            minMaxLoc(src, &min_val, &max_val, &min_loc, &max_loc);
+            CPU_OFF;
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::minMax(d_src, &min_val, &max_val);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::minMax(d_src, &min_val, &max_val);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::minMax(d_src, &min_val, &max_val);
+            GPU_FULL_OFF;
 
-        if(j == 0)
-        {
-            cout << "no roi:";
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_or(gmat1, val, gdst);
-    };
-#endif
+
+    }
 }
 
-TEST_P(Bitwise_or, Scalar_Mask)
+///////////// minMaxLoc ////////////////////////
+TEST(minMaxLoc)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_or(mat1_roi, val, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_or(gmat1, val, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+    Mat src;
+    ocl::oclMat d_src;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
+    double min_val, max_val;
+    Point min_loc, max_loc;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
 
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_or(gmat1, val, gdst, gmask);
-    };
-#endif
-}
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
+            gen(src, size, size, all_type[j], 0, 1);
 
-////////////////////////////////bitwise_xor/////////////////////////////////////////////////
+            CPU_ON;
+            minMaxLoc(src, &min_val, &max_val, &min_loc, &max_loc);
+            CPU_OFF;
+            d_src.upload(src);
 
-struct Bitwise_xor : ArithmTestBase {};
+            WARMUP_ON;
+            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
+            WARMUP_OFF;
 
-TEST_P(Bitwise_xor, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_xor(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
 
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_xor(gmat1, gmat2, gdst);
-    };
-#endif
+    }
 }
 
-TEST_P(Bitwise_xor, Mat_Mask)
+///////////// Sum ////////////////////////
+TEST(Sum)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src;
+    Scalar cpures, gpures;
+    ocl::oclMat d_src;
+
+    int all_type[] = {CV_8UC1, CV_32SC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_xor(gmat1, gmat2, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
+            gen(src, size, size, all_type[j], 0, 256);
+
+            cpures = sum(src);
+
+            CPU_ON;
+            cpures = sum(src);
+            CPU_OFF;
+            d_src.upload(src);
+
+            WARMUP_ON;
+            gpures = ocl::sum(d_src);
+            WARMUP_OFF;
+
+            GPU_ON;
+            gpures = ocl::sum(d_src);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            gpures = ocl::sum(d_src);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_xor(gmat1, gmat2, gdst, gmask);
-    };
-#endif
 }
 
-TEST_P(Bitwise_xor, Scalar)
+///////////// countNonZero ////////////////////////
+TEST(countNonZero)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src;
+    ocl::oclMat d_src;
+
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_xor(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_xor(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
+            gen(src, size, size, all_type[j], 0, 256);
 
-        if(j == 0)
-        {
-            cout << "no roi:";
+            countNonZero(src);
+
+            CPU_ON;
+            countNonZero(src);
+            CPU_OFF;
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::countNonZero(d_src);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::countNonZero(d_src);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::countNonZero(d_src);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_xor(gmat1, val, gdst);
-    };
-#endif
+
+    }
 }
 
-TEST_P(Bitwise_xor, Scalar_Mask)
+///////////// Phase ////////////////////////
+TEST(Phase)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+
+    int all_type[] = {CV_32FC1};
+    std::string type_name[] = {"CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_xor(mat1_roi, val, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_xor(gmat1, val, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_xor(gmat1, val, gdst, gmask);
-    };
-#endif
-}
 
+            phase(src1, src2, dst, 1);
 
-////////////////////////////////bitwise_not/////////////////////////////////////////////////
+            CPU_ON;
+            phase(src1, src2, dst, 1);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
 
-struct Bitwise_not : ArithmTestBase {};
+            WARMUP_ON;
+            ocl::phase(d_src1, d_src2, d_dst, 1);
+            WARMUP_OFF;
 
-TEST_P(Bitwise_not, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_not(mat1_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_not(gmat1, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::phase(d_src1, d_src2, d_dst, 1);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::phase(d_src1, d_src2, d_dst, 1);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
 
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_not(gmat1, gdst);
-    };
-#endif
+    }
 }
 
-////////////////////////////////compare/////////////////////////////////////////////////
-PARAM_TEST_CASE ( CompareTestBase, MatType, bool)
+///////////// bitwise_and////////////////////////
+TEST(bitwise_and)
 {
-    int type;
-    cv::Scalar val;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mask;
-    cv::Mat dst;
-    cv::Mat dst1; //bak, for two outputs
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    cv::Mat dst1_roi; //bak
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-    cv::ocl::oclMat gdst1_whole; //bak
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gdst1;   //bak
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+
+    int all_type[] = {CV_8UC1, CV_32SC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        //type = GET_PARAM(0);
-        type = CV_8UC1;
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-        cv::RNG &rng = TS::ptr()->get_rng();
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-        cv::Size size(MWIDTH, MHEIGHT);
 
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        //mat2 = randomMat(rng, cv::Size(512,3), type, 5, 16, false);
-        mat2 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        dst1  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+            bitwise_and(src1, src2, dst);
 
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
+            CPU_ON;
+            bitwise_and(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
 
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums>0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
-    }
+            WARMUP_ON;
+            ocl::bitwise_and(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
 
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src2x   = 1;
-            src1y   = 1;
-            src2y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-            maskx	 = 1;
-            masky	= 1;
+            GPU_ON;
+            ocl::bitwise_and(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::bitwise_and(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src2x = 0;
-            src1y = 0;
-            src2y = 0;
-            dstx = 0;
-            dsty = 0;
-            maskx	 = 0;
-            masky	= 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        //mat2_roi = mat2(Rect(src2x,src2y,256,1));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
-
-        //gdst_whole = dst;
-        //gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gdst1_whole = dst1;
-        //gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gmat1 = mat1_roi;
-        //gmat2 = mat2_roi;
-        //gmask = mask_roi;
-    }
 
-};
-struct Compare : CompareTestBase {};
+    }
+}
 
-TEST_P(Compare, Mat)
+///////////// bitwise_or////////////////////////
+TEST(bitwise_or)
 {
-    if(mat1.type() == CV_8SC1)
-    {
-        cout << "\tUnsupported type\t\n";
-    }
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
 
-    int cmp_codes[] = {CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE};
-    const char *cmp_str[] = {"CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE"};
-    int cmp_num = sizeof(cmp_codes) / sizeof(int);
-    for (int i = 0; i < cmp_num; ++i)
-    {
+    int all_type[] = {CV_8UC1, CV_32SC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
 
-#ifndef PRINT_KERNEL_RUN_TIME
-        double totalcputick = 0;
-        double totalgputick = 0;
-        double totalgputick_kernel = 0;
-        double t0 = 0;
-        double t1 = 0;
-        double t2 = 0;
-        for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-        {
-            totalcputick = 0;
-            totalgputick = 0;
-            totalgputick_kernel = 0;
-            for(int j = 0; j < LOOP_TIMES + 1; j ++)
-            {
-                Has_roi(k);
-
-                t0 = (double)cvGetTickCount();//cpu start
-                cv::compare(mat1_roi, mat2_roi, dst_roi, cmp_codes[i]);
-                t0 = (double)cvGetTickCount() - t0;//cpu end
-
-                t1 = (double)cvGetTickCount();//gpu start1
-                gdst_whole = dst;
-                gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-                gmat1 = mat1_roi;
-                gmat2 = mat2_roi;
-                t2 = (double)cvGetTickCount(); //kernel
-                cv::ocl::compare(gmat1, gmat2, gdst, cmp_codes[i]);
-                t2 = (double)cvGetTickCount() - t2;//kernel
-                cv::Mat cpu_dst;
-                gdst_whole.download (cpu_dst);//download
-                t1 = (double)cvGetTickCount() - t1;//gpu end1
-                if(j == 0)
-                    continue;
-                totalgputick = t1 + totalgputick;
-                totalcputick = t0 + totalcputick;
-                totalgputick_kernel = t2 + totalgputick_kernel;
-
-            }
-            cout << cmp_str[i] << endl;
-            if(k == 0)
-            {
-                cout << "no roi\n";
-            }
-            else
-            {
-                cout << "with roi\n";
-            };
-            cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        }
-#else
-        for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(j);
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            if(j == 0)
-            {
-                cout << "no roi:";
-            }
-            else
-            {
-                cout << "\nwith roi:";
-            };
-            cv::ocl::compare(gmat1, gmat2, gdst, cmp_codes[i]);
-        };
-#endif
-    }
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-}
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-struct Pow : ArithmTestBase {};
 
-TEST_P(Pow, Mat)
-{
-    if(mat1.depth() != CV_32F && mat1.depth() != CV_64F)
-    {
-        cout << "\tUnsupported type\t\n";
-    }
+            bitwise_or(src1, src2, dst);
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            double p = 4.5;
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::pow(mat1_roi, p, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::pow(gmat1, p, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
+            bitwise_or(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
 
+            WARMUP_ON;
+            ocl::bitwise_or(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::bitwise_or(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::bitwise_or(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        double p = 4.5;
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::pow(gmat1, p, gdst);
-    };
-#endif
 }
 
-
-struct MagnitudeSqr : ArithmTestBase {};
-
-TEST_P(MagnitudeSqr, Mat)
+///////////// bitwise_xor////////////////////////
+TEST(bitwise_xor)
 {
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    int all_type[] = {CV_8UC1, CV_32SC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-            t0 = (double)cvGetTickCount();//cpu start
-            for(int i = 0; i < mat1.rows; ++i)
-                for(int j = 0; j < mat1.cols; ++j)
-                {
-                    float val1 = mat1.at<float>(i, j);
-                    float val2 = mat2.at<float>(i, j);
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-                    ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
 
-                }
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            cv::ocl::oclMat clmat1(mat1), clmat2(mat2), cldst;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::magnitudeSqr(clmat1, clmat2, cldst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            cldst.download(cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            bitwise_xor(src1, src2, dst);
 
+            CPU_ON;
+            bitwise_xor(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::bitwise_xor(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::bitwise_xor(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::bitwise_xor(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        cv::ocl::oclMat clmat1(mat1), clmat2(mat2), cldst;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::magnitudeSqr(clmat1, clmat2, cldst);
-    };
-#endif
 
+    }
 }
 
+///////////// bitwise_not////////////////////////
+TEST(bitwise_not)
+{
+    Mat src1, dst;
+    ocl::oclMat d_src1, d_dst;
 
-struct AddWeighted : ArithmTestBase {};
+    int all_type[] = {CV_8UC1, CV_32SC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
 
-TEST_P(AddWeighted, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-            double alpha = 2.0, beta = 1.0, gama = 3.0;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::addWeighted(mat1_roi, alpha, mat2_roi, beta, gama, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
 
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            bitwise_not(src1, dst);
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
+            CPU_ON;
+            bitwise_not(src1, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
 
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::addWeighted(gmat1, alpha, gmat2, beta, gama, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download(cpu_dst);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::bitwise_not(d_src1, d_dst);
+            WARMUP_OFF;
 
-        }
+            GPU_ON;
+            ocl::bitwise_not(d_src1, d_dst);
+             ;
+            GPU_OFF;
 
-        if(k == 0)
-        {
-            cout << "no roi\n";
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            ocl::bitwise_not(d_src1, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
+}
+
+///////////// compare////////////////////////
+TEST(compare)
+{
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+
+    int CMP_EQ = 0;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        Has_roi(j);
-        double alpha = 2.0, beta = 1.0, gama = 3.0;
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            cout << "no roi:";
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
+
+
+            compare(src1, src2, dst, CMP_EQ);
+
+            CPU_ON;
+            compare(src1, src2, dst, CMP_EQ);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::addWeighted(gmat1, alpha, gmat2, beta, gama, gdst);
-        // double alpha=2.0,beta=1.0,gama=3.0;
-        // cv::ocl::oclMat clmat1(mat1),clmat2(mat2),cldst;
-        // if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
-        // cv::ocl::addWeighted(clmat1,alpha,clmat2,beta,gama, cldst);
-    };
-#endif
 
+    }
 }
-/*
-struct AddWeighted : ArithmTestBase {};
 
-TEST_P(AddWeighted, Mat)
+///////////// pow ////////////////////////
+TEST(pow)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick=0;
-    double totalgputick=0;
-    double totalgputick_kernel=0;
-    double t0=0;
-    double t1=0;
-    double t2=0;
-    for(int j = 0; j < LOOP_TIMES+1; j ++)
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_32FC1};
+    std::string type_name[] = {"CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        double alpha=2.0,beta=1.0,gama=3.0;
-
-        t0 = (double)cvGetTickCount();//cpu start
-        cv::addWeighted(mat1,alpha,mat2,beta,gama,dst);
-        t0 = (double)cvGetTickCount() - t0;//cpu end
-
-        t1 = (double)cvGetTickCount();//gpu start1
-        cv::ocl::oclMat clmat1(mat1),clmat2(mat2),cldst;
-
-        t2=(double)cvGetTickCount();//kernel
-        cv::ocl::addWeighted(clmat1,alpha,clmat2,beta,gama, cldst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-        cv::Mat cpu_dst;
-        cldst.download(cpu_dst);
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-        if(j == 0)
-            continue;
-        totalgputick=t1+totalgputick;
-        totalcputick=t0+totalcputick;
-        totalgputick_kernel=t2+totalgputick_kernel;
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-    }
-    cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
-    cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
-
-#else
-    //for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    //	{
-    double alpha=2.0,beta=1.0,gama=3.0;
-    cv::ocl::oclMat clmat1(mat1),clmat2(mat2),cldst;
-    //if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
-    cv::ocl::addWeighted(clmat1,alpha,clmat2,beta,gama, cldst);
-    //	};
-#endif
+            gen(src, size, size, all_type[j], 0, 100);
+            gen(dst, size, size, all_type[j], 0, 100);
 
-}
+            pow(src, -2.0, dst);
+
+            CPU_ON;
+            pow(src, -2.0, dst);
+            CPU_OFF;
+            d_src.upload(src);
+            d_dst.upload(dst);
 
-*/
-//********test****************
+            WARMUP_ON;
+            ocl::pow(d_src, -2.0, d_dst);
+            WARMUP_OFF;
 
-INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(
-                            Values(CV_8UC1, CV_8UC4),
-                            Values(false))); // Values(false) is the reserved parameter
+            GPU_ON;
+            ocl::pow(d_src, -2.0, d_dst);
+             ;
+            GPU_OFF;
 
-INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine(
-                            Values(CV_32FC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::pow(d_src, -2.0, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
 
-INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine(
-                            Values(CV_32FC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
+    }
+}
 
-INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1,  CV_32FC4),
-                            Values(false)));
+///////////// MagnitudeSqr////////////////////////
+TEST(MagnitudeSqr)
+{
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
 
-INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
+    int all_type[] = {CV_32FC1};
+    std::string type_name[] = {"CV_32FC1"};
 
-INSTANTIATE_TEST_CASE_P(Arithm, Div, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t t = 0; t < sizeof(all_type) / sizeof(int); t++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[t];
 
+            gen(src1, size, size, all_type[t], 0, 256);
+            gen(src2, size, size, all_type[t], 0, 256);
+            gen(dst, size, size, all_type[t], 0, 256);
 
-INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
 
-INSTANTIATE_TEST_CASE_P(Arithm, CartToPolar, Combine(
-                            Values(CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
+            for (int i = 0; i < src1.rows; ++i)
 
-INSTANTIATE_TEST_CASE_P(Arithm, PolarToCart, Combine(
-                            Values(CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
+                for (int j = 0; j < src1.cols; ++j)
+                {
+                    float val1 = src1.at<float>(i, j);
+                    float val2 = src2.at<float>(i, j);
 
-INSTANTIATE_TEST_CASE_P(Arithm, Magnitude, Combine(
-                            Values(CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
+                    ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
 
-INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
+                }
 
-INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
+            CPU_ON;
 
-INSTANTIATE_TEST_CASE_P(Arithm, MinMax, Combine(
-                            Values(CV_8UC1, CV_32FC1),
-                            Values(false)));
+            for (int i = 0; i < src1.rows; ++i)
+                for (int j = 0; j < src1.cols; ++j)
+                {
+                    float val1 = src1.at<float>(i, j);
+                    float val2 = src2.at<float>(i, j);
 
-INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine(
-                            Values(CV_8UC1, CV_32FC1),
-                            Values(false)));
+                    ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
 
-INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine(
-                            Values(CV_8U, CV_32S, CV_32F),
-                            Values(false)));
+                }
 
-INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine(
-                            Values(CV_8U, CV_32S, CV_32F),
-                            Values(false)));
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
 
+            WARMUP_ON;
+            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
 
-INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
+            GPU_ON;
+            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
 
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(
-                            Values(CV_8UC1, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
+    }
+}
 
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
+///////////// AddWeighted////////////////////////
+TEST(AddWeighted)
+{
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
 
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(
-                            Values(CV_8UC1, CV_32SC1, CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
+    double alpha = 2.0, beta = 1.0, gama = 3.0;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
 
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(
-                            Values(CV_8UC1, CV_32SC1, CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1), Values(false)));
-//Values(false) is the reserved parameter
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
 
-INSTANTIATE_TEST_CASE_P(Arithm, MagnitudeSqr, Combine(
-                            Values(CV_32FC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
+            addWeighted(src1, alpha, src2, beta, gama, dst);
 
-INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine(
-                            Values(CV_8UC1, CV_32SC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
+            CPU_ON;
+            addWeighted(src1, alpha, src2, beta, gama, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
 
+            WARMUP_ON;
+            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
 
-#endif // HAVE_OPENCL
+    }
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_blend.cpp b/modules/ocl/perf/perf_blend.cpp
index f78f7d6b2c..00034700b4 100644
--- a/modules/ocl/perf/perf_blend.cpp
+++ b/modules/ocl/perf/perf_blend.cpp
@@ -44,79 +44,77 @@
 //M*/
 
 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-PARAM_TEST_CASE(Blend, MatType, int)
+///////////// blend ////////////////////////
+template <typename T>
+void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold)
 {
-    int type;
-    int channels;
-    std::vector<cv::ocl::Info> oclinfo;
+    result_gold.create(img1.size(), img1.type());
+
+    int cn = img1.channels();
 
-    virtual void SetUp()
+    for (int y = 0; y < img1.rows; ++y)
     {
+        const float *weights1_row = weights1.ptr<float>(y);
+        const float *weights2_row = weights2.ptr<float>(y);
+        const T *img1_row = img1.ptr<T>(y);
+        const T *img2_row = img2.ptr<T>(y);
+        T *result_gold_row = result_gold.ptr<T>(y);
 
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        //cv::ocl::setBinpath(CLBINPATH);
+        for (int x = 0; x < img1.cols * cn; ++x)
+        {
+            float w1 = weights1_row[x / cn];
+            float w2 = weights2_row[x / cn];
+            result_gold_row[x] = static_cast<T>((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f));
+        }
     }
-};
-
-TEST_P(Blend, Performance)
+}
+TEST(blend)
 {
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat img1_host = randomMat(size, CV_MAKETYPE(type, channels), 0, type == CV_8U ? 255.0 : 1.0);
-    cv::Mat img2_host = randomMat(size, CV_MAKETYPE(type, channels), 0, type == CV_8U ? 255.0 : 1.0);
-    cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
-    cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
-    cv::ocl::oclMat gimg1(size, CV_MAKETYPE(type, channels)), gimg2(size, CV_MAKETYPE(type, channels)), gweights1(size, CV_32F), gweights2(size, CV_32F);
-    cv::ocl::oclMat gdst(size, CV_MAKETYPE(type, channels));
+    Mat src1, src2, weights1, weights2, dst;
+    ocl::oclMat d_src1, d_src2, d_weights1, d_weights2, d_dst;
 
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    double totalgputick_all = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for (int j = 0; j < LOOP_TIMES + 1; j ++) //LOOP_TIMES=100
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        t1 = (double)cvGetTickCount();
-        cv::ocl::oclMat gimg1 = cv::ocl::oclMat(img1_host);
-        cv::ocl::oclMat gimg2 = cv::ocl::oclMat(img2_host);
-        cv::ocl::oclMat gweights1 = cv::ocl::oclMat(weights1);
-        cv::ocl::oclMat gweights2 = cv::ocl::oclMat(weights1);
-
-        t2 = (double)cvGetTickCount();
-        cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, gdst);
-        t2 = (double)cvGetTickCount() - t2;
-
-        cv::Mat m;
-        gdst.download(m);
-        t1 = (double)cvGetTickCount() - t1;
-
-        if (j == 0)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            continue;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " and CV_32FC1";
+
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(weights1, size, size, CV_32FC1, 0, 1);
+            gen(weights2, size, size, CV_32FC1, 0, 1);
+
+            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
+
+            CPU_ON;
+            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
+            CPU_OFF;
+
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            d_weights1.upload(weights1);
+            d_weights2.upload(weights2);
+
+            WARMUP_ON;
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            d_weights1.upload(weights1);
+            d_weights2.upload(weights2);
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-
-        totalgputick_all = t1 + totalgputick_all;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-    };
-
-    cout << "average gpu total  runtime is  " << totalgputick_all / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-    cout << "average gpu runtime without data transfering  is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-#endif
\ No newline at end of file
+    }
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_brute_force_matcher.cpp b/modules/ocl/perf/perf_brute_force_matcher.cpp
new file mode 100644
index 0000000000..6562f91e43
--- /dev/null
+++ b/modules/ocl/perf/perf_brute_force_matcher.cpp
@@ -0,0 +1,150 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+//////////////////// BruteForceMatch /////////////////
+TEST(BruteForceMatcher)
+{
+    Mat trainIdx_cpu;
+    Mat distance_cpu;
+    Mat allDist_cpu;
+    Mat nMatches_cpu;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        // Init CPU matcher
+        int desc_len = 64;
+
+        BFMatcher matcher(NORM_L2);
+
+        Mat query;
+        gen(query, size, desc_len, CV_32F, 0, 1);
+
+        Mat train;
+        gen(train, size, desc_len, CV_32F, 0, 1);
+        // Output
+        vector< vector<DMatch> > matches(2);
+        // Init GPU matcher
+        ocl::BruteForceMatcher_OCL_base d_matcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
+
+        ocl::oclMat d_query(query);
+        ocl::oclMat d_train(train);
+
+        ocl::oclMat d_trainIdx, d_distance, d_allDist, d_nMatches;
+
+        SUBTEST << size << "; match";
+
+        matcher.match(query, train, matches[0]);
+
+        CPU_ON;
+        matcher.match(query, train, matches[0]);
+        CPU_OFF;
+
+        WARMUP_ON;
+        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+        WARMUP_OFF;
+
+        GPU_ON;
+        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+         ;
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.match(d_query, d_train, matches[0]);
+        GPU_FULL_OFF;
+
+        SUBTEST << size << "; knnMatch";
+
+        matcher.knnMatch(query, train, matches, 2);
+
+        CPU_ON;
+        matcher.knnMatch(query, train, matches, 2);
+        CPU_OFF;
+
+        WARMUP_ON;
+        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+        WARMUP_OFF;
+
+        GPU_ON;
+        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+         ;
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.knnMatch(d_query, d_train, matches, 2);
+        GPU_FULL_OFF;
+
+        SUBTEST << size << "; radiusMatch";
+
+        float max_distance = 2.0f;
+
+        matcher.radiusMatch(query, train, matches, max_distance);
+
+        CPU_ON;
+        matcher.radiusMatch(query, train, matches, max_distance);
+        CPU_OFF;
+
+        d_trainIdx.release();
+
+        WARMUP_ON;
+        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
+        WARMUP_OFF;
+
+        GPU_ON;
+        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
+         ;
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.radiusMatch(d_query, d_train, matches, max_distance);
+        GPU_FULL_OFF;
+    }
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_canny.cpp b/modules/ocl/perf/perf_canny.cpp
index eb895df5ec..428e036d0c 100644
--- a/modules/ocl/perf/perf_canny.cpp
+++ b/modules/ocl/perf/perf_canny.cpp
@@ -42,112 +42,42 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-    { \
-    public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-    private: \
-    type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-    }
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
-
-////////////////////////////////////////////////////////
-// Canny1
-extern std::string workdir;
-IMPLEMENT_PARAM_CLASS(AppertureSize, int);
-IMPLEMENT_PARAM_CLASS(L2gradient, bool);
-
-PARAM_TEST_CASE(Canny1, AppertureSize, L2gradient)
-{
-    int apperture_size;
-    bool useL2gradient;
-    //std::vector<cv::ocl::Info> oclinfo;
-
-    virtual void SetUp()
-    {
-        apperture_size = GET_PARAM(0);
-        useL2gradient = GET_PARAM(1);
-
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-};
 
-TEST_P(Canny1, Performance)
+///////////// Canny ////////////////////////
+TEST(Canny)
 {
-    cv::Mat img = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
+    Mat img = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE);
 
-    double low_thresh = 100.0;
-    double high_thresh = 150.0;
-
-    cv::Mat edges_gold;
-    cv::ocl::oclMat edges;
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
+    if (img.empty())
     {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::Canny(ocl_img, edges, low_thresh, high_thresh, apperture_size, useL2gradient);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        edges.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
+        throw runtime_error("can't open aloeL.jpg");
     }
 
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+    SUBTEST << img.cols << 'x' << img.rows << "; aloeL.jpg" << "; edges" << "; CV_8UC1";
 
+    Mat edges(img.size(), CV_8UC1);
 
-}
+    CPU_ON;
+    Canny(img, edges, 50.0, 100.0);
+    CPU_OFF;
 
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny1, testing::Combine(
-                            testing::Values(AppertureSize(3), AppertureSize(5)),
-                            testing::Values(L2gradient(false), L2gradient(true))));
+    ocl::oclMat d_img(img);
+    ocl::oclMat d_edges;
+    ocl::CannyBuf d_buf;
 
+    WARMUP_ON;
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+    WARMUP_OFF;
 
+    GPU_ON;
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+     ;
+    GPU_OFF;
 
-#endif  //Have opencl
\ No newline at end of file
+    GPU_FULL_ON;
+    d_img.upload(img);
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+    d_edges.download(edges);
+    GPU_FULL_OFF;
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_color.cpp b/modules/ocl/perf/perf_color.cpp
new file mode 100644
index 0000000000..e32a1839d8
--- /dev/null
+++ b/modules/ocl/perf/perf_color.cpp
@@ -0,0 +1,91 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+///////////// cvtColor////////////////////////
+TEST(cvtColor)
+{
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_8UC4};
+    std::string type_name[] = {"CV_8UC4"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            gen(src, size, size, all_type[j], 0, 256);
+            SUBTEST << size << "x" << size << "; " << type_name[j] << " ; CV_RGBA2GRAY";
+
+            cvtColor(src, dst, CV_RGBA2GRAY, 4);
+
+            CPU_ON;
+            cvtColor(src, dst, CV_RGBA2GRAY, 4);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
+
+
+    }
+
+
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_columnsum.cpp b/modules/ocl/perf/perf_columnsum.cpp
index 96ea26a503..d2e3b45e53 100644
--- a/modules/ocl/perf/perf_columnsum.cpp
+++ b/modules/ocl/perf/perf_columnsum.cpp
@@ -15,8 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//	   Fangfang Bai fangfang@multicorewareinc.com
-//
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -31,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -43,78 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-///////////////////////////////////////////////////////////////////////////////
-/// ColumnSum
 
-#ifdef HAVE_OPENCL
-
-////////////////////////////////////////////////////////////////////////
-// ColumnSum
-
-PARAM_TEST_CASE(ColumnSum)
+///////////// columnSum////////////////////////
+TEST(columnSum)
 {
-    cv::Mat src;
-    //std::vector<cv::ocl::Info> oclinfo;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
+        SUBTEST << size << 'x' << size << "; CV_32FC1";
+
+        gen(src, size, size, CV_32FC1, 0, 256);
+
+        CPU_ON;
+        dst.create(src.size(), src.type());
+
+        for (int i = 1; i < src.rows; ++i)
+        {
+            for (int j = 0; j < src.cols; ++j)
+            {
+                dst.at<float>(i, j) = src.at<float>(i, j) += src.at<float>(i - 1, j);
+            }
+        }
+
+        CPU_OFF;
+
+        d_src.upload(src);
+        WARMUP_ON;
+        ocl::columnSum(d_src, d_dst);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::columnSum(d_src, d_dst);
+         ;
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::columnSum(d_src, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
     }
-};
-
-TEST_F(ColumnSum, Performance)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat src = randomMat(size, CV_32FC1);
-    cv::ocl::oclMat d_dst;
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat d_src(src);
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::columnSum(d_src, d_dst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        d_dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
-    }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-
-}
-
-
-
-#endif
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_fft.cpp b/modules/ocl/perf/perf_fft.cpp
index c9c19d0d49..50be2546ee 100644
--- a/modules/ocl/perf/perf_fft.cpp
+++ b/modules/ocl/perf/perf_fft.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Fangfangbai, fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -42,85 +42,48 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-using namespace std;
-#ifdef HAVE_CLAMDFFT
-////////////////////////////////////////////////////////////////////////////
-// Dft
-PARAM_TEST_CASE(Dft, cv::Size, bool)
-{
-    cv::Size dft_size;
-    bool	 dft_rows;
-    vector<cv::ocl::Info> info;
-    virtual void SetUp()
-    {
-        dft_size = GET_PARAM(0);
-        dft_rows = GET_PARAM(1);
-        cv::ocl::getDevice(info);
-    }
-};
 
-TEST_P(Dft, C2C)
+///////////// dft ////////////////////////
+TEST(dft)
 {
-    cv::Mat a = randomMat(dft_size, CV_32FC2, 0.0, 10.0);
-    int flags = 0;
-    flags |= dft_rows ? cv::DFT_ROWS : 0;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-    cv::ocl::oclMat d_b;
+    int all_type[] = {CV_32FC1, CV_32FC2};
+    std::string type_name[] = {"CV_32FC1", "CV_32FC2"};
 
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; complex-to-complex";
 
-        t1 = (double)cvGetTickCount();//gpu start1
+            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(1));
 
-        cv::ocl::oclMat ga = cv::ocl::oclMat(a); //upload
+            dft(src, dst);
 
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::dft(ga, d_b, a.size(), flags);
-        t2 = (double)cvGetTickCount() - t2;//kernel
+            CPU_ON;
+            dft(src, dst);
+            CPU_OFF;
 
-        cv::Mat cpu_dst;
-        d_b.download (cpu_dst);//download
+            d_src.upload(src);
 
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
+            WARMUP_ON;
+            ocl::dft(d_src, d_dst, Size(size, size));
+            WARMUP_OFF;
 
-        if(j == 0)
-            continue;
+            GPU_ON;
+            ocl::dft(d_src, d_dst, Size(size, size));
+             ;
+            GPU_OFF;
 
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::dft(d_src, d_dst, Size(size, size));
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
 
     }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
-
-
-
-TEST_P(Dft, R2CthenC2R)
-{
-    cv::Mat a = randomMat(dft_size, CV_32FC1, 0.0, 10.0);
-
-    int flags = 0;
-    //flags |= dft_rows ? cv::DFT_ROWS : 0; // not supported yet
-
-    cv::ocl::oclMat d_b, d_c;
-
-    cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), flags);
-    cv::ocl::dft(d_b, d_c, a.size(), flags + cv::DFT_INVERSE + cv::DFT_REAL_OUTPUT);
-
-    EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4, "");
-}
-
-//INSTANTIATE_TEST_CASE_P(ocl_DFT, Dft, testing::Combine(
-//						testing::Values(cv::Size(1280, 1024), cv::Size(1920, 1080),cv::Size(1800, 1500)),
-//						testing::Values(false, true)));
-
-#endif // HAVE_CLAMDFFT
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_filters.cpp b/modules/ocl/perf/perf_filters.cpp
index 100a1c59d9..e9646c77e2 100644
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@@ -10,15 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Zero Lin, Zero.Lin@amd.com
-//    Zhang Ying, zhangying913@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -33,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -45,1165 +42,331 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-//using namespace cv::ocl;
-
-PARAM_TEST_CASE(FilterTestBase, MatType, bool)
-{
-    int type;
-    cv::Scalar val;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mask;
-    cv::Mat dst;
-    cv::Mat dst1; //bak, for two outputs
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    cv::Mat dst1_roi; //bak
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-    cv::ocl::oclMat gdst1_whole; //bak
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gdst1;   //bak
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        mat2 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        dst1  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
-
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
-    }
-
-    void random_roi()
-    {
-        cv::RNG &rng = TS::ptr()->get_rng();
-
-        //randomize ROI
-        roicols = rng.uniform(1, mat1.cols);
-        roirows = rng.uniform(1, mat1.rows);
-        src1x   = rng.uniform(0, mat1.cols - roicols);
-        src1y   = rng.uniform(0, mat1.rows - roirows);
-        src2x   = rng.uniform(0, mat2.cols - roicols);
-        src2y   = rng.uniform(0, mat2.rows - roirows);
-        dstx    = rng.uniform(0, dst.cols  - roicols);
-        dsty    = rng.uniform(0, dst.rows  - roirows);
-        maskx   = rng.uniform(0, mask.cols - roicols);
-        masky   = rng.uniform(0, mask.rows - roirows);
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
-
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-    }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// blur
-
-PARAM_TEST_CASE(Blur, MatType, cv::Size, int)
+///////////// Blur////////////////////////
+TEST(Blur)
 {
-    int type;
-    cv::Size ksize;
-    int bordertype;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        ksize = GET_PARAM(1);
-        bordertype = GET_PARAM(2);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
+    Mat src1, dst;
+    ocl::oclMat d_src1, d_dst;
 
+    Size ksize = Size(3, 3);
+    int bordertype = BORDER_CONSTANT;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    void Has_roi(int b)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        if(b)
-        {
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-        }
-        else
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-
-    }
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-};
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-TEST_P(Blur, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::blur(mat1_roi, dst_roi, ksize, Point(-1, -1), bordertype);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            blur(src1, dst, ksize, Point(-1, -1), bordertype);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            CPU_ON;
+            blur(src1, dst, ksize, Point(-1, -1), bordertype);
+            CPU_OFF;
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::blur(gmat1, gdst, ksize, Point(-1, -1), bordertype);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            d_src1.upload(src1);
 
-            if(j == 0)
-                continue;
+            WARMUP_ON;
+            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
+            WARMUP_OFF;
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::blur(gmat1, gdst, ksize, Point(-1, -1), bordertype);
-    };
-#endif
 
+    }
 }
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//Laplacian
-
-PARAM_TEST_CASE(LaplacianTestBase, MatType, int)
+///////////// Laplacian////////////////////////
+TEST(Laplacian)
 {
-    int type;
-    int ksize;
-
-    //src mat
-    cv::Mat mat;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-    cv::Mat dst_roi;
-    std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        ksize = GET_PARAM(1);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        mat  = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
-    {
-        if(b)
-        {
-            roicols =  mat.cols - 1;
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcy   = 1;
-            dstx    = 1;
-            dsty    = 1;
-        }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx = 0;
-            srcy = 0;
-            dstx = 0;
-            dsty = 0;
-        };
+    Mat src1, dst;
+    ocl::oclMat d_src1, d_dst;
 
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
+    int ksize = 3;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    }
-
-};
-
-struct Laplacian : LaplacianTestBase {};
-
-TEST_P(Laplacian, Accuracy)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::Laplacian(mat_roi, dst_roi, -1, ksize, 1);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-            gmat = mat_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::Laplacian(gmat, gdst, -1, ksize, 1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            Laplacian(src1, dst, -1, ksize, 1);
 
-            if(j == 0)
-                continue;
+            CPU_ON;
+            Laplacian(src1, dst, -1, ksize, 1);
+            CPU_OFF;
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            d_src1.upload(src1);
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat = mat_roi;
+            WARMUP_ON;
+            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
+             ;
+            GPU_OFF;
 
-        if(j == 0)
-        {
-            cout << "no roi:";
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::Laplacian(gmat, gdst, -1, ksize, 1);
-    };
-#endif
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// erode & dilate
 
-PARAM_TEST_CASE(ErodeDilateBase, MatType, bool)
-{
-    int type;
-    //int iterations;
-
-    //erode or dilate kernel
-    cv::Mat kernel;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        //  iterations = GET_PARAM(1);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //		rng.fill(kernel, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(3));
-        kernel = randomMat(rng, Size(3, 3), CV_8UC1, 0, 3, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
     }
+}
 
-    void Has_roi(int b)
-    {
-        if(b)
-        {
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-
-    }
-
-};
-
-// erode
+///////////// Erode ////////////////////
+TEST(Erode)
+{
+    Mat src, dst, ker;
+    ocl::oclMat d_src, d_dst;
 
-struct Erode : ErodeDilateBase {};
+    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
 
-TEST_P(Erode, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::erode(mat1_roi, dst_roi, kernel);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(256));
+            ker = getStructuringElement(MORPH_RECT, Size(3, 3));
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            erode(src, dst, ker);
 
-            gmat1 = mat1_roi;
+            CPU_ON;
+            erode(src, dst, ker);
+            CPU_OFF;
 
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::erode(gmat1, gdst, kernel);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            d_src.upload(src);
 
-            if(j == 0)
-                continue;
+            WARMUP_ON;
+            ocl::erode(d_src, d_dst, ker);
+            WARMUP_OFF;
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::erode(d_src, d_dst, ker);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::erode(d_src, d_dst, ker);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::erode(gmat1, gdst, kernel);
-    };
-#endif
-
-}
-
-// dilate
 
-struct Dilate : ErodeDilateBase {};
-
-TEST_P(Dilate, Mat)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::dilate(mat1_roi, dst_roi, kernel);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::dilate(gmat1, gdst, kernel);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::dilate(gmat1, gdst, kernel);
-    };
-#endif
-
 }
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Sobel
-
-PARAM_TEST_CASE(Sobel, MatType, int, int, int, int)
+///////////// Sobel ////////////////////////
+TEST(Sobel)
 {
-    int type;
-    int dx, dy, ksize, bordertype;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        dx = GET_PARAM(1);
-        dy = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        bordertype = GET_PARAM(4);
-        dx = 2;
-        dy = 0;
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-    void Has_roi(int b)
-    {
-        if(b)
-        {
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-        };
+    int dx = 1;
+    int dy = 1;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-
-    }
-
-};
-
-TEST_P(Sobel, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::Sobel(mat1_roi, dst_roi, -1, dx, dy, ksize, /*scale*/0.00001,/*delta*/0, bordertype);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            Sobel(src, dst, -1, dx, dy);
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::Sobel(gmat1, gdst, -1, dx, dy, ksize,/*scale*/0.00001,/*delta*/0, bordertype);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            CPU_ON;
+            Sobel(src, dst, -1, dx, dy);
+            CPU_OFF;
 
-            if(j == 0)
-                continue;
+            d_src.upload(src);
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::Sobel(d_src, d_dst, -1, dx, dy);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::Sobel(d_src, d_dst, -1, dx, dy);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::Sobel(d_src, d_dst, -1, dx, dy);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::Sobel(gmat1, gdst, -1, dx, dy, ksize,/*scale*/0.00001,/*delta*/0, bordertype);
-    };
-#endif
 
+    }
 }
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Scharr
-
-PARAM_TEST_CASE(Scharr, MatType, int, int, int)
+///////////// Scharr ////////////////////////
+TEST(Scharr)
 {
-    int type;
-    int dx, dy, bordertype;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        dx = GET_PARAM(1);
-        dy = GET_PARAM(2);
-        bordertype = GET_PARAM(3);
-        dx = 1;
-        dy = 0;
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
-    {
-        if(b)
-        {
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-        };
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
+    int dx = 1;
+    int dy = 0;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    }
-};
-
-TEST_P(Scharr, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::Scharr(mat1_roi, dst_roi, -1, dx, dy, /*scale*/1,/*delta*/0, bordertype);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            Scharr(src, dst, -1, dx, dy);
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::Scharr(gmat1, gdst, -1, dx, dy,/*scale*/1,/*delta*/0, bordertype);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            CPU_ON;
+            Scharr(src, dst, -1, dx, dy);
+            CPU_OFF;
 
-            if(j == 0)
-                continue;
+            d_src.upload(src);
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::Scharr(d_src, d_dst, -1, dx, dy);
+            WARMUP_OFF;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
+            GPU_ON;
+            ocl::Scharr(d_src, d_dst, -1, dx, dy);
+             ;
+            GPU_OFF;
 
-        if(j == 0)
-        {
-            cout << "no roi:";
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::Scharr(d_src, d_dst, -1, dx, dy);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::Scharr(gmat1, gdst, -1, dx, dy,/*scale*/1,/*delta*/0, bordertype);
-    };
-#endif
 
+    }
 }
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// GaussianBlur
-
-PARAM_TEST_CASE(GaussianBlur, MatType, cv::Size, int)
+///////////// GaussianBlur ////////////////////////
+TEST(GaussianBlur)
 {
-    int type;
-    cv::Size ksize;
-    int bordertype;
-
-    double sigma1, sigma2;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        ksize = GET_PARAM(1);
-        bordertype = GET_PARAM(2);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        sigma1 = rng.uniform(0.1, 1.0);
-        sigma2 = rng.uniform(0.1, 1.0);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
+    Mat src, dst;
+    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
 
-    void Has_roi(int b)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        if(b)
-        {
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-        }
-        else
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-        };
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
+            gen(src, size, size, all_type[j], 0, 256);
 
-    }
+            GaussianBlur(src, dst, Size(9, 9), 0);
 
-};
+            CPU_ON;
+            GaussianBlur(src, dst, Size(9, 9), 0);
+            CPU_OFF;
 
-TEST_P(GaussianBlur, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::GaussianBlur(mat1_roi, dst_roi, ksize, sigma1, sigma2, bordertype);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::GaussianBlur(gmat1, gdst, ksize, sigma1, sigma2, bordertype);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst(src.size(), src.type());
+            ocl::oclMat d_buf;
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::GaussianBlur(gmat1, gdst, ksize, sigma1, sigma2, bordertype);
-    };
-#endif
 
+    }
 }
 
-//************test**********
-
-INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                        Values(cv::Size(3, 3)/*, cv::Size(5, 5), cv::Size(7, 7)*/),
-                        Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101)));
+///////////// filter2D////////////////////////
+TEST(filter2D)
+{
+    Mat src;
 
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        int all_type[] = {CV_8UC1, CV_8UC4};
+        std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-INSTANTIATE_TEST_CASE_P(Filters, Laplacian, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(1/*, 3*/)));
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            gen(src, size, size, all_type[j], 0, 256);
 
-//INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3)));
+            for (int ksize = 3; ksize <= 15; ksize = 2*ksize+1)
+            {
+                SUBTEST << "ksize = " << ksize << "; " << size << 'x' << size << "; " << type_name[j] ;
 
-INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
+                Mat kernel;
+                gen(kernel, ksize, ksize, CV_32FC1, 0.0, 1.0);
 
-//INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3)));
+                Mat dst;
+                cv::filter2D(src, dst, -1, kernel);
 
-INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
+                CPU_ON;
+                cv::filter2D(src, dst, -1, kernel);
+                CPU_OFF;
 
+                ocl::oclMat d_src(src);
+                ocl::oclMat d_dst;
 
-INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(Values(CV_8UC1, CV_32FC1),
-                        Values(1, 2), Values(0, 1), Values(3, 5), Values((MatType)cv::BORDER_CONSTANT,
-                                (MatType)cv::BORDER_REPLICATE)));
+                WARMUP_ON;
+                ocl::filter2D(d_src, d_dst, -1, kernel);
+                WARMUP_OFF;
 
+                GPU_ON;
+                ocl::filter2D(d_src, d_dst, -1, kernel);
+                 ;
+                GPU_OFF;
 
-INSTANTIATE_TEST_CASE_P(Filter, Scharr, Combine(
-                            Values(CV_8UC1,  CV_32FC1), Values(0, 1), Values(0, 1),
-                            Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
+                GPU_FULL_ON;
+                d_src.upload(src);
+                ocl::filter2D(d_src, d_dst, -1, kernel);
+                d_dst.download(dst);
+                GPU_FULL_OFF;
+            }
 
-INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine(
-                            Values(CV_8UC1,  CV_32FC1),
-                            Values(cv::Size(3, 3), cv::Size(5, 5)),
-                            Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
+        }
 
 
-#endif // HAVE_OPENCL
+    }
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_gemm.cpp b/modules/ocl/perf/perf_gemm.cpp
index c3dcab34fe..930ecb0464 100644
--- a/modules/ocl/perf/perf_gemm.cpp
+++ b/modules/ocl/perf/perf_gemm.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -41,73 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
-
 #include "precomp.hpp"
-using namespace std;
-#ifdef HAVE_CLAMDBLAS
-////////////////////////////////////////////////////////////////////////////
-// GEMM
-PARAM_TEST_CASE(Gemm, int, cv::Size, int)
-{
-    int      type;
-    cv::Size mat_size;
-    int		 flags;
-    vector<cv::ocl::Info> info;
-    virtual void SetUp()
-    {
-        type     = GET_PARAM(0);
-        mat_size = GET_PARAM(1);
-        flags    = GET_PARAM(2);
 
-        cv::ocl::getDevice(info);
-    }
-};
-
-TEST_P(Gemm, Performance)
+///////////// gemm ////////////////////////
+TEST(gemm)
 {
-    cv::Mat a = randomMat(mat_size, type, 0.0, 10.0);
-    cv::Mat b = randomMat(mat_size, type, 0.0, 10.0);
-    cv::Mat c = randomMat(mat_size, type, 0.0, 10.0);
-    cv::ocl::oclMat ocl_dst;
+    Mat src1, src2, src3, dst;
+    ocl::oclMat d_src1, d_src2, d_src3, d_dst;
 
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ga = cv::ocl::oclMat(a);//upload
-        cv::ocl::oclMat gb = cv::ocl::oclMat(b);//upload
-        cv::ocl::oclMat gc = cv::ocl::oclMat(c);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::gemm(ga, gb, 1.0, gc, 1.0, ocl_dst, flags);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        ocl_dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
+        SUBTEST << size << 'x' << size;
+
+        gen(src1, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        gen(src2, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        gen(src3, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+
+        gemm(src1, src2, 1.0, src3, 1.0, dst);
+
+        CPU_ON;
+        gemm(src1, src2, 1.0, src3, 1.0, dst);
+        CPU_OFF;
+
+        d_src1.upload(src1);
+        d_src2.upload(src2);
+        d_src3.upload(src3);
+
+        WARMUP_ON;
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+         ;
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src1.upload(src1);
+        d_src2.upload(src2);
+        d_src3.upload(src3);
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
     }
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
-
-
-INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine(
-                            testing::Values(CV_32FC1, CV_32FC2/* , CV_64FC1, CV_64FC2*/),
-                            testing::Values(cv::Size(512, 512), cv::Size(1024, 1024)),
-                            testing::Values(0, (int)cv::GEMM_1_T, (int)cv::GEMM_2_T, (int)(cv::GEMM_1_T + cv::GEMM_2_T))));
-#endif
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_haar.cpp b/modules/ocl/perf/perf_haar.cpp
index 08c066a311..5a909ace4e 100644
--- a/modules/ocl/perf/perf_haar.cpp
+++ b/modules/ocl/perf/perf_haar.cpp
@@ -10,12 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,132 +42,97 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
+///////////// Haar ////////////////////////
+namespace cv
+{
+namespace ocl
+{
 
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv;
-extern std::string workdir;
 struct getRect
 {
-    Rect operator ()(const CvAvgComp &e) const
+    Rect operator()(const CvAvgComp &e) const
     {
         return e.rect;
     }
 };
 
-PARAM_TEST_CASE(HaarTestBase, int, int)
+class CascadeClassifier_GPU : public OclCascadeClassifier
 {
-    //std::vector<cv::ocl::Info> oclinfo;
-    cv::ocl::OclCascadeClassifier cascade, nestedCascade;
-    cv::CascadeClassifier cpucascade, cpunestedCascade;
-    //    Mat img;
-
-    double scale;
-    int index;
-
-    virtual void SetUp()
+public:
+    void detectMultiScale(oclMat &image,
+                          CV_OUT std::vector<cv::Rect>& faces,
+                          double scaleFactor = 1.1,
+                          int minNeighbors = 3, int flags = 0,
+                          Size minSize = Size(),
+                          Size maxSize = Size())
     {
-        scale = 1.0;
-        index = 0;
-        string cascadeName = "../../../data/haarcascades/haarcascade_frontalface_alt.xml";
-
-        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
-        {
-            cout << "ERROR: Could not load classifier cascade" << endl;
-            return;
-        }
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums>0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath("E:\\");
+        (void)maxSize;
+        MemStorage storage(cvCreateMemStorage(0));
+        //CvMat img=image;
+        CvSeq *objs = oclHaarDetectObjects(image, storage, scaleFactor, minNeighbors, flags, minSize);
+        vector<CvAvgComp> vecAvgComp;
+        Seq<CvAvgComp>(objs).copyTo(vecAvgComp);
+        faces.resize(vecAvgComp.size());
+        std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
     }
-};
-
-////////////////////////////////faceDetect/////////////////////////////////////////////////
 
-struct Haar : HaarTestBase {};
+};
 
-TEST_F(Haar, FaceDetect)
+}
+}
+TEST(Haar)
 {
-    string imgName = workdir + "lena.jpg";
-    Mat img = imread( imgName, 1 );
+    Mat img = imread(abspath("basketball1.png"), CV_LOAD_IMAGE_GRAYSCALE);
 
-    if(img.empty())
+    if (img.empty())
     {
-        std::cout << imgName << std::endl;
-        return ;
+        throw runtime_error("can't open basketball1.png");
     }
 
-    //int i = 0;
-    double t = 0;
-    vector<Rect> faces, oclfaces;
-
-    // const static Scalar colors[] =  { CV_RGB(0, 0, 255),
-    //                                   CV_RGB(0, 128, 255),
-    //                                   CV_RGB(0, 255, 255),
-    //                                   CV_RGB(0, 255, 0),
-    //                                   CV_RGB(255, 128, 0),
-    //                                   CV_RGB(255, 255, 0),
-    //                                   CV_RGB(255, 0, 0),
-    //                                   CV_RGB(255, 0, 255)
-    //                                 } ;
-
-    Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
-    MemStorage storage(cvCreateMemStorage(0));
-    cvtColor( img, gray, CV_BGR2GRAY );
-    resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    equalizeHist( smallImg, smallImg );
-
-    t = (double)cvGetTickCount();
-    for(int k = 0; k < LOOP_TIMES; k++)
+    CascadeClassifier faceCascadeCPU;
+
+    if (!faceCascadeCPU.load(abspath("haarcascade_frontalface_alt.xml")))
     {
-        cpucascade.detectMultiScale( smallImg, faces,  1.1,
-                                     3, 0
-                                     | CV_HAAR_SCALE_IMAGE
-                                     , Size(30, 30), Size(0, 0) );
+        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
     }
-    t = (double)cvGetTickCount() - t ;
-    printf( "cpudetection time = %g ms\n", t / (LOOP_TIMES * (double)cvGetTickFrequency() * 1000.) );
 
-    cv::ocl::oclMat image;
-    CvSeq *_objects=NULL;
-    t = (double)cvGetTickCount();
-    for(int k = 0; k < LOOP_TIMES; k++)
+    vector<Rect> faces;
+
+    SUBTEST << img.cols << "x" << img.rows << "; scale image";
+    CPU_ON;
+    faceCascadeCPU.detectMultiScale(img, faces,
+                                    1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+    CPU_OFF;
+
+    ocl::CascadeClassifier_GPU faceCascade;
+
+    if (!faceCascade.load(abspath("haarcascade_frontalface_alt.xml")))
     {
-        image.upload(smallImg);
-        _objects = cascade.oclHaarDetectObjects( image, storage, 1.1,
-                   3, 0
-                   | CV_HAAR_SCALE_IMAGE
-                   , Size(30, 30), Size(0, 0) );
+        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
     }
-    t = (double)cvGetTickCount() - t ;
-    printf( "ocldetection time = %g ms\n", t / (LOOP_TIMES * (double)cvGetTickFrequency() * 1000.) );
-    vector<CvAvgComp> vecAvgComp;
-    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
-    oclfaces.resize(vecAvgComp.size());
-    std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
-
-    //for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
-    //{
-    //	Mat smallImgROI;
-    //	Point center;
-    //	Scalar color = colors[i%8];
-    //	int radius;
-    //	center.x = cvRound((r->x + r->width*0.5)*scale);
-    //	center.y = cvRound((r->y + r->height*0.5)*scale);
-    //	radius = cvRound((r->width + r->height)*0.25*scale);
-    //	circle( img, center, radius, color, 3, 8, 0 );
-    //}
-    //namedWindow("result");
-    //imshow("result",img);
-    //waitKey(0);
-    //destroyAllWindows();
 
-}
-#endif // HAVE_OPENCL
+    ocl::oclMat d_img(img);
+
+    faces.clear();
+
+    WARMUP_ON;
+    faceCascade.detectMultiScale(d_img, faces,
+                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+    WARMUP_OFF;
+
+    faces.clear();
+
+    GPU_ON;
+    faceCascade.detectMultiScale(d_img, faces,
+                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+     ;
+    GPU_OFF;
+
+    GPU_FULL_ON;
+    d_img.upload(img);
+    faceCascade.detectMultiScale(d_img, faces,
+                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+    GPU_FULL_OFF;
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_hog.cpp b/modules/ocl/perf/perf_hog.cpp
index fd58808a88..b74077ff40 100644
--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -42,125 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-extern std::string workdir;
-
-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-    { \
-    public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-    private: \
-    type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-    }
-
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
 
-IMPLEMENT_PARAM_CLASS(WinSizw48, bool);
-
-PARAM_TEST_CASE(HOG, WinSizw48, bool)
-{
-    bool is48;
-    vector<float> detector;
-    virtual void SetUp()
-    {
-        is48 = GET_PARAM(0);
-        if(is48)
-        {
-            detector = cv::ocl::HOGDescriptor::getPeopleDetector48x96();
-        }
-        else
-        {
-            detector = cv::ocl::HOGDescriptor::getPeopleDetector64x128();
-        }
-    }
-};
-
-TEST_P(HOG, Performance)
+///////////// HOG////////////////////////
+TEST(HOG)
 {
-    cv::Mat img = readImage(workdir + "lena.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    // define HOG related arguments
-    float scale = 1.05f;
-    //int nlevels = 13;
-    int gr_threshold = 8;
-    float hit_threshold = 1.4f;
-    //bool hit_threshold_auto = true;
+    Mat src = imread(abspath("road.png"), cv::IMREAD_GRAYSCALE);
 
-    int win_width = is48 ? 48 : 64;
-    int win_stride_width = 8;
-    int win_stride_height = 8;
-
-    bool gamma_corr = true;
-
-    Size win_size(win_width, win_width * 2); //(64, 128) or (48, 96)
-    Size win_stride(win_stride_width, win_stride_height);
-
-    cv::ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
-                                   cv::ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
-                                   cv::ocl::HOGDescriptor::DEFAULT_NLEVELS);
-
-    gpu_hog.setSVMDetector(detector);
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
+    if (src.empty())
     {
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        ocl::oclMat d_src(img);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-
-        vector<Rect> found;
-        gpu_hog.detectMultiScale(d_src, found, hit_threshold, win_stride,
-                                 Size(0, 0), scale, gr_threshold);
-
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        // no download time for HOG
+        throw runtime_error("can't open road.png");
+    }
 
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
 
-        if(j == 0)
-            continue;
+    cv::HOGDescriptor hog;
+    hog.setSVMDetector(hog.getDefaultPeopleDetector());
+    std::vector<cv::Rect> found_locations;
 
-        totalgputick = t1 + totalgputick;
+    SUBTEST << 768 << 'x' << 576 << "; road.png";
 
-        totalgputick_kernel = t2 + totalgputick_kernel;
+    hog.detectMultiScale(src, found_locations);
 
-    }
+    CPU_ON;
+    hog.detectMultiScale(src, found_locations);
+    CPU_OFF;
 
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
+    cv::ocl::HOGDescriptor ocl_hog;
+    ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector());
+    ocl::oclMat d_src;
+    d_src.upload(src);
 
+    WARMUP_ON;
+    ocl_hog.detectMultiScale(d_src, found_locations);
+    WARMUP_OFF;
 
-INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, HOG, testing::Combine(testing::Values(WinSizw48(false), WinSizw48(true)), testing::Values(false)));
+    GPU_ON;
+    ocl_hog.detectMultiScale(d_src, found_locations);
+     ;
+    GPU_OFF;
 
-#endif  //Have opencl
\ No newline at end of file
+    GPU_FULL_ON;
+    d_src.upload(src);
+    ocl_hog.detectMultiScale(d_src, found_locations);
+    GPU_FULL_OFF;
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp
index bc54cb275c..756f69556f 100644
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
@@ -10,18 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Shengen Yan, yanshengen@gmail.com
-//    Jiang Liyuan, lyuan001.good@163.com
-//    Rock Li, Rock.Li@amd.com
-//    Zailong Wu, bullet@yeah.net
-//    Xu Pang, pangxu010@163.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -36,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -48,949 +42,290 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
+///////////// equalizeHist ////////////////////////
+TEST(equalizeHist)
+{
+    Mat src, dst;
+    int all_type[] = {CV_8UC1};
+    std::string type_name[] = {"CV_8UC1"};
 
-using namespace cvtest;
-using namespace testing;
-using namespace std;
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
+            gen(src, size, size, all_type[j], 0, 256);
 
-MatType nulltype = -1;
+            equalizeHist(src, dst);
 
-#define ONE_TYPE(type)  testing::ValuesIn(typeVector(type))
-#define NULL_TYPE  testing::ValuesIn(typeVector(nulltype))
+            CPU_ON;
+            equalizeHist(src, dst);
+            CPU_OFF;
 
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst;
+            ocl::oclMat d_hist;
+            ocl::oclMat d_buf;
 
-vector<MatType> typeVector(MatType type)
-{
-    vector<MatType> v;
-    v.push_back(type);
-    return v;
-}
+            WARMUP_ON;
+            ocl::equalizeHist(d_src, d_dst);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::equalizeHist(d_src, d_dst);
+             ;
+            GPU_OFF;
 
-PARAM_TEST_CASE(ImgprocTestBase, MatType, MatType, MatType, MatType, MatType, bool)
-{
-    int type1, type2, type3, type4, type5;
-    cv::Scalar val;
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int dstx;
-    int dsty;
-    int dst1x;
-    int dst1y;
-    int maskx;
-    int masky;
-
-    //mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mask;
-    cv::Mat dst;
-    cv::Mat dst1; //bak, for two outputs
-
-    //mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    cv::Mat dst1_roi; //bak
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl mat
-    cv::ocl::oclMat clmat1;
-    cv::ocl::oclMat clmat2;
-    cv::ocl::oclMat clmask;
-    cv::ocl::oclMat cldst;
-    cv::ocl::oclMat cldst1; //bak
-
-    //ocl mat with roi
-    cv::ocl::oclMat clmat1_roi;
-    cv::ocl::oclMat clmat2_roi;
-    cv::ocl::oclMat clmask_roi;
-    cv::ocl::oclMat cldst_roi;
-    cv::ocl::oclMat cldst1_roi;
-
-    virtual void SetUp()
-    {
-        type1 = GET_PARAM(0);
-        type2 = GET_PARAM(1);
-        type3 = GET_PARAM(2);
-        type4 = GET_PARAM(3);
-        type5 = GET_PARAM(4);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-        double min = 1, max = 20;
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums>0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-        if(type1 != nulltype)
-        {
-            mat1 = randomMat(rng, size, type1, min, max, false);
-            clmat1 = mat1;
-        }
-        if(type2 != nulltype)
-        {
-            mat2 = randomMat(rng, size, type2, min, max, false);
-            clmat2 = mat2;
-        }
-        if(type3 != nulltype)
-        {
-            dst  = randomMat(rng, size, type3, min, max, false);
-            cldst = dst;
-        }
-        if(type4 != nulltype)
-        {
-            dst1 = randomMat(rng, size, type4, min, max, false);
-            cldst1 = dst1;
-        }
-        if(type5 != nulltype)
-        {
-            mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
-            cv::threshold(mask, mask, 0.5, 255., type5);
-            clmask = mask;
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::equalizeHist(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
+
     }
+}
+/////////// CopyMakeBorder //////////////////////
+TEST(CopyMakeBorder)
+{
+    Mat src, dst;
+    ocl::oclMat d_dst;
 
+    int bordertype = BORDER_CONSTANT;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    void Has_roi(int b)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat1.cols - 1; //start
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src2x   = 1;
-            src1y   = 1;
-            src2y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-            dst1x    = 1;
-            dst1y    = 1;
-            maskx	 = 1;
-            masky	= 1;
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src2x = 0;
-            src1y = 0;
-            src2y = 0;
-            dstx = 0;
-            dsty = 0;
-            dst1x  = 0;
-            dst1y  = 0;
-            maskx	 = 0;
-            masky	= 0;
-        };
-
-        if(type1 != nulltype)
-        {
-            mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-            //clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
-        }
-        if(type2 != nulltype)
-        {
-            mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-            //clmat2_roi = clmat2(Rect(src2x,src2y,roicols,roirows));
-        }
-        if(type3 != nulltype)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-            //cldst_roi = cldst(Rect(dstx,dsty,roicols,roirows));
-        }
-        if(type4 != nulltype)
-        {
-            dst1_roi = dst1(Rect(dst1x, dst1y, roicols, roirows));
-            //cldst1_roi = cldst1(Rect(dst1x,dst1y,roicols,roirows));
-        }
-        if(type5 != nulltype)
-        {
-            mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-            //clmask_roi = clmask(Rect(maskx,masky,roicols,roirows));
-        }
-    }
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-    void random_roi()
-    {
-        cv::RNG &rng = TS::ptr()->get_rng();
-
-        //randomize ROI
-        roicols = rng.uniform(1, mat1.cols);
-        roirows = rng.uniform(1, mat1.rows);
-        src1x   = rng.uniform(0, mat1.cols - roicols);
-        src1y   = rng.uniform(0, mat1.rows - roirows);
-        src2x   = rng.uniform(0, mat2.cols - roicols);
-        src2y   = rng.uniform(0, mat2.rows - roirows);
-        dstx    = rng.uniform(0, dst.cols  - roicols);
-        dsty    = rng.uniform(0, dst.rows  - roirows);
-        dst1x    = rng.uniform(0, dst1.cols  - roicols);
-        dst1y    = rng.uniform(0, dst1.rows  - roirows);
-        maskx   = rng.uniform(0, mask.cols - roicols);
-        masky   = rng.uniform(0, mask.rows - roirows);
-
-        if(type1 != nulltype)
-        {
-            mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-            //clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
-        }
-        if(type2 != nulltype)
-        {
-            mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-            //clmat2_roi = clmat2(Rect(src2x,src2y,roicols,roirows));
-        }
-        if(type3 != nulltype)
-        {
-            dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-            //cldst_roi = cldst(Rect(dstx,dsty,roicols,roirows));
-        }
-        if(type4 != nulltype)
-        {
-            dst1_roi = dst1(Rect(dst1x, dst1y, roicols, roirows));
-            //cldst1_roi = cldst1(Rect(dst1x,dst1y,roicols,roirows));
-        }
-        if(type5 != nulltype)
-        {
-            mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-            //clmask_roi = clmask(Rect(maskx,masky,roicols,roirows));
-        }
-    }
-};
-////////////////////////////////equalizeHist//////////////////////////////////////////
 
-struct equalizeHist : ImgprocTestBase {};
+            gen(src, size, size, all_type[j], 0, 256);
 
-TEST_P(equalizeHist, MatType)
-{
-    if (mat1.type() != CV_8UC1 || mat1.type() != dst.type())
-    {
-        cout << "Unsupported type" << endl;
-        EXPECT_DOUBLE_EQ(0.0, 0.0);
-    }
-    else
-    {
-#ifndef PRINT_KERNEL_RUN_TIME
-        double totalcputick = 0;
-        double totalgputick = 0;
-        double totalgputick_kernel = 0;
-        double t0 = 0;
-        double t1 = 0;
-        double t2 = 0;
-        for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-        {
-            totalcputick = 0;
-            totalgputick = 0;
-            totalgputick_kernel = 0;
-            for(int j = 0; j < LOOP_TIMES + 1; j ++)
-            {
-                Has_roi(k);
+            copyMakeBorder(src, dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
 
-                t0 = (double)cvGetTickCount();//cpu start
-                cv::equalizeHist(mat1_roi, dst_roi);
-                t0 = (double)cvGetTickCount() - t0;//cpu end
+            CPU_ON;
+            copyMakeBorder(src, dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
+            CPU_OFF;
 
-                t1 = (double)cvGetTickCount();//gpu start1
-                if(type1 != nulltype)
-                {
-                    clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-                }
-                cldst_roi = cldst(Rect(dstx, dsty, roicols, roirows));
-                t2 = (double)cvGetTickCount(); //kernel
-                cv::ocl::equalizeHist(clmat1_roi, cldst_roi);
-                t2 = (double)cvGetTickCount() - t2;//kernel
-                cv::Mat cpu_cldst;
-                //cldst.download(cpu_cldst);//download
-                t1 = (double)cvGetTickCount() - t1;//gpu end1
+            ocl::oclMat d_src(src);
 
-                if(j == 0)
-                    continue;
+            WARMUP_ON;
+            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
+            WARMUP_OFF;
 
-                totalgputick = t1 + totalgputick;
-                totalcputick = t0 + totalcputick;
-                totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
+             ;
+            GPU_OFF;
 
-            }
-            if(k == 0)
-            {
-                cout << "no roi\n";
-            }
-            else
-            {
-                cout << "with roi\n";
-            };
-            cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-#else
-        for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-        {
-            Has_roi(j);
-            if(type1 != nulltype)
-            {
-                clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-            }
-            if(j == 0)
-            {
-                cout << "no roi:";
-            }
-            else
-            {
-                cout << "\nwith roi:";
-            };
-            cv::ocl::equalizeHist(clmat1_roi, cldst_roi);
-        };
-#endif
+
     }
 }
+///////////// cornerMinEigenVal ////////////////////////
+TEST(cornerMinEigenVal)
+{
+    Mat src, dst;
+    ocl::oclMat d_dst;
 
+    int blockSize = 7, apertureSize = 1 + 2 * (rand() % 4);
+    int borderType = BORDER_REFLECT;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
 
-////////////////////////////////bilateralFilter////////////////////////////////////////////
-
-struct bilateralFilter : ImgprocTestBase {};
-
-TEST_P(bilateralFilter, Mat)
-{
-    double sigmacolor = 50.0;
-    int radius = 9;
-    int d = 2 * radius + 1;
-    double sigmaspace = 20.0;
-    int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,cv::BORDER_REFLECT,cv::BORDER_WRAP,cv::BORDER_REFLECT_101*/};
-    const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
-
-    if (mat1.depth() != CV_8U || mat1.type() != dst.type())
-    {
-        cout << "Unsupported type" << endl;
-        EXPECT_DOUBLE_EQ(0.0, 0.0);
-    }
-    else
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        for(size_t i = 0; i < sizeof(bordertype) / sizeof(int); i++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            cout << borderstr[i] << endl;
-#ifndef PRINT_KERNEL_RUN_TIME
-            double totalcputick = 0;
-            double totalgputick = 0;
-            double totalgputick_kernel = 0;
-            double t0 = 0;
-            double t1 = 0;
-            double t2 = 0;
-            for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-            {
-                totalcputick = 0;
-                totalgputick = 0;
-                totalgputick_kernel = 0;
-                for(int j = 0; j < LOOP_TIMES + 1; j ++)
-                {
-                    Has_roi(k);
-                    if(((bordertype[i] != cv::BORDER_CONSTANT) && (bordertype[i] != cv::BORDER_REPLICATE) && (mat1_roi.cols <= radius)) || (mat1_roi.cols <= radius) || (mat1_roi.rows <= radius) || (mat1_roi.rows <= radius))
-                    {
-                        continue;
-                    }
-                    t0 = (double)cvGetTickCount();//cpu start
-                    cv::bilateralFilter(mat1_roi, dst_roi, d, sigmacolor, sigmaspace, bordertype[i]);
-                    t0 = (double)cvGetTickCount() - t0;//cpu end
-
-                    t1 = (double)cvGetTickCount();//gpu start1
-                    if(type1 != nulltype)
-                    {
-                        clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-                    }
-                    t2 = (double)cvGetTickCount(); //kernel
-                    cv::ocl::bilateralFilter(clmat1_roi, cldst_roi, d, sigmacolor, sigmaspace, bordertype[i]);
-                    t2 = (double)cvGetTickCount() - t2;//kernel
-                    cv::Mat cpu_cldst;
-                    cldst.download(cpu_cldst);//download
-                    t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-                    if(j == 0)
-                        continue;
-
-                    totalgputick = t1 + totalgputick;
-                    totalcputick = t0 + totalcputick;
-                    totalgputick_kernel = t2 + totalgputick_kernel;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-                }
-                if(k == 0)
-                {
-                    cout << "no roi\n";
-                }
-                else
-                {
-                    cout << "with roi\n";
-                };
-                cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-                cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-                cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            }
 
-#else
-            for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-            {
-                Has_roi(j);
-                if(type1 != nulltype)
-                {
-                    clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-                };
-                if(j == 0)
-                {
-                    cout << "no roi:";
-                }
-                else
-                {
-                    cout << "\nwith roi:";
-                };
-                cv::ocl::bilateralFilter(clmat1_roi, cldst_roi, d, sigmacolor, sigmaspace, bordertype[i]);
-            };
-
-#endif
-        };
-
-    }
-}
-
-////////////////////////////////copyMakeBorder////////////////////////////////////////////
+            gen(src, size, size, all_type[j], 0, 256);
 
-struct CopyMakeBorder : ImgprocTestBase {};
-
-TEST_P(CopyMakeBorder, Mat)
-{
-    int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, cv::BORDER_REFLECT, cv::BORDER_WRAP, cv::BORDER_REFLECT_101};
-    //const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
-    int top = 5;
-    int bottom = 5;
-    int left = 6;
-    int right = 6;
-    if (mat1.type() != dst.type())
-    {
-        cout << "Unsupported type" << endl;
-        EXPECT_DOUBLE_EQ(0.0, 0.0);
-    }
-    else
-    {
-        for(size_t i = 0; i < sizeof(bordertype) / sizeof(int); i++)
-        {
-#ifndef PRINT_KERNEL_RUN_TIME
-            double totalcputick = 0;
-            double totalgputick = 0;
-            double totalgputick_kernel = 0;
-            double t0 = 0;
-            double t1 = 0;
-            double t2 = 0;
-            for(int k = LOOPROISTART; k < 1; k++) //don't support roi perf test
-            {
-                totalcputick = 0;
-                totalgputick = 0;
-                totalgputick_kernel = 0;
-                for(int j = 0; j < LOOP_TIMES + 1; j ++)
-                {
-                    Has_roi(k);
-
-                    t0 = (double)cvGetTickCount();//cpu start
-                    cv::copyMakeBorder(mat1_roi, dst_roi, top, bottom, left, right, bordertype[i] | cv::BORDER_ISOLATED, cv::Scalar(1.0));
-                    t0 = (double)cvGetTickCount() - t0;//cpu end
-
-                    t1 = (double)cvGetTickCount();//gpu start1
-                    if(type1 != nulltype)
-                    {
-                        clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-                    }
-                    t2 = (double)cvGetTickCount(); //kernel
-                    cv::ocl::copyMakeBorder(clmat1_roi, cldst_roi, top, bottom, left, right,  bordertype[i] | cv::BORDER_ISOLATED, cv::Scalar(1.0));
-                    t2 = (double)cvGetTickCount() - t2;//kernel
-                    cv::Mat cpu_cldst;
-                    cldst.download(cpu_cldst);//download
-                    t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-                    if(j == 0)
-                        continue;
-
-                    totalgputick = t1 + totalgputick;
-                    totalcputick = t0 + totalcputick;
-                    totalgputick_kernel = t2 + totalgputick_kernel;
-
-                }
-                if(k == 0)
-                {
-                    cout << "no roi\n";
-                }
-                else
-                {
-                    cout << "with roi\n";
-                };
-                cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-                cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-                cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            }
-#else
-            for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-            {
-                Has_roi(j);
-                if(type1 != nulltype)
-                {
-                    clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-                };
-                if(j == 0)
-                {
-                    cout << "no roi:";
-                }
-                else
-                {
-                    cout << "\nwith roi:";
-                };
-                cv::ocl::copyMakeBorder(clmat1_roi, cldst_roi, top, bottom, left, right,  bordertype[i] | cv::BORDER_ISOLATED, cv::Scalar(1.0));
-            };
-#endif
-        };
-    }
-}
+            cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
 
-////////////////////////////////cornerMinEigenVal//////////////////////////////////////////
+            CPU_ON;
+            cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
+            CPU_OFF;
 
-struct cornerMinEigenVal : ImgprocTestBase {};
+            ocl::oclMat d_src(src);
 
-TEST_P(cornerMinEigenVal, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            int blockSize = 7, apertureSize = 3; //1 + 2 * (rand() % 4);
-            int borderType = cv::BORDER_REFLECT;
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::cornerMinEigenVal(mat1_roi, dst_roi, blockSize, apertureSize, borderType);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            if(type1 != nulltype)
-            {
-                clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-            }
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::cornerMinEigenVal(clmat1_roi, cldst_roi, blockSize, apertureSize, borderType);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_cldst;
-            cldst.download(cpu_cldst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
+            WARMUP_ON;
+            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
+            WARMUP_OFF;
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        int blockSize = 7, apertureSize = 1 + 2 * (rand() % 4);
-        int borderType = cv::BORDER_REFLECT;
-        if(type1 != nulltype)
-        {
-            clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-        };
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::cornerMinEigenVal(clmat1_roi, cldst_roi, blockSize, apertureSize, borderType);
-    };
-#endif
 }
-
-
-////////////////////////////////cornerHarris//////////////////////////////////////////
-
-struct cornerHarris : ImgprocTestBase {};
-
-TEST_P(cornerHarris, Mat)
+///////////// cornerHarris ////////////////////////
+TEST(cornerHarris)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            int blockSize = 7, apertureSize = 3;
-            int borderType = cv::BORDER_REFLECT;
-            double kk = 2;
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::cornerHarris(mat1_roi, dst_roi, blockSize, apertureSize, kk, borderType);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            if(type1 != nulltype)
-            {
-                clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-            }
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::cornerHarris(clmat1_roi, cldst_roi, blockSize, apertureSize, kk, borderType);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_cldst;
-            cldst.download(cpu_cldst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        Has_roi(j);
-        double kk = 2;
-        int blockSize = 7, apertureSize = 3;
-        int borderType = cv::BORDER_REFLECT;
-        if(type1 != nulltype)
-        {
-            clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-        };
-        if(j == 0)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::cornerHarris(clmat1_roi, cldst_roi, blockSize, apertureSize, kk, borderType);
-    };
-#endif
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; BORDER_REFLECT";
 
-}
+            gen(src, size, size, all_type[j], 0, 1);
 
+            cornerHarris(src, dst, 5, 7, 0.1, BORDER_REFLECT);
 
-////////////////////////////////integral/////////////////////////////////////////////////
+            CPU_ON;
+            cornerHarris(src, dst, 5, 7, 0.1, BORDER_REFLECT);
+            CPU_OFF;
 
-struct integral : ImgprocTestBase {};
+            d_src.upload(src);
 
-TEST_P(integral, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::integral(mat1_roi, dst_roi, dst1_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            WARMUP_ON;
+            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
+            WARMUP_OFF;
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            if(type1 != nulltype)
-            {
-                clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-            }
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::integral(clmat1_roi, cldst_roi, cldst1_roi);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_cldst;
-            cv::Mat cpu_cldst1;
-            cldst.download(cpu_cldst);//download
-            cldst1.download(cpu_cldst1);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
+            GPU_ON;
+            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
+             ;
+            GPU_OFF;
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        if(type1 != nulltype)
-        {
-            clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-        };
-        if(j == 0)
-        {
-            cout << "no roi:";
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::integral(clmat1_roi, cldst_roi, cldst1_roi);
-    };
-#endif
-}
 
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// warpAffine  & warpPerspective
-
-PARAM_TEST_CASE(WarpTestBase, MatType, int)
+    }
+}
+///////////// integral ////////////////////////
+TEST(integral)
 {
-    int type;
-    cv::Size size;
-    int interpolation;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int src_roicols;
-    int src_roirows;
-    int dst_roicols;
-    int dst_roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        //dsize = GET_PARAM(1);
-        interpolation = GET_PARAM(1);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        size = cv::Size(MWIDTH, MHEIGHT);
+    Mat src, sum;
+    ocl::oclMat d_src, d_sum, d_buf;
 
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
+    int all_type[] = {CV_8UC1};
+    std::string type_name[] = {"CV_8UC1"};
 
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-    void Has_roi(int b)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            //randomize ROI
-            src_roicols =  mat1.cols - 1; //start
-            src_roirows = mat1.rows - 1;
-            dst_roicols = dst.cols - 1;
-            dst_roirows = dst.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
+            SUBTEST << size << 'x' << size << "; " << type_name[j]  ;
 
-        }
-        else
-        {
-            src_roicols = mat1.cols;
-            src_roirows = mat1.rows;
-            dst_roicols = dst.cols;
-            dst_roirows = dst.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
+            gen(src, size, size, all_type[j], 0, 256);
 
-        };
-        mat1_roi = mat1(Rect(src1x, src1y, src_roicols, src_roirows));
-        dst_roi  = dst(Rect(dstx, dsty, dst_roicols, dst_roirows));
+            integral(src, sum);
 
+            CPU_ON;
+            integral(src, sum);
+            CPU_OFF;
 
-    }
+            d_src.upload(src);
 
-};
+            WARMUP_ON;
+            ocl::integral(d_src, d_sum);
+            WARMUP_OFF;
 
-/////warpAffine
+            GPU_ON;
+            ocl::integral(d_src, d_sum);
+             ;
+            GPU_OFF;
 
-struct WarpAffine : WarpTestBase {};
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::integral(d_src, d_sum);
+            d_sum.download(sum);
+            GPU_FULL_OFF;
+        }
 
-TEST_P(WarpAffine, Mat)
+    }
+}
+///////////// WarpAffine ////////////////////////
+TEST(WarpAffine)
 {
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
     static const double coeffs[2][3] =
     {
         {cos(3.14 / 6), -sin(3.14 / 6), 100.0},
         {sin(3.14 / 6), cos(3.14 / 6), -100.0}
     };
     Mat M(2, 3, CV_64F, (void *)coeffs);
+    int interpolation = INTER_NEAREST;
+
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::warpAffine(mat1_roi, dst_roi, M, size, interpolation);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
+            Size size1 = Size(size, size);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
+            warpAffine(src, dst, M, size1, interpolation);
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::warpAffine(gmat1, gdst, M, size, interpolation);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            CPU_ON;
+            warpAffine(src, dst, M, size1, interpolation);
+            CPU_OFF;
 
-            if(j == 0)
-                continue;
+            d_src.upload(src);
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::warpAffine(gmat1, gdst, M, size, interpolation);
-    };
-#endif
 
+    }
 }
-
-
-// warpPerspective
-
-struct WarpPerspective : WarpTestBase {};
-
-TEST_P(WarpPerspective, Mat)
+///////////// WarpPerspective ////////////////////////
+TEST(WarpPerspective)
 {
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
     static const double coeffs[3][3] =
     {
         {cos(3.14 / 6), -sin(3.14 / 6), 100.0},
@@ -998,1154 +333,576 @@ TEST_P(WarpPerspective, Mat)
         {0.0, 0.0, 1.0}
     };
     Mat M(3, 3, CV_64F, (void *)coeffs);
+    int interpolation = INTER_NEAREST;
+
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::warpPerspective(mat1_roi, dst_roi, M, size, interpolation);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
+            Size size1 = Size(size, size);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
+            warpPerspective(src, dst, M, size1, interpolation);
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::warpPerspective(gmat1, gdst, M, size, interpolation);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            CPU_ON;
+            warpPerspective(src, dst, M, size1, interpolation);
+            CPU_OFF;
 
-            if(j == 0)
-                continue;
+            d_src.upload(src);
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::warpPerspective(gmat1, gdst, M, size, interpolation);
-    };
-#endif
 
+    }
 }
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// remap
-//////////////////////////////////////////////////////////////////////////////////////////////////
-
-PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int)
+///////////// resize ////////////////////////
+TEST(resize)
 {
-    int srcType;
-    int map1Type;
-    int map2Type;
-    cv::Scalar val;
-
-    int interpolation;
-    int bordertype;
-
-    cv::Mat src;
-    cv::Mat dst;
-    cv::Mat map1;
-    cv::Mat map2;
-
-
-    int src_roicols;
-    int src_roirows;
-    int dst_roicols;
-    int dst_roirows;
-    int map1_roicols;
-    int map1_roirows;
-    int map2_roicols;
-    int map2_roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-    int map1x;
-    int map1y;
-    int map2x;
-    int map2y;
-
-    cv::Mat src_roi;
-    cv::Mat dst_roi;
-    cv::Mat map1_roi;
-    cv::Mat map2_roi;
-
-    //ocl mat for testing
-    cv::ocl::oclMat gdst;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gsrc_roi;
-    cv::ocl::oclMat gdst_roi;
-    cv::ocl::oclMat gmap1_roi;
-    cv::ocl::oclMat gmap2_roi;
-
-    virtual void SetUp()
-    {
-        srcType = GET_PARAM(0);
-        map1Type = GET_PARAM(1);
-        map2Type = GET_PARAM(2);
-        interpolation = GET_PARAM(3);
-        bordertype = GET_PARAM(4);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size srcSize = cv::Size(MWIDTH, MHEIGHT);
-        cv::Size map1Size = cv::Size(MWIDTH, MHEIGHT);
-        double min = 5, max = 16;
-
-        if(srcType != nulltype)
-        {
-            src = randomMat(rng, srcSize, srcType, min, max, false);
-        }
-        if((map1Type == CV_16SC2 && map2Type == nulltype) || (map1Type == CV_32FC2 && map2Type == nulltype))
-        {
-            map1 = randomMat(rng, map1Size, map1Type, min, max, false);
-
-        }
-        else if (map1Type == CV_32FC1 && map2Type == CV_32FC1)
-        {
-            map1 = randomMat(rng, map1Size, map1Type, min, max, false);
-            map2 = randomMat(rng, map1Size, map1Type, min, max, false);
-        }
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-        else
-            cout << "The wrong input type" << endl;
 
-        dst = randomMat(rng, map1Size, srcType, min, max, false);
-        switch (src.channels())
-        {
-        case 1:
-            val = cv::Scalar(rng.uniform(0.0, 10.0), 0, 0, 0);
-            break;
-        case 2:
-            val = cv::Scalar(rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0), 0, 0);
-            break;
-        case 3:
-            val = cv::Scalar(rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0), 0);
-            break;
-        case 4:
-            val = cv::Scalar(rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0));
-            break;
-        }
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        //if you want to use undefault device, set it here
-        //setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-    void Has_roi(int b)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        if(b)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            //randomize ROI
-            dst_roicols = dst.cols - 1;
-            dst_roirows = dst.rows - 1;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; up";
 
-            src_roicols = src.cols - 1;
-            src_roirows = src.rows - 1;
+            gen(src, size, size, all_type[j], 0, 256);
 
+            resize(src, dst, Size(), 2.0, 2.0);
 
-            srcx = 1;
-            srcy = 1;
-            dstx = 1;
-            dsty = 1;
-        }
-        else
-        {
-            dst_roicols = dst.cols;
-            dst_roirows = dst.rows;
+            CPU_ON;
+            resize(src, dst, Size(), 2.0, 2.0);
+            CPU_OFF;
 
-            src_roicols = src.cols;
-            src_roirows = src.rows;
+            d_src.upload(src);
 
+            WARMUP_ON;
+            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
+            WARMUP_OFF;
 
-            srcx = 0;
-            srcy = 0;
-            dstx = 0;
-            dsty = 0;
-        }
-        map1_roicols = dst_roicols;
-        map1_roirows = dst_roirows;
-        map2_roicols = dst_roicols;
-        map2_roirows = dst_roirows;
-        map1x = dstx;
-        map1y = dsty;
-        map2x = dstx;
-        map2y = dsty;
-
-        if((map1Type == CV_16SC2 && map2Type == nulltype) || (map1Type == CV_32FC2 && map2Type == nulltype))
-        {
-            map1_roi = map1(Rect(map1x, map1y, map1_roicols, map1_roirows));
-            gmap1_roi = map1_roi;
-        }
+            GPU_ON;
+            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
+             ;
+            GPU_OFF;
 
-        else if (map1Type == CV_32FC1 && map2Type == CV_32FC1)
-        {
-            map1_roi = map1(Rect(map1x, map1y, map1_roicols, map1_roirows));
-            map2_roi = map2(Rect(map2x, map2y, map2_roicols, map2_roirows));
-            gmap1_roi = map1_roi;
-            gmap2_roi = map2_roi;
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        dst_roi = dst(Rect(dstx, dsty, dst_roicols, dst_roirows));
-        src_roi = dst(Rect(srcx, srcy, src_roicols, src_roirows));
 
     }
-};
 
-TEST_P(Remap, Mat)
-{
-    if((interpolation == 1 && map1Type == CV_16SC2) || (map1Type == CV_32FC1 && map2Type == nulltype) || (map1Type == CV_16SC2 && map2Type == CV_32FC1) || (map1Type == CV_32FC2 && map2Type == CV_32FC1))
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        cout << "LINEAR don't support the map1Type and map2Type" << endl;
-        return;
-    }
-    int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
-    const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
-    cout << borderstr[0] << endl;
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = 0; k < 2; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; down";
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::remap(src_roi, dst_roi, map1_roi, map2_roi, interpolation, bordertype[0], val);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start
-            gsrc_roi = src_roi;
-            gdst = dst;
-            gdst_roi = gdst(Rect(dstx, dsty, dst_roicols, dst_roirows));
+            resize(src, dst, Size(), 0.5, 0.5);
 
-            t2 = (double)cvGetTickCount();//kernel
-            cv::ocl::remap(gsrc_roi, gdst_roi, gmap1_roi, gmap2_roi, interpolation, bordertype[0], val);
-            t2 = (double)cvGetTickCount() - t2;//kernel
+            CPU_ON;
+            resize(src, dst, Size(), 0.5, 0.5);
+            CPU_OFF;
 
-            cv::Mat cpu_dst;
-            gdst.download(cpu_dst);
+            d_src.upload(src);
 
-            t1 = (double)cvGetTickCount() - t1;//gpu end
+            WARMUP_ON;
+            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
+            WARMUP_OFF;
 
-            if (j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = 0; j < 2; j ++)
-    {
-        Has_roi(j);
-        gdst = dst;
-        gdst_roi = gdst(Rect(dstx, dsty, dst_roicols, dst_roirows));
-        gsrc_roi = src_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::remap(gsrc_roi, gdst_roi, gmap1_roi, gmap2_roi, interpolation, bordertype[0], val);
-    };
-#endif
 
+    }
 }
+///////////// threshold////////////////////////
+TEST(threshold)
+{
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// resize
-
-PARAM_TEST_CASE(Resize, MatType, cv::Size, double, double, int)
-{
-    int type;
-    cv::Size dsize;
-    double fx, fy;
-    int interpolation;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int src_roicols;
-    int src_roirows;
-    int dst_roicols;
-    int dst_roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
-        dsize = GET_PARAM(1);
-        fx = GET_PARAM(2);
-        fy = GET_PARAM(3);
-        interpolation = GET_PARAM(4);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+        SUBTEST << size << 'x' << size << "; 8UC1; THRESH_BINARY";
 
-        if(dsize == cv::Size() && !(fx > 0 && fy > 0))
-        {
-            cout << "invalid dsize and fx fy" << endl;
-            return;
-        }
+        gen(src, size, size, CV_8U, 0, 100);
 
-        if(dsize == cv::Size())
-        {
-            dsize.width = (int)(size.width * fx);
-            dsize.height = (int)(size.height * fy);
-        }
+        threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
 
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, dsize, type, 5, 16, false);
+        CPU_ON;
+        threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
+        CPU_OFF;
 
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            src_roicols =  mat1.cols - 1; //start
-            src_roirows = mat1.rows - 1;
-            dst_roicols = dst.cols - 1;
-            dst_roirows = dst.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
+        d_src.upload(src);
 
-        }
-        else
-        {
-            src_roicols = mat1.cols;
-            src_roirows = mat1.rows;
-            dst_roicols = dst.cols;
-            dst_roirows = dst.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
+        WARMUP_ON;
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
+        WARMUP_OFF;
 
-        };
-        mat1_roi = mat1(Rect(src1x, src1y, src_roicols, src_roirows));
-        dst_roi  = dst(Rect(dstx, dsty, dst_roicols, dst_roirows));
+        GPU_ON;
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
+         ;
+        GPU_OFF;
 
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
 
     }
 
-};
-
-TEST_P(Resize, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
+        SUBTEST << size << 'x' << size << "; 32FC1; THRESH_TRUNC [NPP]";
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::resize(mat1_roi, dst_roi, dsize, fx, fy, interpolation);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+        gen(src, size, size, CV_32FC1, 0, 100);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
+        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::resize(gmat1, gdst, dsize, fx, fy, interpolation);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+        CPU_ON;
+        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
+        CPU_OFF;
 
-            if(j == 0)
-                continue;
+        d_src.upload(src);
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+        WARMUP_ON;
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        WARMUP_OFF;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::resize(gmat1, gdst, dsize, fx, fy, interpolation);
-    };
-#endif
+        GPU_ON;
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+         ;
+        GPU_OFF;
 
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
+    }
 }
+///////////// meanShiftFiltering////////////////////////
+TEST(meanShiftFiltering)
+{
+    int sp = 10, sr = 10;
+    Mat src, dst;
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//threshold
+    ocl::oclMat d_src, d_dst;
 
-PARAM_TEST_CASE(Threshold, MatType, ThreshOp)
-{
-    int type;
-    int threshOp;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
-        threshOp = GET_PARAM(1);
+        SUBTEST << size << 'x' << size << "; 8UC3 vs 8UC4";
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+        gen(src, size, size, CV_8UC3, Scalar::all(0), Scalar::all(256));
 
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
+        pyrMeanShiftFiltering(src, dst, sp, sr);
 
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat1.cols - 1; //start
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
+        CPU_ON;
+        pyrMeanShiftFiltering(src, dst, sp, sr);
+        CPU_OFF;
 
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
+        gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
 
-        };
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
+        d_src.upload(src);
 
+        WARMUP_ON;
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
+        WARMUP_OFF;
 
-    }
-};
+        GPU_ON;
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
+         ;
+        GPU_OFF;
 
-TEST_P(Threshold, Mat)
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
+    }
+}
+///////////// meanShiftProc////////////////////////
+COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size size, int sp, int sr, int maxIter, float eps, int *tab)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
 
-            double maxVal = randomDouble(20.0, 127.0);
-            double thresh = randomDouble(0.0, maxVal);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::threshold(mat1_roi, dst_roi, thresh, maxVal, threshOp);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+    int isr2 = sr * sr;
+    int c0, c1, c2, c3;
+    int iter;
+    uchar *ptr = NULL;
+    uchar *pstart = NULL;
+    int revx = 0, revy = 0;
+    c0 = sptr[0];
+    c1 = sptr[1];
+    c2 = sptr[2];
+    c3 = sptr[3];
 
-            t1 = (double)cvGetTickCount();//gpu start1
+    // iterate meanshift procedure
+    for (iter = 0; iter < maxIter; iter++)
+    {
+        int count = 0;
+        int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
 
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::threshold(gmat1, gdst, thresh, maxVal, threshOp);
-            t2 = (double)cvGetTickCount() - t2;//kernel
+        //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
+        int minx = x0 - sp;
+        int miny = y0 - sp;
+        int maxx = x0 + sp;
+        int maxy = y0 + sp;
 
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+        //deal with the image boundary
+        if (minx < 0)
+        {
+            minx = 0;
+        }
 
-            if(j == 0)
-                continue;
+        if (miny < 0)
+        {
+            miny = 0;
+        }
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+        if (maxx >= size.width)
+        {
+            maxx = size.width - 1;
+        }
 
+        if (maxy >= size.height)
+        {
+            maxy = size.height - 1;
         }
-        if(k == 0)
+
+        if (iter == 0)
         {
-            cout << "no roi\n";
+            pstart = sptr;
         }
         else
         {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        double maxVal = randomDouble(20.0, 127.0);
-        double thresh = randomDouble(0.0, maxVal);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
+            pstart = pstart + revy * sstep + (revx << 2); //point to the new position
         }
-        else
+
+        ptr = pstart;
+        ptr = ptr + (miny - y0) * sstep + ((minx - x0) << 2); //point to the start in the row
+
+        for (int y = miny; y <= maxy; y++, ptr += sstep - ((maxx - minx + 1) << 2))
         {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::threshold(gmat1, gdst, thresh, maxVal, threshOp);
-    };
-#endif
+            int rowCount = 0;
+            int x = minx;
+#if CV_ENABLE_UNROLLED
 
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//meanShift
+            for (; x + 4 <= maxx; x += 4, ptr += 16)
+            {
+                int t0, t1, t2;
+                t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
 
-PARAM_TEST_CASE(meanShiftTestBase, MatType, MatType, int, int, cv::TermCriteria)
-{
-    int type, typeCoor;
-    int sp, sr;
-    cv::TermCriteria crit;
-    //src mat
-    cv::Mat src;
-    cv::Mat dst;
-    cv::Mat dstCoor;
-
-    //set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat src_roi;
-    cv::Mat dst_roi;
-    cv::Mat dstCoor_roi;
-
-    //ocl dst mat
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gdstCoor;
-
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl mat with roi
-    cv::ocl::oclMat gsrc_roi;
-    cv::ocl::oclMat gdst_roi;
-    cv::ocl::oclMat gdstCoor_roi;
-
-    virtual void SetUp()
-    {
-        type     = GET_PARAM(0);
-        typeCoor = GET_PARAM(1);
-        sp       = GET_PARAM(2);
-        sr       = GET_PARAM(3);
-        crit     = GET_PARAM(4);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-
-        // MWIDTH=256, MHEIGHT=256. defined in utility.hpp
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        src = randomMat(rng, size, type, 5, 16, false);
-        dst = randomMat(rng, size, type, 5, 16, false);
-        dstCoor = randomMat(rng, size, typeCoor, 5, 16, false);
-
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
+                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                {
+                    s0 += t0;
+                    s1 += t1;
+                    s2 += t2;
+                    sx += x;
+                    rowCount++;
+                }
 
-    void Has_roi(int b)
-    {
-        if(b)
-        {
-            //randomize ROI
-            roicols = src.cols - 1;
-            roirows = src.rows - 1;
-            srcx = 1;
-            srcy = 1;
-            dstx = 1;
-            dsty = 1;
-        }
-        else
-        {
-            roicols = src.cols;
-            roirows = src.rows;
-            srcx = 0;
-            srcy = 0;
-            dstx = 0;
-            dsty = 0;
-        };
-
-        src_roi = src(Rect(srcx, srcy, roicols, roirows));
-        dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
-        dstCoor_roi = dstCoor(Rect(dstx, dsty, roicols, roirows));
-
-        gdst = dst;
-        gdstCoor = dstCoor;
-    }
-};
+                t0 = ptr[4], t1 = ptr[5], t2 = ptr[6];
 
-/////////////////////////meanShiftFiltering/////////////////////////////
-struct meanShiftFiltering : meanShiftTestBase {};
+                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                {
+                    s0 += t0;
+                    s1 += t1;
+                    s2 += t2;
+                    sx += x + 1;
+                    rowCount++;
+                }
 
-TEST_P(meanShiftFiltering, Mat)
-{
+                t0 = ptr[8], t1 = ptr[9], t2 = ptr[10];
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = 0; k < 2; k++)
-    {
-        double totalgputick = 0;
-        double totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
+                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                {
+                    s0 += t0;
+                    s1 += t1;
+                    s2 += t2;
+                    sx += x + 2;
+                    rowCount++;
+                }
 
-            t1 = (double)cvGetTickCount();//gpu start1
+                t0 = ptr[12], t1 = ptr[13], t2 = ptr[14];
 
-            gsrc_roi = src_roi;
-            gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));  //gdst_roi
+                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                {
+                    s0 += t0;
+                    s1 += t1;
+                    s2 += t2;
+                    sx += x + 3;
+                    rowCount++;
+                }
+            }
 
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::meanShiftFiltering(gsrc_roi, gdst_roi, sp, sr, crit);
-            t2 = (double)cvGetTickCount() - t2;//kernel
+#endif
 
-            cv::Mat cpu_gdst;
-            gdst.download(cpu_gdst);//download
+            for (; x <= maxx; x++, ptr += 4)
+            {
+                int t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
 
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                {
+                    s0 += t0;
+                    s1 += t1;
+                    s2 += t2;
+                    sx += x;
+                    rowCount++;
+                }
+            }
 
-            if(j == 0)
+            if (rowCount == 0)
+            {
                 continue;
+            }
 
-            totalgputick = t1 + totalgputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
+            count += rowCount;
+            sy += y * rowCount;
         }
-        if(k == 0)
+
+        if (count == 0)
         {
-            cout << "no roi\n";
+            break;
         }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
 
-        gsrc_roi = src_roi;
-        gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));  //gdst_roi
+        int x1 = sx / count;
+        int y1 = sy / count;
+        s0 = s0 / count;
+        s1 = s1 / count;
+        s2 = s2 / count;
+
+        bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1 - x0) + abs(y1 - y0) +
+                        tab[s0 - c0 + 255] + tab[s1 - c1 + 255] + tab[s2 - c2 + 255] <= eps);
+
+        //revise the pointer corresponding to the new (y0,x0)
+        revx = x1 - x0;
+        revy = y1 - y0;
 
-        if(j == 0)
+        x0 = x1;
+        y0 = y1;
+        c0 = s0;
+        c1 = s1;
+        c2 = s2;
+
+        if (stopFlag)
         {
-            cout << "no roi:";
+            break;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::meanShiftFiltering(gsrc_roi, gdst_roi, sp, sr, crit);
-    };
-#endif
+    } //for iter
 
-}
+    dptr[0] = (uchar)c0;
+    dptr[1] = (uchar)c1;
+    dptr[2] = (uchar)c2;
+    dptr[3] = (uchar)c3;
 
-///////////////////////////meanShiftProc//////////////////////////////////
-struct meanShiftProc : meanShiftTestBase {};
+    COOR coor;
+    coor.x = static_cast<short>(x0);
+    coor.y = static_cast<short>(y0);
+    return coor;
+}
 
-TEST_P(meanShiftProc, Mat)
+void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp, int sr, cv::TermCriteria crit)
 {
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = 0; k < 2; k++)
+    if (src_roi.empty())
     {
-        double totalgputick = 0;
-        double totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
+        CV_Error(CV_StsBadArg, "The input image is empty");
+    }
 
-            t1 = (double)cvGetTickCount();//gpu start1
+    if (src_roi.depth() != CV_8U || src_roi.channels() != 4)
+    {
+        CV_Error(CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported");
+    }
 
-            gsrc_roi = src_roi;
-            gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));  //gdst_roi
-            gdstCoor_roi = gdstCoor(Rect(dstx, dsty, roicols, roirows));
+    CV_Assert((src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) &&
+              (src_roi.cols == dstCoor_roi.cols) && (src_roi.rows == dstCoor_roi.rows));
+    CV_Assert(!(dstCoor_roi.step & 0x3));
 
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::meanShiftProc(gsrc_roi, gdst_roi, gdstCoor_roi, sp, sr, crit);
-            t2 = (double)cvGetTickCount() - t2;//kernel
+    if (!(crit.type & cv::TermCriteria::MAX_ITER))
+    {
+        crit.maxCount = 5;
+    }
 
-            cv::Mat cpu_gdstCoor;
-            gdstCoor.download(cpu_gdstCoor);//download
+    int maxIter = std::min(std::max(crit.maxCount, 1), 100);
+    float eps;
 
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+    if (!(crit.type & cv::TermCriteria::EPS))
+    {
+        eps = 1.f;
+    }
 
-            if(j == 0)
-                continue;
+    eps = (float)std::max(crit.epsilon, 0.0);
 
-            totalgputick = t1 + totalgputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+    int tab[512];
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
+    for (int i = 0; i < 512; i++)
     {
-        Has_roi(j);
+        tab[i] = (i - 255) * (i - 255);
+    }
 
-        gsrc_roi = src_roi;
-        gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));  //gdst_roi
-        gdstCoor_roi = gdstCoor(Rect(dstx, dsty, roicols, roirows));
+    uchar *sptr = src_roi.data;
+    uchar *dptr = dst_roi.data;
+    short *dCoorptr = (short *)dstCoor_roi.data;
+    int sstep = (int)src_roi.step;
+    int dstep = (int)dst_roi.step;
+    int dCoorstep = (int)dstCoor_roi.step >> 1;
+    cv::Size size = src_roi.size();
 
-        if(j == 0)
+    for (int i = 0; i < size.height; i++, sptr += sstep - (size.width << 2),
+            dptr += dstep - (size.width << 2), dCoorptr += dCoorstep - (size.width << 1))
+    {
+        for (int j = 0; j < size.width; j++, sptr += 4, dptr += 4, dCoorptr += 2)
         {
-            cout << "no roi:";
+            *((COOR *)dCoorptr) = do_meanShift(j, i, sptr, dptr, sstep, size, sp, sr, maxIter, eps, tab);
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::meanShiftProc(gsrc_roi, gdst_roi, gdstCoor_roi, sp, sr, crit);
-    };
-#endif
+    }
 
 }
-
-///////////////////////////////////////////////////////////////////////////////////////////
-//hist
-
-void calcHistGold(const cv::Mat &src, cv::Mat &hist)
+TEST(meanShiftProc)
 {
-    hist.create(1, 256, CV_32SC1);
-    hist.setTo(cv::Scalar::all(0));
+    Mat src, dst, dstCoor_roi;
+    ocl::oclMat d_src, d_dst, d_dstCoor_roi;
 
-    int *hist_row = hist.ptr<int>();
-    for (int y = 0; y < src.rows; ++y)
+    TermCriteria crit(TermCriteria::COUNT + TermCriteria::EPS, 5, 1);
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        const uchar *src_row = src.ptr(y);
+        SUBTEST << size << 'x' << size << "; 8UC4 and CV_16SC2 ";
 
-        for (int x = 0; x < src.cols; ++x)
-            ++hist_row[src_row[x]];
-    }
-}
+        gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
+        gen(dst, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
+        gen(dstCoor_roi, size, size, CV_16SC2, Scalar::all(0), Scalar::all(256));
 
-PARAM_TEST_CASE(histTestBase, MatType, MatType)
-{
-    int type_src;
-
-    //src mat
-    cv::Mat src;
-    cv::Mat dst_hist;
-    //set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    //src mat with roi
-    cv::Mat src_roi;
-    //ocl dst mat, dst_hist and gdst_hist don't have roi
-    cv::ocl::oclMat gdst_hist;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gsrc_roi;
-
-    //    std::vector<cv::ocl::Info> oclinfo;
-
-    virtual void SetUp()
-    {
-        type_src   = GET_PARAM(0);
+        meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
+        CPU_ON;
+        meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
+        CPU_OFF;
 
-        src = randomMat(rng, size, type_src, 0, 256, false);
+        d_src.upload(src);
 
-        //        int devnums = getDevice(oclinfo);
-        //        CV_Assert(devnums > 0);
-        //if you want to use undefault device, set it here
-        //setDevice(oclinfo[0]);
-    }
+        WARMUP_ON;
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
+        WARMUP_OFF;
 
-    void Has_roi(int b)
-    {
-        if(b)
-        {
-            //randomize ROI
-            roicols = src.cols - 1;
-            roirows = src.rows - 1;
-            srcx = 1;
-            srcy = 1;
-        }
-        else
-        {
-            roicols = src.cols;
-            roirows = src.rows;
-            srcx = 0;
-            srcy = 0;
-        };
-        src_roi = src(Rect(srcx, srcy, roicols, roirows));
-    }
-};
+        GPU_ON;
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
+         ;
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
+        d_dst.download(dst);
+        d_dstCoor_roi.download(dstCoor_roi);
+        GPU_FULL_OFF;
 
-///////////////////////////calcHist///////////////////////////////////////
-struct calcHist : histTestBase {};
+    }
+}
 
-TEST_P(calcHist, Mat)
+///////////// remap////////////////////////
+TEST(remap)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = 0; k < 2; k++)
+    Mat src, dst, xmap, ymap;
+    ocl::oclMat d_src, d_dst, d_xmap, d_ymap;
+
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
+    int interpolation = INTER_LINEAR;
+    int borderMode = BORDER_CONSTANT;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        double totalcputick = 0;
-        double totalgputick = 0;
-        double totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t t = 0; t < sizeof(all_type) / sizeof(int); t++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; src " << type_name[t] << "; map CV_32FC1";
 
-            t0 = (double)cvGetTickCount();//cpu start
-            calcHistGold(src_roi, dst_hist);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[t], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
+            xmap.create(size, size, CV_32FC1);
+            dst.create(size, size, CV_32FC1);
+            ymap.create(size, size, CV_32FC1);
 
-            gsrc_roi = src_roi;
+            for (int i = 0; i < size; ++i)
+            {
+                float *xmap_row = xmap.ptr<float>(i);
+                float *ymap_row = ymap.ptr<float>(i);
 
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::calcHist(gsrc_roi, gdst_hist);
-            t2 = (double)cvGetTickCount() - t2;//kernel
+                for (int j = 0; j < size; ++j)
+                {
+                    xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
+                    ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
+                }
+            }
 
-            cv::Mat cpu_hist;
-            gdst_hist.download(cpu_hist);//download
 
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            remap(src, dst, xmap, ymap, interpolation, borderMode);
 
-            if(j == 0)
-                continue;
+            CPU_ON;
+            remap(src, dst, xmap, ymap, interpolation, borderMode);
+            CPU_OFF;
 
-            totalcputick = t0 + totalcputick;
-            totalgputick = t1 + totalgputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            d_src.upload(src);
+            d_dst.upload(dst);
+            d_xmap.upload(xmap);
+            d_ymap.upload(ymap);
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = 0; j < 2; j ++)
-    {
-        Has_roi(j);
+            WARMUP_ON;
+            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
+            WARMUP_OFF;
 
-        gsrc_roi = src_roi;
+            GPU_ON;
+            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
+             ;
+            GPU_OFF;
 
-        if(j == 0)
-        {
-            cout << "no roi:";
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::calcHist(gsrc_roi, gdst_hist);
-    };
-#endif
-}
-
 
-//************test*******************
-
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, equalizeHist, Combine(
-                            ONE_TYPE(CV_8UC1),
-                            NULL_TYPE,
-                            ONE_TYPE(CV_8UC1),
-                            NULL_TYPE,
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, bilateralFilter, Combine(
-                            Values(CV_8UC1, CV_8UC3),
-                            NULL_TYPE,
-                            Values(CV_8UC1, CV_8UC3),
-                            NULL_TYPE,
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-
-
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, CopyMakeBorder, Combine(
-                            Values(CV_8UC1, CV_8UC4/*, CV_32SC1*/),
-                            NULL_TYPE,
-                            Values(CV_8UC1, CV_8UC4/*,CV_32SC1*/),
-                            NULL_TYPE,
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, cornerMinEigenVal, Combine(
-                            Values(CV_8UC1, CV_32FC1),
-                            NULL_TYPE,
-                            ONE_TYPE(CV_32FC1),
-                            NULL_TYPE,
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, cornerHarris, Combine(
-                            Values(CV_8UC1, CV_32FC1),
-                            NULL_TYPE,
-                            ONE_TYPE(CV_32FC1),
-                            NULL_TYPE,
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-
-
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, integral, Combine(
-                            ONE_TYPE(CV_8UC1),
-                            NULL_TYPE,
-                            ONE_TYPE(CV_32SC1),
-                            ONE_TYPE(CV_32FC1),
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Imgproc, WarpAffine, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values((MatType)cv::INTER_NEAREST, (MatType)cv::INTER_LINEAR,
-                                   (MatType)cv::INTER_CUBIC, (MatType)(cv::INTER_NEAREST | cv::WARP_INVERSE_MAP),
-                                   (MatType)(cv::INTER_LINEAR | cv::WARP_INVERSE_MAP), (MatType)(cv::INTER_CUBIC | cv::WARP_INVERSE_MAP))));
-
-
-INSTANTIATE_TEST_CASE_P(Imgproc, WarpPerspective, Combine
-                        (Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                         Values((MatType)cv::INTER_NEAREST, (MatType)cv::INTER_LINEAR,
-                                (MatType)cv::INTER_CUBIC, (MatType)(cv::INTER_NEAREST | cv::WARP_INVERSE_MAP),
-                                (MatType)(cv::INTER_LINEAR | cv::WARP_INVERSE_MAP), (MatType)(cv::INTER_CUBIC | cv::WARP_INVERSE_MAP))));
-
-
-INSTANTIATE_TEST_CASE_P(Imgproc, Resize, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),  Values(cv::Size()),
-                            Values(0.5/*, 1.5, 2*/), Values(0.5/*, 1.5, 2*/), Values((MatType)cv::INTER_NEAREST, (MatType)cv::INTER_LINEAR)));
-
-
-INSTANTIATE_TEST_CASE_P(Imgproc, Threshold, Combine(
-                            Values(CV_8UC1, CV_32FC1), Values(ThreshOp(cv::THRESH_BINARY),
-                                    ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC),
-                                    ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftFiltering, Combine(
-                            ONE_TYPE(CV_8UC4),
-                            ONE_TYPE(CV_16SC2),//it is no use in meanShiftFiltering
-                            Values(5),
-                            Values(6),
-                            Values(cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 5, 1))
-                        ));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftProc, Combine(
-                            ONE_TYPE(CV_8UC4),
-                            ONE_TYPE(CV_16SC2),
-                            Values(5),
-                            Values(6),
-                            Values(cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 5, 1))
-                        ));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, Remap, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(CV_32FC1, CV_16SC2, CV_32FC2), Values(-1, CV_32FC1),
-                            Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR),
-                            Values((int)cv::BORDER_CONSTANT)));
-
-INSTANTIATE_TEST_CASE_P(histTestBase, calcHist, Combine(
-                            ONE_TYPE(CV_8UC1),
-                            ONE_TYPE(CV_32SC1) //no use
-                        ));
-
-#endif // HAVE_OPENCL
+    }
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_match_template.cpp b/modules/ocl/perf/perf_match_template.cpp
index cb5e86bab9..2828efe01a 100644
--- a/modules/ocl/perf/perf_match_template.cpp
+++ b/modules/ocl/perf/perf_match_template.cpp
@@ -42,191 +42,105 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-//////// Utility
-#ifndef DIFFERENT_SIZES
-#else
-#undef DIFFERENT_SIZES
-#endif
-#define DIFFERENT_SIZES testing::Values(cv::Size(256, 256), cv::Size(3000, 3000))
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-{ \
-public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-private: \
-    type val_; \
-}; \
-    inline void PrintTo( name param, std::ostream* os) \
-{ \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-}
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate
-#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF_NORMED))
-
-IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
-
-const char *TEMPLATE_METHOD_NAMES[6] = {"TM_SQDIFF", "TM_SQDIFF_NORMED", "TM_CCORR", "TM_CCORR_NORMED", "TM_CCOEFF", "TM_CCOEFF_NORMED"};
-
-PARAM_TEST_CASE(MatchTemplate, cv::Size, TemplateSize, Channels, TemplateMethod)
-{
-    cv::Size size;
-    cv::Size templ_size;
-    int cn;
-    int method;
-    //vector<cv::ocl::Info> oclinfo;
 
-    virtual void SetUp()
-    {
-        size = GET_PARAM(0);
-        templ_size = GET_PARAM(1);
-        cn = GET_PARAM(2);
-        method = GET_PARAM(3);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-};
-struct MatchTemplate8U : MatchTemplate {};
-
-TEST_P(MatchTemplate8U, Performance)
+/////////// matchTemplate ////////////////////////
+//void InitMatchTemplate()
+//{
+//	Mat src; gen(src, 500, 500, CV_32F, 0, 1);
+//	Mat templ; gen(templ, 500, 500, CV_32F, 0, 1);
+//	ocl::oclMat d_src(src), d_templ(templ), d_dst;
+//	ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+//}
+TEST(matchTemplate)
 {
-    std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
-    std::cout << "Image Size: (" << size.width << ", " << size.height << ")" << std::endl;
-    std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")" << std::endl;
-    std::cout << "Channels: " << cn << std::endl;
-
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
-
+    //InitMatchTemplate();
 
+    Mat src, templ, dst;
+    int templ_size = 5;
 
 
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
+        int all_type[] = {CV_32FC1, CV_32FC4};
+        std::string type_name[] = {"CV_32FC1", "CV_32FC4"};
 
-        t1 = (double)cvGetTickCount();//gpu start1
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
+            {
+                gen(src, size, size, all_type[j], 0, 1);
 
-        cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);//upload
-        cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);//upload
+                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR";
 
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
-        t2 = (double)cvGetTickCount() - t2;//kernel
+                gen(templ, templ_size, templ_size, all_type[j], 0, 1);
 
-        cv::Mat cpu_dst;
-        dst.download (cpu_dst);//download
+                matchTemplate(src, templ, dst, CV_TM_CCORR);
 
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
+                CPU_ON;
+                matchTemplate(src, templ, dst, CV_TM_CCORR);
+                CPU_OFF;
 
-        if(j == 0)
-            continue;
+                ocl::oclMat d_src(src), d_templ, d_dst;
 
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
+                d_templ.upload(templ);
 
-    }
+                WARMUP_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+                WARMUP_OFF;
 
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+                GPU_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+                 ;
+                GPU_OFF;
 
+                GPU_FULL_ON;
+                d_src.upload(src);
+                d_templ.upload(templ);
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+                d_dst.download(dst);
+                GPU_FULL_OFF;
+            }
+        }
 
-}
+        int all_type_8U[] = {CV_8UC1};
+        std::string type_name_8U[] = {"CV_8UC1"};
 
+        for (size_t j = 0; j < sizeof(all_type_8U) / sizeof(int); j++)
+        {
+            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
+            {
+                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name_8U[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR_NORMED";
 
-struct MatchTemplate32F : MatchTemplate {};
-TEST_P(MatchTemplate32F, Performance)
-{
-    std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
-    std::cout << "Image Size: (" << size.width << ", " << size.height << ")" << std::endl;
-    std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")" << std::endl;
-    std::cout << "Channels: " << cn << std::endl;
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));
+                gen(src, size, size, all_type_8U[j], 0, 255);
 
-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
+                gen(templ, templ_size, templ_size, all_type_8U[j], 0, 255);
 
+                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
 
+                CPU_ON;
+                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
+                CPU_OFF;
 
+                ocl::oclMat d_src(src);
+                ocl::oclMat d_templ(templ), d_dst;
 
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
+                WARMUP_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                WARMUP_OFF;
 
-        cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);//upload
-        cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
+                GPU_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                 ;
+                GPU_OFF;
 
+                GPU_FULL_ON;
+                d_src.upload(src);
+                d_templ.upload(templ);
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                d_dst.download(dst);
+                GPU_FULL_OFF;
+            }
+        }
     }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-
-}
-
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U,
-                        testing::Combine(
-                            testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT), cv::Size(1800, 1500)),
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
-                            testing::Values(Channels(1), Channels(4)/*, Channels(3)*/),
-                            ALL_TEMPLATE_METHODS
-                        )
-                       );
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
-                            testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT), cv::Size(1800, 1500)),
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
-                            testing::Values(Channels(1), Channels(4) /*, Channels(3)*/),
-                            testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
-
-#endif //HAVE_OPENCL
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_matrix_operation.cpp b/modules/ocl/perf/perf_matrix_operation.cpp
index ba011f8dfc..495b2b82cf 100644
--- a/modules/ocl/perf/perf_matrix_operation.cpp
+++ b/modules/ocl/perf/perf_matrix_operation.cpp
@@ -10,12 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,697 +42,140 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv::ocl;
-////////////////////////////////converto/////////////////////////////////////////////////
-PARAM_TEST_CASE(ConvertToTestBase, MatType, MatType)
+///////////// ConvertTo////////////////////////
+TEST(ConvertTo)
 {
-    int type;
-    int dst_type;
-
-    //src mat
-    cv::Mat mat;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type     = GET_PARAM(0);
-        dst_type = GET_PARAM(1);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        mat = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
-    }
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-    void Has_roi(int b)
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcy   = 1;
-            dstx    = 1;
-            dsty    = 1;
-        }
-        else
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx   = 0;
-            srcy   = 0;
-            dstx   = 0;
-            dsty   = 0;
-        };
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " to 32FC1";
 
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
+            gen(src, size, size, all_type[j], 0, 256);
+            //gen(dst, size, size, all_type[j], 0, 256);
 
-        //gdst_whole = dst;
-        //gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+            //d_dst.upload(dst);
 
-        //gmat = mat_roi;
-    }
-};
+            src.convertTo(dst, CV_32FC1);
 
+            CPU_ON;
+            src.convertTo(dst, CV_32FC1);
+            CPU_OFF;
 
-struct ConvertTo : ConvertToTestBase {};
+            d_src.upload(src);
 
-TEST_P(ConvertTo, Accuracy)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.convertTo(dst_roi, dst_type);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat = mat_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.convertTo(gdst, dst_type);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            d_src.convertTo(d_dst, CV_32FC1);
+            WARMUP_OFF;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            GPU_ON;
+            d_src.convertTo(d_dst, CV_32FC1);
+             ;
+            GPU_OFF;
 
-        gmat = mat_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_src.convertTo(d_dst, CV_32FC1);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.convertTo(gdst, dst_type);
-    };
-#endif
 
+    }
 }
-
-
-///////////////////////////////////////////copyto/////////////////////////////////////////////////////////////
-
-PARAM_TEST_CASE(CopyToTestBase, MatType, bool)
+///////////// copyTo////////////////////////
+TEST(copyTo)
 {
-    int type;
-
-    cv::Mat mat;
-    cv::Mat mask;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-        mat = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcy   = 1;
-            dstx    = 1;
-            dsty    = 1;
-            maskx   = 1;
-            masky   = 1;
-        }
-        else
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx   = 0;
-            srcy   = 0;
-            dstx   = 0;
-            dsty   = 0;
-            maskx   = 0;
-            masky   = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-
-        //gdst_whole = dst;
-        //gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gmat = mat_roi;
-        //gmask = mask_roi;
-    }
-};
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-struct CopyTo : CopyToTestBase {};
+            gen(src, size, size, all_type[j], 0, 256);
+            //gen(dst, size, size, all_type[j], 0, 256);
 
-TEST_P(CopyTo, Without_mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.copyTo(dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat = mat_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.copyTo(gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            //d_dst.upload(dst);
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            src.copyTo(dst);
 
-        gmat = mat_roi;
+            CPU_ON;
+            src.copyTo(dst);
+            CPU_OFF;
 
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.copyTo(gdst);
-    };
-#endif
-}
+            d_src.upload(src);
 
-TEST_P(CopyTo, With_mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.copyTo(dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat = mat_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.copyTo(gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            d_src.copyTo(d_dst);
+            WARMUP_OFF;
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gmat = mat_roi;
-        gmask = mask_roi;
+            GPU_ON;
+            d_src.copyTo(d_dst);
+             ;
+            GPU_OFF;
 
-        if(j == 0)
-        {
-            cout << "no roi:";
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_src.copyTo(d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.copyTo(gdst, gmask);
-    };
-#endif
-}
-
-///////////////////////////////////////////copyto/////////////////////////////////////////////////////////////
 
-PARAM_TEST_CASE(SetToTestBase, MatType, bool)
+    }
+}
+///////////// setTo////////////////////////
+TEST(setTo)
 {
-    int type;
-    cv::Scalar val;
-
-    cv::Mat mat;
-    cv::Mat mask;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int maskx;
-    int masky;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-    cv::Mat mask_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gmat_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+    Mat src, dst;
+    Scalar val(1, 2, 3, 4);
+    ocl::oclMat d_src, d_dst;
 
-        mat = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcy   = 1;
-            maskx   = 1;
-            masky   = 1;
-        }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx   = 0;
-            srcy   = 0;
-            maskx   = 0;
-            masky   = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-
-        //gmat_whole = mat;
-        //gmat = gmat_whole(Rect(srcx,srcy,roicols,roirows));
-
-        //gmask = mask_roi;
-    }
-};
-
-struct SetTo : SetToTestBase {};
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-TEST_P(SetTo, Without_mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.setTo(val);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat_whole = mat;
-            gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.setTo(val);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gmat_whole.download(cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            gen(src, size, size, all_type[j], 0, 256);
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat_whole = mat;
-        gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
+            src.setTo(val);
 
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.setTo(val);
-    };
-#endif
-}
+            CPU_ON;
+            src.setTo(val);
+            CPU_OFF;
 
-TEST_P(SetTo, With_mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.setTo(val, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat_whole = mat;
-            gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
-
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.setTo(val, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gmat_whole.download(cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            d_src.upload(src);
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat_whole = mat;
-        gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
+            WARMUP_ON;
+            d_src.setTo(val);
+            WARMUP_OFF;
 
-        gmask = mask_roi;
+            GPU_ON;
+            d_src.setTo(val);
+             ;
+            GPU_OFF;
 
-        if(j == 0)
-        {
-            cout << "no roi:";
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_src.setTo(val);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.setTo(val, gmask);
-    };
-#endif
-}
-PARAM_TEST_CASE(DataTransfer, MatType, bool)
-{
-    int type;
-    cv::Mat mat;
-    cv::ocl::oclMat gmat_whole;
 
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-        mat = randomMat(rng, size, type, 5, 16, false);
-    }
-};
-TEST_P(DataTransfer, perf)
-{
-    double totaluploadtick = 0;
-    double totaldownloadtick = 0;
-    double totaltick = 0;
-    double t0 = 0;
-    double t1 = 0;
-    cv::Mat cpu_dst;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-        t0 = (double)cvGetTickCount();
-        gmat_whole.upload(mat);//upload
-        t0 = (double)cvGetTickCount() - t0;
-
-        t1 = (double)cvGetTickCount();
-        gmat_whole.download(cpu_dst);//download
-        t1 = (double)cvGetTickCount() - t1;
-
-        if(j == 0)
-            continue;
-        totaluploadtick = t0 + totaluploadtick;
-        totaldownloadtick = t1 + totaldownloadtick;
     }
-    totaltick = totaluploadtick + totaldownloadtick;
-    cout << "average upload time is  " << totaluploadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average download time is  " << totaldownloadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average data transfer time is  " << totaltick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
-//**********test************
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4)));
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, SetTo, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-INSTANTIATE_TEST_CASE_P(MatrixOperation, DataTransfer, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-#endif
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_norm.cpp b/modules/ocl/perf/perf_norm.cpp
new file mode 100644
index 0000000000..8b7118a6ea
--- /dev/null
+++ b/modules/ocl/perf/perf_norm.cpp
@@ -0,0 +1,84 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+///////////// norm////////////////////////
+TEST(norm)
+{
+    Mat src, buf;
+    ocl::oclMat d_src, d_buf;
+
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";
+
+        gen(src, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+        gen(buf, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+
+        norm(src, NORM_INF);
+
+        CPU_ON;
+        norm(src, NORM_INF);
+        CPU_OFF;
+
+        d_src.upload(src);
+        d_buf.upload(buf);
+
+        WARMUP_ON;
+        ocl::norm(d_src, d_buf, NORM_INF);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::norm(d_src, d_buf, NORM_INF);
+         ;
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::norm(d_src, d_buf, NORM_INF);
+        GPU_FULL_OFF;
+    }
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_pyrdown.cpp b/modules/ocl/perf/perf_pyrdown.cpp
index 2cea4df4a3..1d1d2dec11 100644
--- a/modules/ocl/perf/perf_pyrdown.cpp
+++ b/modules/ocl/perf/perf_pyrdown.cpp
@@ -1,4 +1,4 @@
-///////////////////////////////////////////////////////////////////////////////////////
+/*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    fangfang bai, fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,96 +42,46 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
 
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-PARAM_TEST_CASE(PyrDown, MatType, int)
+///////////// pyrDown //////////////////////
+TEST(pyrDown)
 {
-    int type;
-    int channels;
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
+    Mat src, dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-
-
-};
-
-#define VARNAME(A) string(#A);
-
-////////////////////////////////PyrDown/////////////////////////////////////////////////
-TEST_P(PyrDown, Mat)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::RNG &rng = TS::ptr()->get_rng();
-    mat1 = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-
-
-    cv::ocl::oclMat gdst;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-    for (int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
+            gen(src, size, size, all_type[j], 0, 256);
 
-        t1 = (double)cvGetTickCount();//gpu start1
+            pyrDown(src, dst);
 
-        cv::ocl::oclMat gmat1(mat1);
+            CPU_ON;
+            pyrDown(src, dst);
+            CPU_OFF;
 
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::pyrDown(gmat1, gdst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst;
 
-        cv::Mat cpu_dst;
-        gdst.download(cpu_dst);
+            WARMUP_ON;
+            ocl::pyrDown(d_src, d_dst);
+            WARMUP_OFF;
 
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
+            GPU_ON;
+            ocl::pyrDown(d_src, d_dst);
+             ;
+            GPU_OFF;
 
-        if (j == 0)
-        {
-            continue;
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::pyrDown(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
     }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-}
-
-//********test****************
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-
-
-#endif // HAVE_OPENCL
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_pyrlk.cpp b/modules/ocl/perf/perf_pyrlk.cpp
new file mode 100644
index 0000000000..f7fc22b9d0
--- /dev/null
+++ b/modules/ocl/perf/perf_pyrlk.cpp
@@ -0,0 +1,143 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+///////////// PyrLKOpticalFlow ////////////////////////
+TEST(PyrLKOpticalFlow)
+{
+    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};
+    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};
+
+    for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
+    {
+        Mat frame0 = imread(abspath(images1[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);
+
+        if (frame0.empty())
+        {
+            std::string errstr = "can't open " + images1[i];
+            throw runtime_error(errstr);
+        }
+
+        Mat frame1 = imread(abspath(images2[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);
+
+        if (frame1.empty())
+        {
+            std::string errstr = "can't open " + images2[i];
+            throw runtime_error(errstr);
+        }
+
+        Mat gray_frame;
+
+        if (i == 0)
+        {
+            cvtColor(frame0, gray_frame, COLOR_BGR2GRAY);
+        }
+
+        for (int points = Min_Size; points <= Max_Size; points *= Multiple)
+        {
+            if (i == 0)
+                SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
+            else
+                SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
+            Mat nextPts_cpu;
+            Mat status_cpu;
+
+            vector<Point2f> pts;
+            goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0);
+
+            vector<Point2f> nextPts;
+            vector<unsigned char> status;
+
+            vector<float> err;
+
+            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
+
+            CPU_ON;
+            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
+            CPU_OFF;
+
+            ocl::PyrLKOpticalFlow d_pyrLK;
+
+            ocl::oclMat d_frame0(frame0);
+            ocl::oclMat d_frame1(frame1);
+
+            ocl::oclMat d_pts;
+            Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void *)&pts[0]);
+            d_pts.upload(pts_mat);
+
+            ocl::oclMat d_nextPts;
+            ocl::oclMat d_status;
+            ocl::oclMat d_err;
+
+            WARMUP_ON;
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+            WARMUP_OFF;
+
+            GPU_ON;
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_frame0.upload(frame0);
+            d_frame1.upload(frame1);
+            d_pts.upload(pts_mat);
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+
+            if (!d_nextPts.empty())
+            {
+                d_nextPts.download(nextPts_cpu);
+            }
+
+            if (!d_status.empty())
+            {
+                d_status.download(status_cpu);
+            }
+
+            GPU_FULL_OFF;
+        }
+
+    }
+}
diff --git a/modules/ocl/perf/perf_pyrup.cpp b/modules/ocl/perf/perf_pyrup.cpp
index f2400610ad..d3b3003a2e 100644
--- a/modules/ocl/perf/perf_pyrup.cpp
+++ b/modules/ocl/perf/perf_pyrup.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    fangfang bai fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,80 +42,46 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
 
-PARAM_TEST_CASE(PyrUp, MatType, int)
+///////////// pyrUp ////////////////////////
+TEST(pyrUp)
 {
-    int type;
-    int channels;
-    //std::vector<cv::ocl::Info> oclinfo;
+    Mat src, dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    virtual void SetUp()
+    for (int size = 500; size <= 2000; size *= 2)
     {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-};
-
-TEST_P(PyrUp, Performance)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat src = randomMat(size, CV_MAKETYPE(type, channels));
-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
-
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
+            gen(src, size, size, all_type[j], 0, 256);
 
-    double t1 = 0;
-    double t2 = 0;
+            pyrUp(src, dst);
 
-    for (int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-        t1 = (double)cvGetTickCount();//gpu start1
+            CPU_ON;
+            pyrUp(src, dst);
+            CPU_OFF;
 
-        cv::ocl::oclMat srcMat = cv::ocl::oclMat(src);//upload
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst;
 
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::pyrUp(srcMat, dst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
+            WARMUP_ON;
+            ocl::pyrUp(d_src, d_dst);
+            WARMUP_OFF;
 
-        cv::Mat cpu_dst;
-        dst.download(cpu_dst); //download
+            GPU_ON;
+            ocl::pyrUp(d_src, d_dst);
+             ;
+            GPU_OFF;
 
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if (j == 0)
-        {
-            continue;
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::pyrUp(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
     }
-
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-
-#endif // HAVE_OPENCL
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_split_merge.cpp b/modules/ocl/perf/perf_split_merge.cpp
index 67a3d24aea..48ff1ff15a 100644
--- a/modules/ocl/perf/perf_split_merge.cpp
+++ b/modules/ocl/perf/perf_split_merge.cpp
@@ -10,12 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,446 +42,109 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv::ocl;
-PARAM_TEST_CASE(MergeTestBase, MatType, int)
+///////////// Merge////////////////////////
+TEST(Merge)
 {
-    int type;
-    int channels;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mat3;
-    cv::Mat mat4;
-
-    //dst mat
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int src3x;
-    int src3y;
-    int src4x;
-    int src4y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mat3_roi;
-    cv::Mat mat4_roi;
-
-    //dst mat with roi
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gmat3;
-    cv::ocl::oclMat gmat4;
-    cv::ocl::oclMat gdst;
+    Mat dst;
+    ocl::oclMat d_dst;
 
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+    int channels = 4;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
 
-        mat1 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat2 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat3 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst  = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
-    }
-    void Has_roi(int b)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            //randomize ROI
-            roicols =  mat1.cols - 1; //start
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            src2x   = 1;
-            src2y   = 1;
-            src3x   = 1;
-            src3y   = 1;
-            src4x   = 1;
-            src4y   = 1;
-            dstx    = 1;
-            dsty    = 1;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+            Size size1 = Size(size, size);
+            std::vector<Mat> src(channels);
 
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x   = 0;
-            src1y   = 0;
-            src2x   = 0;
-            src2y   = 0;
-            src3x   = 0;
-            src3y   = 0;
-            src4x   = 0;
-            src4y   = 0;
-            dstx    = 0;
-            dsty    = 0;
-        };
+            for (int i = 0; i < channels; ++i)
+            {
+                src[i] = Mat(size1, all_type[j], cv::Scalar::all(i));
+            }
 
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mat3_roi = mat3(Rect(src3x, src3y, roicols, roirows));
-        mat4_roi = mat4(Rect(src4x, src4y, roicols, roirows));
+            merge(src, dst);
 
+            CPU_ON;
+            merge(src, dst);
+            CPU_OFF;
 
-        dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
-    }
+            std::vector<ocl::oclMat> d_src(channels);
 
-};
+            for (int i = 0; i < channels; ++i)
+            {
+                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
+            }
 
-struct Merge : MergeTestBase {};
+            WARMUP_ON;
+            ocl::merge(d_src, d_dst);
+            WARMUP_OFF;
 
-TEST_P(Merge, Accuracy)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            std::vector<cv::Mat> dev_src;
-            dev_src.push_back(mat1_roi);
-            dev_src.push_back(mat2_roi);
-            dev_src.push_back(mat3_roi);
-            dev_src.push_back(mat4_roi);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::merge(dev_src, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1	]
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmat3 = mat3_roi;
-            gmat4 = mat4_roi;
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            std::vector<cv::ocl::oclMat> dev_gsrc;
-            dev_gsrc.push_back(gmat1);
-            dev_gsrc.push_back(gmat2);
-            dev_gsrc.push_back(gmat3);
-            dev_gsrc.push_back(gmat4);
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::merge(dev_gsrc, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            GPU_ON;
+            ocl::merge(d_src, d_dst);
+             ;
+            GPU_OFF;
 
-            if(j == 0)
-                continue;
+            GPU_FULL_ON;
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            for (int i = 0; i < channels; ++i)
+            {
+                d_src[i] = ocl::oclMat(size1, CV_8U, cv::Scalar::all(i));
+            }
 
+            ocl::merge(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmat3 = mat3_roi;
-        gmat4 = mat4_roi;
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        std::vector<cv::ocl::oclMat> dev_gsrc;
-        dev_gsrc.push_back(gmat1);
-        dev_gsrc.push_back(gmat2);
-        dev_gsrc.push_back(gmat3);
-        dev_gsrc.push_back(gmat4);
 
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::merge(dev_gsrc, gdst);
-    };
-#endif
+    }
 }
 
-
-PARAM_TEST_CASE(SplitTestBase, MatType, int)
+///////////// Split////////////////////////
+TEST(Split)
 {
-    int type;
-    int channels;
-
-    //src mat
-    cv::Mat mat;
-
-    //dstmat
-    cv::Mat dst1;
-    cv::Mat dst2;
-    cv::Mat dst3;
-    cv::Mat dst4;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dst1x;
-    int dst1y;
-    int dst2x;
-    int dst2y;
-    int dst3x;
-    int dst3y;
-    int dst4x;
-    int dst4y;
+    //int channels = 4;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
 
-    //src mat with roi
-    cv::Mat mat_roi;
-
-    //dst mat with roi
-    cv::Mat dst1_roi;
-    cv::Mat dst2_roi;
-    cv::Mat dst3_roi;
-    cv::Mat dst4_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst1_whole;
-    cv::ocl::oclMat gdst2_whole;
-    cv::ocl::oclMat gdst3_whole;
-    cv::ocl::oclMat gdst4_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst1;
-    cv::ocl::oclMat gdst2;
-    cv::ocl::oclMat gdst3;
-    cv::ocl::oclMat gdst4;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        mat  = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-        dst1 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst2 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst3 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcx   = 1;
-            dst1x    = 1;
-            dst1y    = 1;
-            dst2x    = 1;
-            dst2y    = 1;
-            dst3x    = 1;
-            dst3y    = 1;
-            dst4x    = 1;
-            dst4y    = 1;
-        }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx = 0;
-            srcy = 0;
-            dst1x = 0;
-            dst1y = 0;
-            dst2x    = 0;
-            dst2y    = 0;
-            dst3x    = 0;
-            dst3y    = 0;
-            dst4x    = 0;
-            dst4y    = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-
-        dst1_roi = dst1(Rect(dst1x, dst1y, roicols, roirows));
-        dst2_roi = dst2(Rect(dst2x, dst2y, roicols, roirows));
-        dst3_roi = dst3(Rect(dst3x, dst3y, roicols, roirows));
-        dst4_roi = dst4(Rect(dst4x, dst4y, roicols, roirows));
-    }
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
+            Size size1 = Size(size, size);
 
-};
+            Mat src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
 
-struct Split : SplitTestBase {};
+            std::vector<cv::Mat> dst;
 
-TEST_P(Split, Accuracy)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            cv::Mat         dev_dst[4]  = {dst1_roi, dst2_roi, dst3_roi, dst4_roi};
-            cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4};
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::split(mat_roi, dev_dst);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            split(src, dst);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst1_whole = dst1;
-            gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows));
+            CPU_ON;
+            split(src, dst);
+            CPU_OFF;
 
-            gdst2_whole = dst2;
-            gdst2 = gdst2_whole(Rect(dst2x, dst2y, roicols, roirows));
+            ocl::oclMat d_src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
+            std::vector<cv::ocl::oclMat> d_dst;
 
-            gdst3_whole = dst3;
-            gdst3 = gdst3_whole(Rect(dst3x, dst3y, roicols, roirows));
+            WARMUP_ON;
+            ocl::split(d_src, d_dst);
+            WARMUP_OFF;
 
-            gdst4_whole = dst4;
-            gdst4 = gdst4_whole(Rect(dst4x, dst4y, roicols, roirows));
+            GPU_ON;
+            ocl::split(d_src, d_dst);
+             ;
+            GPU_OFF;
 
-            gmat = mat_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::split(gmat, dev_gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst1;
-            cv::Mat cpu_dst2;
-            cv::Mat cpu_dst3;
-            cv::Mat cpu_dst4;
-            gdst1_whole.download(cpu_dst1);
-            gdst2_whole.download(cpu_dst2);
-            gdst3_whole.download(cpu_dst3);
-            gdst4_whole.download(cpu_dst4);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::split(d_src, d_dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        //cv::Mat         dev_dst[4]  = {dst1_roi, dst2_roi, dst3_roi, dst4_roi};
-        cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4};
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows));
-
-        gdst2_whole = dst2;
-        gdst2 = gdst2_whole(Rect(dst2x, dst2y, roicols, roirows));
-
-        gdst3_whole = dst3;
-        gdst3 = gdst3_whole(Rect(dst3x, dst3y, roicols, roirows));
 
-        gdst4_whole = dst4;
-        gdst4 = gdst4_whole(Rect(dst4x, dst4y, roicols, roirows));
-        gmat = mat_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::split(gmat, dev_gdst);
-    };
-#endif
+    }
 }
-
-//*************test*****************
-INSTANTIATE_TEST_CASE_P(SplitMerge, Merge, Combine(
-                            Values(CV_8UC4, CV_32FC4), Values(1, 4)));
-
-INSTANTIATE_TEST_CASE_P(SplitMerge, Split , Combine(
-                            Values(CV_8U, CV_32S, CV_32F), Values(1, 4)));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/precomp.cpp b/modules/ocl/perf/precomp.cpp
index 7d287004ee..d2eaffdf1a 100644
--- a/modules/ocl/perf/precomp.cpp
+++ b/modules/ocl/perf/precomp.cpp
@@ -7,12 +7,13 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +22,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other oclMaterials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -41,4 +42,321 @@
 
 #include "precomp.hpp"
 
+// This program test most of the functions in ocl module and generate data metrix of x-factor in .csv files
+// All images needed in this test are in samples/gpu folder.
+// For haar template, haarcascade_frontalface_alt.xml shouold be in working directory
+void TestSystem::run()
+{
+    if (is_list_mode_)
+    {
+        for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
+        {
+            cout << (*it)->name() << endl;
+        }
+
+        return;
+    }
+
+    // Run test initializers
+    for (vector<Runnable *>::iterator it = inits_.begin(); it != inits_.end(); ++it)
+    {
+        if ((*it)->name().find(test_filter_, 0) != string::npos)
+        {
+            (*it)->run();
+        }
+    }
+
+    printHeading();
+    writeHeading();
+
+    // Run tests
+    for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
+    {
+        try
+        {
+            if ((*it)->name().find(test_filter_, 0) != string::npos)
+            {
+                cout << endl << (*it)->name() << ":\n";
+
+                setCurrentTest((*it)->name());
+                //fprintf(record_,"%s\n",(*it)->name().c_str());
+
+                (*it)->run();
+                finishCurrentSubtest();
+            }
+        }
+        catch (const Exception &)
+        {
+            // Message is printed via callback
+            resetCurrentSubtest();
+        }
+        catch (const runtime_error &e)
+        {
+            printError(e.what());
+            resetCurrentSubtest();
+        }
+    }
+
+    printSummary();
+    writeSummary();
+}
+
+
+void TestSystem::finishCurrentSubtest()
+{
+    if (cur_subtest_is_empty_)
+        // There is no need to print subtest statistics
+    {
+        return;
+    }
+
+    double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
+    double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
+    double gpu_full_time = gpu_full_elapsed_ / getTickFrequency() * 1000.0;
+
+    double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
+    speedup_total_ += speedup;
+
+    double fullspeedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_full_elapsed_);
+    speedup_full_total_ += fullspeedup;
+
+    if (speedup > top_)
+    {
+        speedup_faster_count_++;
+    }
+    else if (speedup < bottom_)
+    {
+        speedup_slower_count_++;
+    }
+    else
+    {
+        speedup_equal_count_++;
+    }
+
+    if (fullspeedup > top_)
+    {
+        speedup_full_faster_count_++;
+    }
+    else if (fullspeedup < bottom_)
+    {
+        speedup_full_slower_count_++;
+    }
+    else
+    {
+        speedup_full_equal_count_++;
+    }
+
+    // compute min, max and
+    std::sort(gpu_times_.begin(), gpu_times_.end());
+    double gpu_min = gpu_times_.front() / getTickFrequency() * 1000.0;
+    double gpu_max = gpu_times_.back() / getTickFrequency() * 1000.0;
+    double deviation = 0;
+
+    if (gpu_times_.size() > 1)
+    {
+        double sum = 0;
+
+        for (size_t i = 0; i < gpu_times_.size(); i++)
+        {
+            int64 diff = gpu_times_[i] - static_cast<int64>(gpu_elapsed_);
+            double diff_time = diff * 1000 / getTickFrequency();
+            sum += diff_time * diff_time;
+        }
+
+        deviation = std::sqrt(sum / gpu_times_.size());
+    }
+
+    printMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
+    writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
+
+    num_subtests_called_++;
+    resetCurrentSubtest();
+}
+
+
+double TestSystem::meanTime(const vector<int64> &samples)
+{
+    double sum = accumulate(samples.begin(), samples.end(), 0.);
+    return sum / samples.size();
+}
+
+
+void TestSystem::printHeading()
+{
+    cout << endl;
+    cout << setiosflags(ios_base::left);
+    cout << TAB << setw(10) << "CPU, ms" << setw(10) << "GPU, ms"
+         << setw(14) << "SPEEDUP" << setw(14) << "GPUTOTAL, ms" << setw(14) << "TOTALSPEEDUP"
+         << "DESCRIPTION\n";
+
+    cout << resetiosflags(ios_base::left);
+}
+
+void TestSystem::writeHeading()
+{
+    if (!record_)
+    {
+        recordname_ += "_OCL.csv";
+        record_ = fopen(recordname_.c_str(), "w");
+    }
+
+    fprintf(record_, "NAME,DESCRIPTION,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
+
+    fflush(record_);
+}
+
+void TestSystem::printSummary()
+{
+    cout << setiosflags(ios_base::fixed);
+    cout << "\naverage GPU speedup: x"
+         << setprecision(3) << speedup_total_ / std::max(1, num_subtests_called_)
+         << endl;
+    cout << "\nGPU exceeded: "
+         << setprecision(3) << speedup_faster_count_
+         << "\nGPU passed: "
+         << setprecision(3) << speedup_equal_count_
+         << "\nGPU failed: "
+         << setprecision(3) << speedup_slower_count_
+         << endl;
+    cout << "\nGPU exceeded rate: "
+         << setprecision(3) << (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100
+         << "%"
+         << "\nGPU passed rate: "
+         << setprecision(3) << (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100
+         << "%"
+         << "\nGPU failed rate: "
+         << setprecision(3) << (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100
+         << "%"
+         << endl;
+    cout << "\naverage GPUTOTAL speedup: x"
+         << setprecision(3) << speedup_full_total_ / std::max(1, num_subtests_called_)
+         << endl;
+    cout << "\nGPUTOTAL exceeded: "
+         << setprecision(3) << speedup_full_faster_count_
+         << "\nGPUTOTAL passed: "
+         << setprecision(3) << speedup_full_equal_count_
+         << "\nGPUTOTAL failed: "
+         << setprecision(3) << speedup_full_slower_count_
+         << endl;
+    cout << "\nGPUTOTAL exceeded rate: "
+         << setprecision(3) << (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100
+         << "%"
+         << "\nGPUTOTAL passed rate: "
+         << setprecision(3) << (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100
+         << "%"
+         << "\nGPUTOTAL failed rate: "
+         << setprecision(3) << (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
+         << "%"
+         << endl;
+    cout << resetiosflags(ios_base::fixed);
+}
+
+
+void TestSystem::printMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup)
+{
+    cout << TAB << setiosflags(ios_base::left);
+    stringstream stream;
+
+    stream << cpu_time;
+    cout << setw(10) << stream.str();
+
+    stream.str("");
+    stream << gpu_time;
+    cout << setw(10) << stream.str();
+
+    stream.str("");
+    stream << "x" << setprecision(3) << speedup;
+    cout << setw(14) << stream.str();
+
+    stream.str("");
+    stream << gpu_full_time;
+    cout << setw(14) << stream.str();
+
+    stream.str("");
+    stream << "x" << setprecision(3) << fullspeedup;
+    cout << setw(14) << stream.str();
+
+    cout << cur_subtest_description_.str();
+    cout << resetiosflags(ios_base::left) << endl;
+}
+
+void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
+{
+    if (!record_)
+    {
+        recordname_ += ".csv";
+        record_ = fopen(recordname_.c_str(), "w");
+    }
+
+    fprintf(record_, "%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
+            cur_subtest_description_.str().c_str(),
+            cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
+            gpu_min, gpu_max, std_dev);
+
+    if (itname_changed_)
+    {
+        itname_changed_ = false;
+    }
+
+    fflush(record_);
+}
+
+void TestSystem::writeSummary()
+{
+    if (!record_)
+    {
+        recordname_ += ".csv";
+        record_ = fopen(recordname_.c_str(), "w");
+    }
+
+    fprintf(record_, "\nAverage GPU speedup: %.3f\n"
+            "exceeded: %d (%.3f%%)\n"
+            "passed: %d (%.3f%%)\n"
+            "failed: %d (%.3f%%)\n"
+            "\nAverage GPUTOTAL speedup: %.3f\n"
+            "exceeded: %d (%.3f%%)\n"
+            "passed: %d (%.3f%%)\n"
+            "failed: %d (%.3f%%)\n",
+            speedup_total_ / std::max(1, num_subtests_called_),
+            speedup_faster_count_, (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_equal_count_, (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_slower_count_, (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_full_total_ / std::max(1, num_subtests_called_),
+            speedup_full_faster_count_, (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_full_equal_count_, (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_full_slower_count_, (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
+           );
+    fflush(record_);
+}
+
+void TestSystem::printError(const std::string &msg)
+{
+    if(msg != "CL_INVALID_BUFFER_SIZE")
+    {
+        cout << TAB << "[error: " << msg << "] " << cur_subtest_description_.str() << endl;
+    }
+}
+
+void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
+{
+    mat.create(rows, cols, type);
+    RNG rng(0);
+    rng.fill(mat, RNG::UNIFORM, low, high);
+}
+
+
+string abspath(const string &relpath)
+{
+    return TestSystem::instance().workingDir() + relpath;
+}
+
+
+int CV_CDECL cvErrorCallback(int /*status*/, const char * /*func_name*/,
+                             const char *err_msg, const char * /*file_name*/,
+                             int /*line*/, void * /*userdata*/)
+{
+    TestSystem::instance().printError(err_msg);
+    return 0;
+}
+
 
diff --git a/modules/ocl/perf/precomp.hpp b/modules/ocl/perf/precomp.hpp
index 208e5ebc64..9e757d24da 100644
--- a/modules/ocl/perf/precomp.hpp
+++ b/modules/ocl/perf/precomp.hpp
@@ -7,12 +7,13 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +22,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other oclMaterials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -39,38 +40,354 @@
 //
 //M*/
 
-#ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wmissing-declarations"
-#  if defined __clang__ || defined __APPLE__
-#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
-#    pragma GCC diagnostic ignored "-Wextra"
-#  endif
-#endif
-
-#ifndef __OPENCV_TEST_PRECOMP_HPP__
-#define __OPENCV_TEST_PRECOMP_HPP__
-
-#include <cmath>
-#include <cstdio>
-#include <iostream>
-#include <fstream>
-#include <sstream>
+#include <iomanip>
+#include <stdexcept>
 #include <string>
-#include <limits>
-#include <algorithm>
-#include <iterator>
-#include <string>
-#include <cstdarg>
-#include "opencv2/highgui.hpp"
+#include <iostream>
+#include <cstdio>
+#include <vector>
+#include <numeric>
+#include "opencv2/core.hpp"
 #include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
 #include "opencv2/video.hpp"
-#include "opencv2/ts.hpp"
+#include "opencv2/objdetect.hpp"
+#include "opencv2/features2d.hpp"
 #include "opencv2/ocl.hpp"
 
-#include "utility.hpp"
-#include "interpolation.hpp"
+#define Min_Size 1000
+#define Max_Size 4000
+#define Multiple 2
+#define TAB "    "
+
+using namespace std;
+using namespace cv;
+
+void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high);
+string abspath(const string &relpath);
+int CV_CDECL cvErrorCallback(int, const char *, const char *, const char *, int, void *);
+typedef struct
+{
+    short x;
+    short y;
+} COOR;
+COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep,
+                  cv::Size size, int sp, int sr, int maxIter, float eps, int *tab);
+void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi,
+                    int sp, int sr, cv::TermCriteria crit);
+
+class Runnable
+{
+public:
+    explicit Runnable(const std::string &runname): name_(runname) {}
+    virtual ~Runnable() {}
+
+    const std::string &name() const
+    {
+        return name_;
+    }
+
+    virtual void run() = 0;
+
+private:
+    std::string name_;
+};
+
+class TestSystem
+{
+public:
+    static TestSystem &instance()
+    {
+        static TestSystem me;
+        return me;
+    }
+
+    void setWorkingDir(const std::string &val)
+    {
+        working_dir_ = val;
+    }
+    const std::string &workingDir() const
+    {
+        return working_dir_;
+    }
+
+    void setTestFilter(const std::string &val)
+    {
+        test_filter_ = val;
+    }
+    const std::string &testFilter() const
+    {
+        return test_filter_;
+    }
+
+    void setNumIters(int num_iters)
+    {
+        num_iters_ = num_iters;
+    }
+    void setGPUWarmupIters(int num_iters)
+    {
+        gpu_warmup_iters_ = num_iters;
+    }
+    void setCPUIters(int num_iters)
+    {
+        cpu_num_iters_ = num_iters;
+    }
+
+    void setTopThreshold(double top)
+    {
+        top_ = top;
+    }
+    void setBottomThreshold(double bottom)
+    {
+        bottom_ = bottom;
+    }
+
+    void addInit(Runnable *init)
+    {
+        inits_.push_back(init);
+    }
+    void addTest(Runnable *test)
+    {
+        tests_.push_back(test);
+    }
+    void run();
+
+    // It's public because OpenCV callback uses it
+    void printError(const std::string &msg);
+
+    std::stringstream &startNewSubtest()
+    {
+        finishCurrentSubtest();
+        return cur_subtest_description_;
+    }
+
+    bool stop() const
+    {
+        return cur_iter_idx_ >= num_iters_;
+    }
+
+    bool cpu_stop() const
+    {
+        return cur_iter_idx_ >= cpu_num_iters_;
+    }
+
+    bool warmupStop()
+    {
+        return cur_warmup_idx_++ >= gpu_warmup_iters_;
+    }
+
+    void warmupComplete()
+    {
+        cur_warmup_idx_ = 0;
+    }
+
+    void cpuOn()
+    {
+        cpu_started_ = cv::getTickCount();
+    }
+    void cpuOff()
+    {
+        int64 delta = cv::getTickCount() - cpu_started_;
+        cpu_times_.push_back(delta);
+        ++cur_iter_idx_;
+    }
+    void cpuComplete()
+    {
+        cpu_elapsed_ += meanTime(cpu_times_);
+        cur_subtest_is_empty_ = false;
+        cur_iter_idx_ = 0;
+    }
+
+    void gpuOn()
+    {
+        gpu_started_ = cv::getTickCount();
+    }
+    void gpuOff()
+    {
+        int64 delta = cv::getTickCount() - gpu_started_;
+        gpu_times_.push_back(delta);
+        ++cur_iter_idx_;
+    }
+    void gpuComplete()
+    {
+        gpu_elapsed_ += meanTime(gpu_times_);
+        cur_subtest_is_empty_ = false;
+        cur_iter_idx_ = 0;
+    }
+
+    void gpufullOn()
+    {
+        gpu_full_started_ = cv::getTickCount();
+    }
+    void gpufullOff()
+    {
+        int64 delta = cv::getTickCount() - gpu_full_started_;
+        gpu_full_times_.push_back(delta);
+        ++cur_iter_idx_;
+    }
+    void gpufullComplete()
+    {
+        gpu_full_elapsed_ += meanTime(gpu_full_times_);
+        cur_subtest_is_empty_ = false;
+        cur_iter_idx_ = 0;
+    }
+
+    bool isListMode() const
+    {
+        return is_list_mode_;
+    }
+    void setListMode(bool value)
+    {
+        is_list_mode_ = value;
+    }
+
+    void setRecordName(const std::string &name)
+    {
+        recordname_ = name;
+    }
+
+    void setCurrentTest(const std::string &name)
+    {
+        itname_ = name;
+        itname_changed_ = true;
+    }
+
+private:
+    TestSystem():
+        cur_subtest_is_empty_(true), cpu_elapsed_(0),
+        gpu_elapsed_(0), gpu_full_elapsed_(0), speedup_total_(0.0),
+        num_subtests_called_(0),
+        speedup_faster_count_(0), speedup_slower_count_(0), speedup_equal_count_(0),
+        speedup_full_faster_count_(0), speedup_full_slower_count_(0), speedup_full_equal_count_(0), is_list_mode_(false),
+        num_iters_(10), cpu_num_iters_(2),
+        gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
+        record_(0), recordname_("performance"), itname_changed_(true)
+    {
+        cpu_times_.reserve(num_iters_);
+        gpu_times_.reserve(num_iters_);
+        gpu_full_times_.reserve(num_iters_);
+    }
+
+    void finishCurrentSubtest();
+    void resetCurrentSubtest()
+    {
+        cpu_elapsed_ = 0;
+        gpu_elapsed_ = 0;
+        gpu_full_elapsed_ = 0;
+        cur_subtest_description_.str("");
+        cur_subtest_is_empty_ = true;
+        cur_iter_idx_ = 0;
+        cpu_times_.clear();
+        gpu_times_.clear();
+        gpu_full_times_.clear();
+    }
+
+    double meanTime(const std::vector<int64> &samples);
+
+    void printHeading();
+    void printSummary();
+    void printMetrics(double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f, double speedup = 0.0f, double fullspeedup = 0.0f);
+
+    void writeHeading();
+    void writeSummary();
+    void writeMetrics(double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f,
+                      double speedup = 0.0f, double fullspeedup = 0.0f,
+                      double gpu_min = 0.0f, double gpu_max = 0.0f, double std_dev = 0.0f);
+
+    std::string working_dir_;
+    std::string test_filter_;
+
+    std::vector<Runnable *> inits_;
+    std::vector<Runnable *> tests_;
+
+    std::stringstream cur_subtest_description_;
+    bool cur_subtest_is_empty_;
+
+    int64 cpu_started_;
+    int64 gpu_started_;
+    int64 gpu_full_started_;
+    double cpu_elapsed_;
+    double gpu_elapsed_;
+    double gpu_full_elapsed_;
+
+    double speedup_total_;
+    double speedup_full_total_;
+    int num_subtests_called_;
+
+    int speedup_faster_count_;
+    int speedup_slower_count_;
+    int speedup_equal_count_;
+
+    int speedup_full_faster_count_;
+    int speedup_full_slower_count_;
+    int speedup_full_equal_count_;
+
+    bool is_list_mode_;
+
+    double top_;
+    double bottom_;
+
+    int num_iters_;
+    int cpu_num_iters_;     //there's no need to set cpu running same times with gpu
+    int gpu_warmup_iters_;  //gpu warm up times, default is 1
+    int cur_iter_idx_;
+    int cur_warmup_idx_;    //current gpu warm up times
+    std::vector<int64> cpu_times_;
+    std::vector<int64> gpu_times_;
+    std::vector<int64> gpu_full_times_;
+
+    FILE *record_;
+    std::string recordname_;
+    std::string itname_;
+    bool itname_changed_;
+};
+
+
+#define GLOBAL_INIT(name) \
+struct name##_init: Runnable { \
+    name##_init(): Runnable(#name) { \
+    TestSystem::instance().addInit(this); \
+} \
+    void run(); \
+} name##_init_instance; \
+    void name##_init::run()
+
+
+#define TEST(name) \
+struct name##_test: Runnable { \
+    name##_test(): Runnable(#name) { \
+    TestSystem::instance().addTest(this); \
+} \
+    void run(); \
+} name##_test_instance; \
+    void name##_test::run()
+
+#define SUBTEST TestSystem::instance().startNewSubtest()
+
+#define CPU_ON \
+    while (!TestSystem::instance().cpu_stop()) { \
+    TestSystem::instance().cpuOn()
+#define CPU_OFF \
+    TestSystem::instance().cpuOff(); \
+    } TestSystem::instance().cpuComplete()
 
-#include "opencv2/core/private.hpp"
+#define GPU_ON \
+    while (!TestSystem::instance().stop()) { \
+    TestSystem::instance().gpuOn()
+#define GPU_OFF \
+    ocl::finish(); \
+    TestSystem::instance().gpuOff(); \
+    } TestSystem::instance().gpuComplete()
 
-#endif
+#define GPU_FULL_ON \
+    while (!TestSystem::instance().stop()) { \
+    TestSystem::instance().gpufullOn()
+#define GPU_FULL_OFF \
+    TestSystem::instance().gpufullOff(); \
+    } TestSystem::instance().gpufullComplete()
 
+#define WARMUP_ON \
+    while (!TestSystem::instance().warmupStop()) {
+#define WARMUP_OFF \
+        ocl::finish(); \
+    } TestSystem::instance().warmupComplete()
diff --git a/modules/ocl/perf/utility.cpp b/modules/ocl/perf/utility.cpp
deleted file mode 100644
index b7fbe4fa0d..0000000000
--- a/modules/ocl/perf/utility.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#define VARNAME(A) #A
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-using namespace cvtest;
-
-
-//std::string generateVarList(int first,...)
-//{
-//	vector<std::string> varname;
-//
-//	va_list argp;
-//	string s;
-//	stringstream ss;
-//	va_start(argp,first);
-//	int i=first;
-//	while(i!=-1)
-//	{
-//		ss<<i<<",";
-//		i=va_arg(argp,int);
-//	};
-//	s=ss.str();
-//	va_end(argp);
-//	return s;
-//};
-
-//std::string generateVarList(int& p1,int& p2)
-//{
-//	stringstream ss;
-//	ss<<VARNAME(p1)<<":"<<src1x<<","<<VARNAME(p2)<<":"<<src1y;
-//	return ss.str();
-//};
-
-int randomInt(int minVal, int maxVal)
-{
-    RNG &rng = TS::ptr()->get_rng();
-    return rng.uniform(minVal, maxVal);
-}
-
-double randomDouble(double minVal, double maxVal)
-{
-    RNG &rng = TS::ptr()->get_rng();
-    return rng.uniform(minVal, maxVal);
-}
-
-Size randomSize(int minVal, int maxVal)
-{
-    return cv::Size(randomInt(minVal, maxVal), randomInt(minVal, maxVal));
-}
-
-Scalar randomScalar(double minVal, double maxVal)
-{
-    return Scalar(randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal));
-}
-
-Mat randomMat(Size size, int type, double minVal, double maxVal)
-{
-    return randomMat(TS::ptr()->get_rng(), size, type, minVal, maxVal, false);
-}
-
-
-
-
-
-
-
-/*
-void showDiff(InputArray gold_, InputArray actual_, double eps)
-{
-    Mat gold;
-    if (gold_.kind() == _InputArray::MAT)
-        gold = gold_.getMat();
-    else
-        gold_.getGpuMat().download(gold);
-
-    Mat actual;
-    if (actual_.kind() == _InputArray::MAT)
-        actual = actual_.getMat();
-    else
-        actual_.getGpuMat().download(actual);
-
-    Mat diff;
-    absdiff(gold, actual, diff);
-    threshold(diff, diff, eps, 255.0, cv::THRESH_BINARY);
-
-    namedWindow("gold", WINDOW_NORMAL);
-    namedWindow("actual", WINDOW_NORMAL);
-    namedWindow("diff", WINDOW_NORMAL);
-
-    imshow("gold", gold);
-    imshow("actual", actual);
-    imshow("diff", diff);
-
-    waitKey();
-}
-*/
-
-/*
-bool supportFeature(const DeviceInfo& info, FeatureSet feature)
-{
-    return TargetArchs::builtWith(feature) && info.supports(feature);
-}
-
-const vector<DeviceInfo>& devices()
-{
-    static vector<DeviceInfo> devs;
-    static bool first = true;
-
-    if (first)
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
-
-        devs.reserve(deviceCount);
-
-        for (int i = 0; i < deviceCount; ++i)
-        {
-            DeviceInfo info(i);
-            if (info.isCompatible())
-                devs.push_back(info);
-        }
-
-        first = false;
-    }
-
-    return devs;
-}
-
-vector<DeviceInfo> devices(FeatureSet feature)
-{
-    const vector<DeviceInfo>& d = devices();
-
-    vector<DeviceInfo> devs_filtered;
-
-    if (TargetArchs::builtWith(feature))
-    {
-        devs_filtered.reserve(d.size());
-
-        for (size_t i = 0, size = d.size(); i < size; ++i)
-        {
-            const DeviceInfo& info = d[i];
-
-            if (info.supports(feature))
-                devs_filtered.push_back(info);
-        }
-    }
-
-    return devs_filtered;
-}
-*/
-
-vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end)
-{
-    vector<MatType> v;
-
-    v.reserve((depth_end - depth_start + 1) * (cn_end - cn_start + 1));
-
-    for (int depth = depth_start; depth <= depth_end; ++depth)
-    {
-        for (int cn = cn_start; cn <= cn_end; ++cn)
-        {
-            v.push_back(CV_MAKETYPE(depth, cn));
-        }
-    }
-
-    return v;
-}
-
-const vector<MatType> &all_types()
-{
-    static vector<MatType> v = types(CV_8U, CV_64F, 1, 4);
-
-    return v;
-}
-
-Mat readImage(const string &fileName, int flags)
-{
-    return imread(string(cvtest::TS::ptr()->get_data_path()) + fileName, flags);
-}
-
-Mat readImageType(const string &fname, int type)
-{
-    Mat src = readImage(fname, CV_MAT_CN(type) == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
-    if (CV_MAT_CN(type) == 4)
-    {
-        Mat temp;
-        cvtColor(src, temp, cv::COLOR_BGR2BGRA);
-        swap(src, temp);
-    }
-    src.convertTo(src, CV_MAT_DEPTH(type));
-    return src;
-}
-
-double checkNorm(const Mat &m)
-{
-    return norm(m, NORM_INF);
-}
-
-double checkNorm(const Mat &m1, const Mat &m2)
-{
-    return norm(m1, m2, NORM_INF);
-}
-
-double checkSimilarity(const Mat &m1, const Mat &m2)
-{
-    Mat diff;
-    matchTemplate(m1, m2, diff, CV_TM_CCORR_NORMED);
-    return std::abs(diff.at<float>(0, 0) - 1.f);
-}
-
-/*
-void cv::ocl::PrintTo(const DeviceInfo& info, ostream* os)
-{
-    (*os) << info.name();
-}
-*/
-
-void PrintTo(const Inverse &inverse, std::ostream *os)
-{
-    if (inverse)
-        (*os) << "inverse";
-    else
-        (*os) << "direct";
-}
diff --git a/modules/ocl/perf/utility.hpp b/modules/ocl/perf/utility.hpp
deleted file mode 100644
index 7d34b6731a..0000000000
--- a/modules/ocl/perf/utility.hpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_TEST_UTILITY_HPP__
-#define __OPENCV_TEST_UTILITY_HPP__
-//#define PRINT_KERNEL_RUN_TIME
-#ifdef PRINT_KERNEL_RUN_TIME
-#define LOOP_TIMES 1
-#else
-#define LOOP_TIMES 1
-#endif
-#define MWIDTH 1920
-#define MHEIGHT 1080
-#define CLBINPATH ".\\"
-#define LOOPROISTART 0
-#define LOOPROIEND 1
-int randomInt(int minVal, int maxVal);
-double randomDouble(double minVal, double maxVal);
-
-//std::string generateVarList(int first,...);
-std::string generateVarList(int &p1, int &p2);
-cv::Size randomSize(int minVal, int maxVal);
-cv::Scalar randomScalar(double minVal, double maxVal);
-cv::Mat randomMat(cv::Size size, int type, double minVal = 0.0, double maxVal = 255.0);
-
-void showDiff(cv::InputArray gold, cv::InputArray actual, double eps);
-
-//! return true if device supports specified feature and gpu module was built with support the feature.
-//bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);
-
-//! return all devices compatible with current gpu module build.
-//const std::vector<cv::ocl::DeviceInfo>& devices();
-//! return all devices compatible with current gpu module build which support specified feature.
-//std::vector<cv::ocl::DeviceInfo> devices(cv::gpu::FeatureSet feature);
-
-//! read image from testdata folder.
-cv::Mat readImage(const std::string &fileName, int flags = cv::IMREAD_COLOR);
-cv::Mat readImageType(const std::string &fname, int type);
-
-double checkNorm(const cv::Mat &m);
-double checkNorm(const cv::Mat &m1, const cv::Mat &m2);
-double checkSimilarity(const cv::Mat &m1, const cv::Mat &m2);
-
-#define EXPECT_MAT_NORM(mat, eps) \
-{ \
-    EXPECT_LE(checkNorm(cv::Mat(mat)), eps) \
-}
-
-/*#define EXPECT_MAT_NEAR(mat1, mat2, eps) \
-{ \
-   ASSERT_EQ(mat1.type(), mat2.type()); \
-   ASSERT_EQ(mat1.size(), mat2.size()); \
-   EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps); \
-}*/
-
-#define EXPECT_MAT_NEAR(mat1, mat2, eps,s) \
-{ \
-    ASSERT_EQ(mat1.type(), mat2.type()); \
-    ASSERT_EQ(mat1.size(), mat2.size()); \
-    EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps)<<s; \
-}
-
-#define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
-{ \
-    ASSERT_EQ(mat1.type(), mat2.type()); \
-    ASSERT_EQ(mat1.size(), mat2.size()); \
-    EXPECT_LE(checkSimilarity(cv::Mat(mat1), cv::Mat(mat2)), eps); \
-}
-
-namespace cv
-{
-    namespace ocl
-    {
-        // void PrintTo(const DeviceInfo& info, std::ostream* os);
-    }
-}
-
-using perf::MatDepth;
-using perf::MatType;
-
-//! return vector with types from specified range.
-std::vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end);
-
-//! return vector with all types (depth: CV_8U-CV_64F, channels: 1-4).
-const std::vector<MatType> &all_types();
-
-class Inverse
-{
-public:
-    inline Inverse(bool val = false) : val_(val) {}
-
-    inline operator bool() const
-    {
-        return val_;
-    }
-
-private:
-    bool val_;
-};
-
-void PrintTo(const Inverse &useRoi, std::ostream *os);
-
-CV_ENUM(CmpCode, cv::CMP_EQ, cv::CMP_GT, cv::CMP_GE, cv::CMP_LT, cv::CMP_LE, cv::CMP_NE)
-
-CV_ENUM(NormCode, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_TYPE_MASK, cv::NORM_RELATIVE, cv::NORM_MINMAX)
-
-enum {FLIP_BOTH = 0, FLIP_X = 1, FLIP_Y = -1};
-CV_ENUM(FlipCode, FLIP_BOTH, FLIP_X, FLIP_Y)
-
-CV_ENUM(ReduceOp, CV_REDUCE_SUM, CV_REDUCE_AVG, CV_REDUCE_MAX, CV_REDUCE_MIN)
-
-CV_FLAGS(GemmFlags, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T);
-
-CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
-
-CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
-
-CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC)
-
-CV_ENUM(Border, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
-
-CV_FLAGS(WarpFlags, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::WARP_INVERSE_MAP)
-
-CV_ENUM(TemplateMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::TM_CCORR_NORMED, cv::TM_CCOEFF, cv::TM_CCOEFF_NORMED)
-
-CV_FLAGS(DftFlags, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
-
-void  run_perf_test();
-
-#define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
-
-#define GET_PARAM(k) std::tr1::get< k >(GetParam())
-
-#define ALL_DEVICES testing::ValuesIn(devices())
-#define DEVICES(feature) testing::ValuesIn(devices(feature))
-
-#define ALL_TYPES testing::ValuesIn(all_types())
-#define TYPES(depth_start, depth_end, cn_start, cn_end) testing::ValuesIn(types(depth_start, depth_end, cn_start, cn_end))
-
-#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113))
-
-#define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true))
-
-#endif // __OPENCV_TEST_UTILITY_HPP__
diff --git a/modules/ocl/src/fft.cpp b/modules/ocl/src/fft.cpp
index d84d01a5bf..d95d15f00c 100644
--- a/modules/ocl/src/fft.cpp
+++ b/modules/ocl/src/fft.cpp
@@ -205,7 +205,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
     clStridesIn[2]  = is_row_dft ? clStridesIn[1]  : dft_size.width * clStridesIn[1];
     clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1];
 
-    openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, (cl_context)getoclContext(), dim, clLengthsIn ) );
+    openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, *(cl_context*)getoclContext(), dim, clLengthsIn ) );
 
     openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) );
     openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) );
@@ -219,8 +219,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
     openCLSafeCall( clAmdFftSetPlanScale  ( plHandle, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale_ ) );
 
     //ready to bake
-    cl_command_queue clq = (cl_command_queue)getoclCommandQueue();
-    openCLSafeCall( clAmdFftBakePlan( plHandle, 1, &clq, NULL, NULL ) );
+    openCLSafeCall( clAmdFftBakePlan( plHandle, 1, (cl_command_queue*)getoclCommandQueue(), NULL, NULL ) );
 }
 cv::ocl::FftPlan::~FftPlan()
 {
diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp
index c89a3a2184..fb528e03cd 100644
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -351,6 +351,11 @@ namespace cv
             return &(Context::getContext()->impl->clCmdQueue);
         }
 
+        void finish()
+        {
+            clFinish(Context::getContext()->impl->clCmdQueue);
+        }
+
         void queryDeviceInfo(DEVICE_INFO info_type, void* info)
         {
             static Info::Impl* impl = Context::getContext()->impl;
@@ -709,7 +714,7 @@ namespace cv
             clReleaseEvent(event);
 #endif
 
-            clFinish(clCxt->impl->clCmdQueue);
+            clFlush(clCxt->impl->clCmdQueue);
             openCLSafeCall(clReleaseKernel(kernel));
         }
 
@@ -905,16 +910,18 @@ namespace cv
         std::auto_ptr<Context> Context::clCxt;
         int Context::val = 0;
         static Mutex cs;
-        Context *Context::getContext()
+        static volatile int context_tear_down = 0;
+        Context* Context::getContext()
         {
             if(*((volatile int*)&val) != 1)
             {
                 AutoLock al(cs);
                 if(*((volatile int*)&val) != 1)
                 {
+                    if (context_tear_down)
+                        return clCxt.get();
                     if( 0 == clCxt.get())
                     clCxt.reset(new Context);
-
                     std::vector<Info> oclinfo;
                     CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
                     oclinfo[0].impl->setDevice(0, 0, 0);
@@ -1042,9 +1049,14 @@ BOOL WINAPI DllMain( HINSTANCE, DWORD  fdwReason, LPVOID )
     {
         // application hangs if call clReleaseCommandQueue here, so release context only
         // without context release application hangs as well
-        cl_context ctx = (cl_context)getoclContext();
-        if(ctx)
-            openCLSafeCall(clReleaseContext(ctx));
+        context_tear_down = 1;
+        Context* cv_ctx = Context::getContext();
+        if(cv_ctx)
+        {
+            cl_context ctx = (cl_context)&(cv_ctx->impl->oclcontext);
+            if(ctx)
+                openCLSafeCall(clReleaseContext(ctx));
+        }
     }
     return TRUE;
 }
diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp
index 9f104f151f..118df52677 100644
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -142,7 +142,7 @@ namespace cv
                 format.image_channel_data_type = CL_FLOAT;
                 break;
             default:
-                throw std::exception();
+                CV_Error(-1, "Image forma is not supported");
                 break;
             }
             switch(channels)
@@ -157,7 +157,7 @@ namespace cv
                 format.image_channel_order     = CL_RGBA;
                 break;
             default:
-                throw std::exception();
+                CV_Error(-1, "Image forma is not supported");
                 break;
             }
 #if CL_VERSION_1_2
@@ -195,7 +195,8 @@ namespace cv
                 const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
                 clEnqueueCopyBufferRect((cl_command_queue)mat.clCxt->oclCommandQueue(), (cl_mem)mat.data, devData, origin, origin,
                     regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
-            }
+                clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
+           }
             else
             {
                 devData = (cl_mem)mat.data;
@@ -204,7 +205,7 @@ namespace cv
             clEnqueueCopyBufferToImage((cl_command_queue)mat.clCxt->oclCommandQueue(), devData, texture, 0, origin, region, 0, NULL, 0);
             if ((mat.cols * mat.elemSize() != mat.step))
             {
-                clFinish((cl_command_queue)mat.clCxt->oclCommandQueue());
+                clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
                 clReleaseMemObject(devData);
             }
 
@@ -229,7 +230,8 @@ namespace cv
             try
             {
                 cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func");
-                //_support = true;
+                finish();
+                _support = true;
             }
             catch (const cv::Exception& e)
             {
diff --git a/modules/ocl/src/opencl/arithm_absdiff.cl b/modules/ocl/src/opencl/arithm_absdiff.cl
index 37f154216a..341a0048ff 100644
--- a/modules/ocl/src/opencl/arithm_absdiff.cl
+++ b/modules/ocl/src/opencl/arithm_absdiff.cl
@@ -44,7 +44,11 @@
 //M*/
 
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -63,6 +67,9 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
     {
         x = x << 2;
 
+#ifdef dst_align
+#undef dst_align
+#endif
         #define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -111,7 +118,10 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -145,7 +155,10 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -249,7 +262,10 @@ __kernel void arithm_s_absdiff_C1_D0 (__global   uchar *src1, int src1_step, int
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -288,7 +304,10 @@ __kernel void arithm_s_absdiff_C1_D2 (__global   ushort *src1, int src1_step, in
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -319,7 +338,10 @@ __kernel void arithm_s_absdiff_C1_D3 (__global   short *src1, int src1_step, int
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -387,8 +409,8 @@ __kernel void arithm_s_absdiff_C1_D5 (__global   float *src1, int src1_step, int
 
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_s_absdiff_C1_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                     __global   double *dst,  int dst_step,  int dst_offset,
-                                     double4 src2, int rows, int cols, int dst_step1)
+                                      __global   double *dst,  int dst_step,  int dst_offset,
+                                      double4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -422,7 +444,10 @@ __kernel void arithm_s_absdiff_C2_D0 (__global   uchar *src1, int src1_step, int
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -465,7 +490,7 @@ __kernel void arithm_s_absdiff_C2_D2 (__global   ushort *src1, int src1_step, in
 }
 __kernel void arithm_s_absdiff_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
                                       __global   short *dst,  int dst_step,  int dst_offset,
-                                     int4 src2, int rows, int cols, int dst_step1)
+                                      int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -509,7 +534,7 @@ __kernel void arithm_s_absdiff_C2_D4 (__global   int *src1, int src1_step, int s
 }
 __kernel void arithm_s_absdiff_C2_D5 (__global   float *src1, int src1_step, int src1_offset,
                                       __global   float *dst,  int dst_step,  int dst_offset,
-                                     float4 src2, int rows, int cols, int dst_step1)
+                                      float4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -564,7 +589,10 @@ __kernel void arithm_s_absdiff_C3_D0 (__global   uchar *src1, int src1_step, int
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -618,7 +646,10 @@ __kernel void arithm_s_absdiff_C3_D2 (__global   ushort *src1, int src1_step, in
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -644,16 +675,16 @@ __kernel void arithm_s_absdiff_C3_D2 (__global   ushort *src1, int src1_step, in
         data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 __kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
@@ -668,7 +699,10 @@ __kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -694,16 +728,16 @@ __kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int
         data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 __kernel void arithm_s_absdiff_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
@@ -735,9 +769,9 @@ __kernel void arithm_s_absdiff_C3_D4 (__global   int *src1, int src1_step, int s
         int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1));
         int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2));
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
     }
 }
 __kernel void arithm_s_absdiff_C3_D5 (__global   float *src1, int src1_step, int src1_offset,
@@ -769,9 +803,9 @@ __kernel void arithm_s_absdiff_C3_D5 (__global   float *src1, int src1_step, int
         float tmp_data_1 = fabs(src1_data_1 - src2_data_1);
         float tmp_data_2 = fabs(src1_data_2 - src2_data_2);
 
-       *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
     }
 }
 
@@ -805,9 +839,9 @@ __kernel void arithm_s_absdiff_C3_D6 (__global   double *src1, int src1_step, in
         double tmp_data_1 = fabs(src1_data_1 - src2_data_1);
         double tmp_data_2 = fabs(src1_data_2 - src2_data_2);
 
-       *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
     }
 }
 #endif
diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl
index 789a42444c..647171578d 100644
--- a/modules/ocl/src/opencl/arithm_add.cl
+++ b/modules/ocl/src/opencl/arithm_add.cl
@@ -45,7 +45,11 @@
 //M*/
 
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -754,16 +785,16 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
@@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -810,16 +844,16 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 __kernel void arithm_add_with_mask_C3_D4 (__global int   *src1, int src1_step, int src1_offset,
@@ -861,9 +895,9 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int   *src1, int src1_step, i
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
@@ -905,9 +939,9 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 
@@ -951,9 +985,9 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
     }
 }
 #endif
diff --git a/modules/ocl/src/opencl/arithm_addWeighted.cl b/modules/ocl/src/opencl/arithm_addWeighted.cl
index d76f994aa0..e7ed289281 100644
--- a/modules/ocl/src/opencl/arithm_addWeighted.cl
+++ b/modules/ocl/src/opencl/arithm_addWeighted.cl
@@ -42,8 +42,12 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined DOUBLE_SUPPORT
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 typedef double F;
 #else
 typedef float F;
@@ -52,10 +56,10 @@ typedef float F;
 /////////////////////////////////////////////addWeighted//////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
-                           __global uchar *src2, int src2_step,int src2_offset,
-                           F alpha,F beta,F gama,
-                           __global uchar *dst,  int dst_step,int dst_offset,
-                           int rows,  int cols,int dst_step1)
+                              __global uchar *src2, int src2_step,int src2_offset,
+                              F alpha,F beta,F gama,
+                              __global uchar *dst,  int dst_step,int dst_offset,
+                              int rows,  int cols,int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
     {
 
         x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -87,7 +94,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
 
         uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
 //        short4 tmp      = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
-         short4 tmp;
+        short4 tmp;
         tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
         tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
         tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -100,7 +107,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
         dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
 
         *((__global uchar4 *)(dst + dst_index)) = dst_data;
-       // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
+        // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
     }
 
 }
@@ -108,10 +115,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
 
 
 __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
-                           __global ushort *src2, int src2_step,int src2_offset,
-                           F alpha,F beta,F gama,
-                           __global ushort *dst,  int dst_step,int dst_offset,
-                           int rows,  int cols,int dst_step1)
+                              __global ushort *src2, int src2_step,int src2_offset,
+                              F alpha,F beta,F gama,
+                              __global ushort *dst,  int dst_step,int dst_offset,
+                              int rows,  int cols,int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -122,34 +129,37 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
 
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
         ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
-    if(src1_index < 0)
-    {
-        ushort4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-    if(src2_index < 0)
-    {
-        ushort4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
 
 
         ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-       // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-         int4 tmp;
+        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
+        int4 tmp;
         tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
         tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
         tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -182,7 +192,10 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
 
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
 
@@ -190,26 +203,26 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
 
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
         short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
 
-    if(src1_index < 0)
-    {
-        short4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-    if(src2_index < 0)
-    {
-        short4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
         short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-       // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-         int4 tmp;
+        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
+        int4 tmp;
         tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
         tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
         tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -228,7 +241,7 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
 
 __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
                               __global int *src2, int src2_step,int src2_offset,
-                             F alpha,F beta, F gama,
+                              F alpha,F beta, F gama,
                               __global int *dst,  int dst_step,int dst_offset,
                               int rows,  int cols,int dst_step1)
 {
@@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
 
         x = x << 2;
 
-        #define bitOfInt  (sizeof(int)== 4 ? 2: 3)
+#define bitOfInt  (sizeof(int)== 4 ? 2: 3)
 
-        #define dst_align ((dst_offset >> bitOfInt) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> bitOfInt) & 3)
 
         int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
         int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
@@ -252,26 +268,26 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
 
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
         int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
 
-    if(src1_index < 0)
-    {
-        int4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-    if(src2_index < 0)
-    {
-        int4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
+        if(src1_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
         int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
-       // double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-         float4 tmp;
+        // double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
+        float4 tmp;
         tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
         tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
         tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -291,7 +307,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
 
 __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
                               __global float *src2, int src2_step,int src2_offset,
-                             F alpha,F beta, F gama,
+                              F alpha,F beta, F gama,
                               __global float *dst,  int dst_step,int dst_offset,
                               int rows,  int cols,int dst_step1)
 {
@@ -304,7 +320,10 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
 
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
 
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -313,32 +332,32 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
 
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         float4 src1_data = vload4(0, (__global float  *)((__global char *)src1 + src1_index_fix));
         float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
         float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
-    if(src1_index < 0)
-    {
-        float4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-    if(src2_index < 0)
-    {
-        float4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
-    //    double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-
-       // float4   tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
-         float4 tmp_data;
+        if(src1_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+        //    double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
+
+        // float4   tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
+        float4 tmp_data;
         tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
         tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
         tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
         tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
-       // float4 tmp_data = convert_float4(tmp);
+        // float4 tmp_data = convert_float4(tmp);
 
         dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
         dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
@@ -353,7 +372,7 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
 #if defined (DOUBLE_SUPPORT)
 __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
                               __global double *src2, int src2_step,int src2_offset,
-                             F alpha,F beta, F gama,
+                              F alpha,F beta, F gama,
                               __global double *dst,  int dst_step,int dst_offset,
                               int rows,  int cols,int dst_step1)
 {
@@ -366,7 +385,10 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
 
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)
 
         int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
         int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@@ -375,25 +397,25 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
 
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         double4 src1_data = vload4(0, (__global double  *)((__global char *)src1 + src1_index_fix));
         double4 src2_data = vload4(0, (__global double  *)((__global char *)src2 + src2_index_fix));
         double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
-    if(src1_index < 0)
-    {
-        double4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-    if(src2_index < 0)
-    {
-        double4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
-      //  double4   tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
-         double4 tmp_data;
+        if(src1_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+        //  double4   tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
+        double4 tmp_data;
         tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
         tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
         tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl
index 05b813dc8c..3862440978 100644
--- a/modules/ocl/src/opencl/arithm_add_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar.cl
@@ -44,9 +44,13 @@
 //M*/
 
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif
 
+#endif
 /**************************************add with scalar without mask**************************************/
 __kernel void arithm_s_add_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
                                   __global   uchar *dst,  int dst_step,  int dst_offset,
@@ -59,7 +63,10 @@ __kernel void arithm_s_add_C1_D0 (__global   uchar *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -99,7 +106,10 @@ __kernel void arithm_s_add_C1_D2 (__global   ushort *src1, int src1_step, int sr
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -131,7 +141,10 @@ __kernel void arithm_s_add_C1_D3 (__global   short *src1, int src1_step, int src
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -233,7 +246,10 @@ __kernel void arithm_s_add_C2_D0 (__global   uchar *src1, int src1_step, int src
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -378,7 +394,10 @@ __kernel void arithm_s_add_C3_D0 (__global   uchar *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -432,7 +451,10 @@ __kernel void arithm_s_add_C3_D2 (__global   ushort *src1, int src1_step, int sr
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -458,16 +480,16 @@ __kernel void arithm_s_add_C3_D2 (__global   ushort *src1, int src1_step, int sr
         data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 __kernel void arithm_s_add_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
@@ -482,7 +504,10 @@ __kernel void arithm_s_add_C3_D3 (__global   short *src1, int src1_step, int src
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -508,16 +533,16 @@ __kernel void arithm_s_add_C3_D3 (__global   short *src1, int src1_step, int src
         data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 __kernel void arithm_s_add_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
@@ -549,9 +574,9 @@ __kernel void arithm_s_add_C3_D4 (__global   int *src1, int src1_step, int src1_
         int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
         int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
     }
 }
 __kernel void arithm_s_add_C3_D5 (__global   float *src1, int src1_step, int src1_offset,
@@ -583,9 +608,9 @@ __kernel void arithm_s_add_C3_D5 (__global   float *src1, int src1_step, int src
         float tmp_data_1 = src1_data_1 + src2_data_1;
         float tmp_data_2 = src1_data_2 + src2_data_2;
 
-       *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
     }
 }
 
@@ -619,9 +644,9 @@ __kernel void arithm_s_add_C3_D6 (__global   double *src1, int src1_step, int sr
         double tmp_data_1 = src1_data_1 + src2_data_1;
         double tmp_data_2 = src1_data_2 + src2_data_2;
 
-       *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
     }
 }
 #endif
diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
index 4acb5be6a2..6087a9d807 100644
--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
@@ -44,7 +44,11 @@
 //M*/
 
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
 /**************************************add with scalar with mask**************************************/
@@ -61,7 +65,10 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global   uchar *src1, int src1_ste
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -111,7 +118,10 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global   ushort *src1, int src1_st
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -146,7 +156,10 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global   short *src1, int src1_ste
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -267,7 +280,10 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global   uchar *src1, int src1_ste
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -443,7 +459,10 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global   uchar *src1, int src1_ste
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -501,7 +520,10 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global   ushort *src1, int src1_st
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -530,16 +552,16 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global   ushort *src1, int src1_st
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 __kernel void arithm_s_add_with_mask_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
@@ -555,7 +577,10 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global   short *src1, int src1_ste
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -584,16 +609,16 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global   short *src1, int src1_ste
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 __kernel void arithm_s_add_with_mask_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
@@ -633,9 +658,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global   int *src1, int src1_step,
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 __kernel void arithm_s_add_with_mask_C3_D5 (__global   float *src1, int src1_step, int src1_offset,
@@ -675,9 +700,9 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global   float *src1, int src1_ste
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 
@@ -719,9 +744,9 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global   double *src1, int src1_st
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
     }
 }
 #endif
diff --git a/modules/ocl/src/opencl/arithm_bitwise_and.cl b/modules/ocl/src/opencl/arithm_bitwise_and.cl
index 8adc56de5f..f666e0cfb0 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_and.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and.cl
@@ -43,7 +43,11 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -51,9 +55,9 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_and without mask**************************************/
 __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global uchar *src2, int src2_step, int src2_offset,
+                                     __global uchar *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -62,30 +66,33 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-     uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-     uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-
-     if(src1_index < 0)
-     {
-        uchar4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        uchar4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
 
         uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
         uchar4 tmp_data = src1_data & src2_data;
@@ -101,9 +108,9 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
 
 
 __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -112,7 +119,10 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -120,23 +130,23 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
 
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-     char4 src1_data = vload4(0, src1 + src1_index_fix);
-     char4 src2_data = vload4(0, src2 + src2_index_fix);
-
-     if(src1_index < 0)
-     {
-        char4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        char4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        char4 src1_data = vload4(0, src1 + src1_index_fix);
+        char4 src2_data = vload4(0, src2 + src2_index_fix);
+
+        if(src1_index < 0)
+        {
+            char4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            char4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
         char4 dst_data = *((__global char4 *)(dst + dst_index));
         char4 tmp_data = src1_data & src2_data;
 
@@ -151,9 +161,9 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
 
 
 __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global ushort *src2, int src2_step, int src2_offset,
+                                     __global ushort *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -163,7 +173,10 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -171,23 +184,23 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
 
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
         ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
 
-     if(src1_index < 0)
-     {
-        ushort4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        ushort4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
         ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
         ushort4 tmp_data = src1_data & src2_data;
 
@@ -203,9 +216,9 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
 
 
 __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global short *src2, int src2_step, int src2_offset,
+                                     __global short *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -215,7 +228,10 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -223,23 +239,23 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
 
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
         short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
 
-     if(src1_index < 0)
-     {
-        short4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        short4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
         short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
         short4 tmp_data = src1_data & src2_data;
 
@@ -255,9 +271,9 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
 
 
 __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global int *src2, int src2_step, int src2_offset,
+                                     __global int *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -277,9 +293,9 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1
 }
 
 __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -300,9 +316,9 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src
 
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
index 595fb2ceb7..1382aa5478 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
@@ -43,18 +43,22 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_and with mask**************************************/
-__kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -64,7 +68,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -91,11 +98,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1
 
 
 
-__kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -105,7 +113,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -132,11 +143,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_
 
 
 
-__kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -146,7 +158,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -171,11 +186,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src
 
 
 
-__kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -185,7 +201,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -198,8 +217,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
         short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
         uchar2  mask_data = vload2(0, mask + mask_index);
 
-    short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-    short2 tmp_data = src1_data & src2_data;
+        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
+        short2 tmp_data = src1_data & src2_data;
 
         data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
         data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -210,11 +229,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
 
 
 
-__kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -242,11 +262,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int   *src1, int src1
 
 
 
-__kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -274,12 +295,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_
 
 
 
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -305,15 +326,15 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_
     }
 
 }
-#endif
 
 
 
-__kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -323,7 +344,10 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -347,11 +371,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1
 }
 
 
-__kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -361,7 +386,10 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -384,11 +412,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_
     }
 }
 
-__kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -413,11 +442,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src
         *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -442,11 +472,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1
         *((__global short2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int    *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int    *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -471,11 +502,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int   *src1, int src1
         *((__global int2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -500,12 +532,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_
         *((__global char8 *)((__global char *)dst + dst_index)) = data;
     }
 }
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+
+__kernel void arithm_bitwise_and_with_mask_C2_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -530,15 +563,15 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_
         *((__global char16 *)((__global char *)dst + dst_index)) = data;
     }
 }
-#endif
 
 
 
-__kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -548,7 +581,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -596,11 +632,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
 }
 
 
-__kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -610,7 +647,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -657,11 +697,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
     }
 }
 
-__kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -671,7 +712,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -701,23 +745,24 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -727,7 +772,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -757,23 +805,24 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -808,16 +857,17 @@ __kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int   *src1, int src1
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -852,17 +902,18 @@ __kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -897,20 +948,20 @@ __kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
     }
 }
 #endif
 
 
-
-__kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -937,11 +988,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1
 }
 
 
-__kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_
     }
 }
 
-__kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src
         *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1
         *((__global short4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int   *src1, int src1
         *((__global int4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
-                                                  __global char *src2, int src2_step, int src2_offset,
-                                                  __global uchar  *mask, int mask_step, int mask_offset,
-                                                  __global char *dst,  int dst_step,  int dst_offset,
-                                                  int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
index a5152ce0bf..ce1ae39f6a 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
@@ -42,19 +42,22 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //
-#if defined (__ATI__)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (__NVIDIA__)
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************and with scalar without mask**************************************/
-__kernel void arithm_s_bitwise_and_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -63,7 +66,10 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global   uchar *src1, int src1_step,
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -86,9 +92,10 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_and_C1_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -97,7 +104,10 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global   char *src1, int src1_step,
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -119,9 +129,10 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global   char *src1, int src1_step,
     }
 }
 
-__kernel void arithm_s_bitwise_and_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -131,7 +142,10 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global   ushort *src1, int src1_step
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -150,9 +164,10 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global   ushort *src1, int src1_step
         *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -162,7 +177,10 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global   short *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -181,9 +199,10 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global   short *src1, int src1_step,
         *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_C1_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -202,9 +221,10 @@ __kernel void arithm_s_bitwise_and_C1_D4 (__global   int *src1, int src1_step, i
         *((__global int *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_C1_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -232,11 +252,11 @@ __kernel void arithm_s_bitwise_and_C1_D5 (__global   char *src1, int src1_step,
         *((__global char4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -256,9 +276,10 @@ __kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, i
     }
 }
 #endif
-__kernel void arithm_s_bitwise_and_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -268,7 +289,10 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global   uchar *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -290,9 +314,10 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_and_C2_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -302,7 +327,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global   char *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -322,9 +350,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global   char *src1, int src1_step,
     }
 }
 
-__kernel void arithm_s_bitwise_and_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -343,9 +372,10 @@ __kernel void arithm_s_bitwise_and_C2_D2 (__global   ushort *src1, int src1_step
         *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -364,9 +394,10 @@ __kernel void arithm_s_bitwise_and_C2_D3 (__global   short *src1, int src1_step,
         *((__global short2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -384,9 +415,10 @@ __kernel void arithm_s_bitwise_and_C2_D4 (__global   int *src1, int src1_step, i
         *((__global int2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_C2_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -403,12 +435,13 @@ __kernel void arithm_s_bitwise_and_C2_D5 (__global   char *src1, int src1_step,
         char8 tmp_data = src1_data & src2_data;
 
         *((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
-      }
+    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -428,9 +461,10 @@ __kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, i
     }
 }
 #endif
-__kernel void arithm_s_bitwise_and_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -440,7 +474,10 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global   uchar *src1, int src1_step,
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -484,9 +521,10 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_and_C3_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -496,7 +534,10 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global   char *src1, int src1_step,
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -539,9 +580,10 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global   char *src1, int src1_step,
     }
 }
 
-__kernel void arithm_s_bitwise_and_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -551,7 +593,10 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global   ushort *src1, int src1_step
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -577,21 +622,22 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global   ushort *src1, int src1_step
         data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_and_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -601,7 +647,10 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global   short *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -627,21 +676,22 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global   short *src1, int src1_step,
         data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_and_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -668,14 +718,15 @@ __kernel void arithm_s_bitwise_and_C3_D4 (__global   int *src1, int src1_step, i
         int tmp_data_1 = src1_data_1 & src2_data_1;
         int tmp_data_2 = src1_data_2 & src2_data_2;
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
     }
 }
-__kernel void arithm_s_bitwise_and_C3_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -702,15 +753,16 @@ __kernel void arithm_s_bitwise_and_C3_D5 (__global   char *src1, int src1_step,
         char4 tmp_data_1 = src1_data_1 & src2_data_1;
         char4 tmp_data_2 = src1_data_2 & src2_data_2;
 
-       *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -737,15 +789,16 @@ __kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, i
         short4 tmp_data_1 = src1_data_1 & src2_data_1;
         short4 tmp_data_2 = src1_data_2 & src2_data_2;
 
-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
     }
 }
 #endif
-__kernel void arithm_s_bitwise_and_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -765,9 +818,10 @@ __kernel void arithm_s_bitwise_and_C4_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_and_C4_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -786,9 +840,10 @@ __kernel void arithm_s_bitwise_and_C4_D1 (__global   char *src1, int src1_step,
     }
 }
 
-__kernel void arithm_s_bitwise_and_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -806,9 +861,10 @@ __kernel void arithm_s_bitwise_and_C4_D2 (__global   ushort *src1, int src1_step
         *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -826,9 +882,10 @@ __kernel void arithm_s_bitwise_and_C4_D3 (__global   short *src1, int src1_step,
         *((__global short4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -846,9 +903,10 @@ __kernel void arithm_s_bitwise_and_C4_D4 (__global   int *src1, int src1_step, i
         *((__global int4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_C4_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -869,9 +927,10 @@ __kernel void arithm_s_bitwise_and_C4_D5 (__global   char *src1, int src1_step,
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -897,10 +956,10 @@ __kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, i
         short4 tmp_data_2 = src1_data_2 & src2_data_2;
         short4 tmp_data_3 = src1_data_3 & src2_data_3;
 
-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
-       *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
 
     }
 }
diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
index beafd7e0a7..b739ea1e72 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
@@ -42,20 +42,22 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined (__ATI__)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (__NVIDIA__)
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_and with scalar with mask**************************************/
-__kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -65,7 +67,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global   uchar *src1, int
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -90,10 +95,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global   uchar *src1, int
 }
 
 
-__kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -103,7 +109,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global   char *src1, int s
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -127,10 +136,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global   char *src1, int s
     }
 }
 
-__kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar  *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar  *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -140,7 +150,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global   ushort *src1, int
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -161,10 +174,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global   ushort *src1, int
         *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -174,7 +188,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global   short *src1, int
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -195,10 +212,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global   short *src1, int
         *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global   int   *src1, int src1_step, int src1_offset,
-                                            __global   int   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D4 (
+        __global   int   *src1, int src1_step, int src1_offset,
+        __global   int   *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -223,10 +241,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global   int   *src1, int
     }
 }
 
-__kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
-                                                    __global char *dst,  int dst_step,  int dst_offset,
-                                                    __global   uchar *mask, int mask_step, int mask_offset,
-                                                    char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -252,10 +271,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src
 }
 
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset,
-                                            __global short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -280,10 +300,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int sr
     }
 }
 #endif
-__kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -293,7 +314,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global   uchar *src1, int
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -316,10 +340,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global   uchar *src1, int
 }
 
 
-__kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -329,7 +354,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global   char *src1, int s
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -351,10 +379,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global   char *src1, int s
     }
 }
 
-__kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -378,10 +407,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global   ushort *src1, int
         *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -405,10 +435,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global   short *src1, int
         *((__global short2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -432,10 +463,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global   int *src1, int sr
         *((__global int2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global  char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global  char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -461,10 +493,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global   char *src1, int s
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -489,10 +522,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int sr
     }
 }
 #endif
-__kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -502,7 +536,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global   uchar *src1, int
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -549,10 +586,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global   uchar *src1, int
 }
 
 
-__kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -562,7 +600,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global   char *src1, int s
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -608,10 +649,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global   char *src1, int s
     }
 }
 
-__kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -621,7 +663,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global   ushort *src1, int
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -650,22 +695,23 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global   ushort *src1, int
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -675,7 +721,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global   short *src1, int
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -704,22 +753,23 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global   short *src1, int
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -753,15 +803,16 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global   int *src1, int sr
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -795,16 +846,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global   char *src1, int s
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar  *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -838,16 +890,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int sr
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
     }
 }
 #endif
-__kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -872,10 +925,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global   uchar *src1, int
 }
 
 
-__kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -899,10 +953,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global   char *src1, int s
     }
 }
 
-__kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -925,10 +980,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global   ushort *src1, int
         *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -951,10 +1007,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global   short *src1, int
         *((__global short4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -977,10 +1034,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global   int *src1, int sr
         *((__global int4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                                    __global   char *dst,  int dst_step,  int dst_offset,
-                                                    __global   uchar *mask, int mask_step, int mask_offset,
-                                                    char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -1006,10 +1064,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global   char *src1, int s
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
diff --git a/modules/ocl/src/opencl/arithm_bitwise_not.cl b/modules/ocl/src/opencl/arithm_bitwise_not.cl
index fd9d2ccf99..9905130013 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_not.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_not.cl
@@ -43,9 +43,12 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -61,25 +64,28 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
         uchar4 src1_data = vload4(0, src1 + src1_index_fix);
 
         uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
         uchar4 tmp_data = ~ src1_data;
 
-  /*  if(src1_index < 0)
-    {
-      uchar4 tmp;
-      tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-      src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-  */
+        /*  if(src1_index < 0)
+          {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+          }
+        */
         dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
         dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
         dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
@@ -91,8 +97,8 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
 
 
 __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -101,7 +107,10 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -124,8 +133,8 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
 
 
 __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global ushort *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -135,7 +144,10 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -159,8 +171,8 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
 
 
 __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global short *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -170,7 +182,10 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -194,8 +209,8 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
 
 
 __kernel void arithm_bitwise_not_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global int *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
diff --git a/modules/ocl/src/opencl/arithm_bitwise_or.cl b/modules/ocl/src/opencl/arithm_bitwise_or.cl
index a95e59e0ca..dd7c53c7ff 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_or.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or.cl
@@ -43,7 +43,11 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -51,9 +55,9 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_or without mask**************************************/
 __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global uchar *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -62,29 +66,32 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-      int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-      int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         uchar4 src1_data = vload4(0, src1 + src1_index_fix);
         uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-      if(src1_index < 0)
-      {
-        uchar4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-      }
-      if(src2_index < 0)
-      {
-        uchar4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-      }
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
         uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
         uchar4 tmp_data = src1_data | src2_data;
 
@@ -99,9 +106,9 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
 
 
 __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global char *src2, int src2_step, int src2_offset,
+                                    __global char *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -110,7 +117,10 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -135,9 +145,9 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
 
 
 __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global ushort *src2, int src2_step, int src2_offset,
+                                    __global ushort *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -147,7 +157,10 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -173,9 +186,9 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
 
 
 __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global short *src2, int src2_step, int src2_offset,
+                                    __global short *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -185,7 +198,10 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -211,9 +227,9 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
 
 
 __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global int *src2, int src2_step, int src2_offset,
+                                    __global int *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -233,9 +249,9 @@ __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_
 }
 
 __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global char *src2, int src2_step, int src2_offset,
+                                    __global char *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -256,9 +272,9 @@ __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1
 
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global char *src2, int src2_step, int src2_offset,
+                                    __global char *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
index aedb68c474..0242c8673e 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
@@ -43,18 +43,22 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_or with mask**************************************/
-__kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -64,7 +68,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -91,11 +98,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_
 
 
 
-__kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -105,7 +113,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -132,11 +143,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s
 
 
 
-__kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -146,7 +158,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -171,11 +186,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1
 
 
 
-__kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -185,7 +201,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -198,8 +217,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
         short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
         uchar2  mask_data = vload2(0, mask + mask_index);
 
-    short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-    short2 tmp_data = src1_data | src2_data;
+        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
+        short2 tmp_data = src1_data | src2_data;
 
         data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
         data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -210,11 +229,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
 
 
 
-__kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -242,11 +262,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int   *src1, int src1_
 
 
 
-__kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -273,13 +294,13 @@ __kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_s
 }
 
 
-
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -308,12 +329,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_s
 #endif
 
 
-
-__kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -323,7 +344,10 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -347,11 +371,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_
 }
 
 
-__kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -361,7 +386,10 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -384,11 +412,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s
     }
 }
 
-__kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -413,11 +442,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1
         *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -442,11 +472,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_
         *((__global short2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int    *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int    *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -471,11 +502,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int   *src1, int src1_
         *((__global int2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -501,11 +533,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_s
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -533,12 +566,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_s
 #endif
 
 
-
-__kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -548,7 +581,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -596,11 +632,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
 }
 
 
-__kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -610,7 +647,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -657,11 +697,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
     }
 }
 
-__kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -671,7 +712,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -701,23 +745,24 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -727,7 +772,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -757,23 +805,24 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -808,16 +857,17 @@ __kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int   *src1, int src1_
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -852,17 +902,18 @@ __kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_s
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -897,20 +948,20 @@ __kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_s
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
     }
 }
 #endif
 
 
-
-__kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -937,11 +988,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_
 }
 
 
-__kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_s
     }
 }
 
-__kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1
         *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_
         *((__global short4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int   *src1, int src1_
         *((__global int4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_s
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
-                                                  __global char *src2, int src2_step, int src2_offset,
-                                                  __global uchar  *mask, int mask_step, int mask_offset,
-                                                  __global char *dst,  int dst_step,  int dst_offset,
-                                                  int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
index 5b94591a30..2730f9dada 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
@@ -43,16 +43,21 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************and with scalar without mask**************************************/
-__kernel void arithm_s_bitwise_or_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -61,7 +66,10 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global   uchar *src1, int src1_step,
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -84,9 +92,10 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_or_C1_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -95,7 +104,10 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global   char *src1, int src1_step, i
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -117,9 +129,10 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global   char *src1, int src1_step, i
     }
 }
 
-__kernel void arithm_s_bitwise_or_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -129,7 +142,10 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global   ushort *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -148,9 +164,10 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global   ushort *src1, int src1_step,
         *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -160,7 +177,10 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global   short *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -179,9 +199,10 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global   short *src1, int src1_step,
         *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_C1_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -200,9 +221,10 @@ __kernel void arithm_s_bitwise_or_C1_D4 (__global   int *src1, int src1_step, in
         *((__global int *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_C1_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -222,9 +244,10 @@ __kernel void arithm_s_bitwise_or_C1_D5 (__global   char *src1, int src1_step, i
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -245,10 +268,10 @@ __kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, in
     }
 }
 #endif
-
-__kernel void arithm_s_bitwise_or_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -259,7 +282,10 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global   uchar *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -280,9 +306,10 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_or_C2_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -293,7 +320,10 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global   char *src1, int src1_step, i
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -313,9 +343,10 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global   char *src1, int src1_step, i
     }
 }
 
-__kernel void arithm_s_bitwise_or_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -335,9 +366,10 @@ __kernel void arithm_s_bitwise_or_C2_D2 (__global   ushort *src1, int src1_step,
         *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -358,8 +390,8 @@ __kernel void arithm_s_bitwise_or_C2_D3 (__global   short *src1, int src1_step,
     }
 }
 __kernel void arithm_s_bitwise_or_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -378,9 +410,10 @@ __kernel void arithm_s_bitwise_or_C2_D4 (__global   int *src1, int src1_step, in
         *((__global int2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_C2_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -400,9 +433,10 @@ __kernel void arithm_s_bitwise_or_C2_D5 (__global   char *src1, int src1_step, i
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -423,9 +457,10 @@ __kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, in
     }
 }
 #endif
-__kernel void arithm_s_bitwise_or_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -436,7 +471,10 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global   uchar *src1, int src1_step,
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -480,9 +518,10 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_or_C3_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -493,7 +532,10 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global   char *src1, int src1_step, i
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -536,9 +578,10 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global   char *src1, int src1_step, i
     }
 }
 
-__kernel void arithm_s_bitwise_or_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -549,7 +592,10 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global   ushort *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -575,21 +621,22 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global   ushort *src1, int src1_step,
         data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_or_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -600,7 +647,10 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global   short *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -626,21 +676,22 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global   short *src1, int src1_step,
         data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_or_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -668,14 +719,15 @@ __kernel void arithm_s_bitwise_or_C3_D4 (__global   int *src1, int src1_step, in
         int tmp_data_1 = src1_data_1 | src2_data_1;
         int tmp_data_2 = src1_data_2 | src2_data_2;
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
     }
 }
-__kernel void arithm_s_bitwise_or_C3_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -700,15 +752,16 @@ __kernel void arithm_s_bitwise_or_C3_D5 (__global   char *src1, int src1_step, i
         char4 tmp_data_1 = src1_data_1 | src2_data_1;
         char4 tmp_data_2 = src1_data_2 | src2_data_2;
 
-       *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -736,15 +789,16 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in
         short4 tmp_data_1 = src1_data_1 | src2_data_1;
         short4 tmp_data_2 = src1_data_2 | src2_data_2;
 
-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
     }
 }
 #endif
-__kernel void arithm_s_bitwise_or_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -765,9 +819,10 @@ __kernel void arithm_s_bitwise_or_C4_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_or_C4_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -787,9 +842,10 @@ __kernel void arithm_s_bitwise_or_C4_D1 (__global   char *src1, int src1_step, i
     }
 }
 
-__kernel void arithm_s_bitwise_or_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -808,9 +864,10 @@ __kernel void arithm_s_bitwise_or_C4_D2 (__global   ushort *src1, int src1_step,
         *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -829,9 +886,10 @@ __kernel void arithm_s_bitwise_or_C4_D3 (__global   short *src1, int src1_step,
         *((__global short4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -850,9 +908,10 @@ __kernel void arithm_s_bitwise_or_C4_D4 (__global   int *src1, int src1_step, in
         *((__global int4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_C4_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -874,9 +933,10 @@ __kernel void arithm_s_bitwise_or_C4_D5 (__global   char *src1, int src1_step, i
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -903,10 +963,10 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in
         short4 tmp_data_2 = src1_data_2 | src2_data_2;
         short4 tmp_data_3 = src1_data_3 | src2_data_3;
 
-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
-       *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
 
     }
 }
diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
index 54066c21a0..9184ff706b 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
@@ -43,17 +43,21 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_or with scalar with mask**************************************/
-__kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -64,7 +68,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global   uchar *src1, int s
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -89,10 +96,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global   uchar *src1, int s
 }
 
 
-__kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -103,7 +111,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global   char *src1, int sr
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -127,10 +138,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global   char *src1, int sr
     }
 }
 
-__kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar  *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar  *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -141,7 +153,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global   ushort *src1, int
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -162,10 +177,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global   ushort *src1, int
         *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -176,7 +192,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global   short *src1, int s
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -197,10 +216,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global   short *src1, int s
         *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global   int   *src1, int src1_step, int src1_offset,
-                                            __global   int   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D4 (
+        __global   int   *src1, int src1_step, int src1_offset,
+        __global   int   *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -226,10 +246,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global   int   *src1, int s
     }
 }
 
-__kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global   char   *src1, int src1_step, int src1_offset,
-                                            __global   char   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D5 (
+        __global   char   *src1, int src1_step, int src1_offset,
+        __global   char   *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -254,12 +275,12 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global   char   *src1, int
         *((__global char4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset,
-                                            __global short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -285,10 +306,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src
     }
 }
 #endif
-__kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -299,7 +321,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global   uchar *src1, int s
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -322,10 +347,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global   uchar *src1, int s
 }
 
 
-__kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -336,7 +362,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global   char *src1, int sr
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -358,10 +387,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global   char *src1, int sr
     }
 }
 
-__kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -386,10 +416,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global   ushort *src1, int
         *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -414,10 +445,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global   short *src1, int s
         *((__global short2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -442,10 +474,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global   int *src1, int src
         *((__global int2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -463,17 +496,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global   char *src1, int sr
         char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
         char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
         char8 dst_data = *((__global char8 *)((__global char *)dst  + dst_index));
-          char8 data = src_data1 | src_data2;
+        char8 data = src_data1 | src_data2;
         data = mask_data ? data : dst_data;
         *((__global char8 *)((__global char *)dst + dst_index)) = data;
 
-      }
+    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D6 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -499,10 +533,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global   char *src1, int sr
     }
 }
 #endif
-__kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -513,7 +548,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global   uchar *src1, int s
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -560,10 +598,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global   uchar *src1, int s
 }
 
 
-__kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -574,7 +613,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global   char *src1, int sr
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -620,10 +662,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global   char *src1, int sr
     }
 }
 
-__kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -634,7 +677,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global   ushort *src1, int
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -663,22 +709,23 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global   ushort *src1, int
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -689,7 +736,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global   short *src1, int s
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -718,22 +768,23 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global   short *src1, int s
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -768,15 +819,16 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global   int *src1, int src
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -811,17 +863,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global   char *src1, int sr
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
 
-       }
+    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar  *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -855,16 +908,17 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
     }
 }
 #endif
-__kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -890,10 +944,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global   uchar *src1, int s
 }
 
 
-__kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -918,10 +973,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global   char *src1, int sr
     }
 }
 
-__kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -945,10 +1001,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global   ushort *src1, int
         *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -972,10 +1029,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global   short *src1, int s
         *((__global short4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -999,10 +1057,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global   int *src1, int src
         *((__global int4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 
 {
 
@@ -1029,10 +1088,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global   char *src1, int sr
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor.cl b/modules/ocl/src/opencl/arithm_bitwise_xor.cl
index 4f743776a4..4b34af152c 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_xor.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor.cl
@@ -43,17 +43,20 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_xor without mask**************************************/
 __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global uchar *src2, int src2_step, int src2_offset,
+                                     __global uchar *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -62,7 +65,10 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -70,23 +76,23 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
 
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         uchar4 src1_data = vload4(0, src1 + src1_index_fix);
         uchar4 src2_data = vload4(0, src2 + src2_index_fix);
 
-     if(src1_index < 0)
-     {
-        uchar4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        uchar4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
         uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
         uchar4 tmp_data = src1_data ^ src2_data;
 
@@ -101,9 +107,9 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
 
 
 __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -112,7 +118,10 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -120,23 +129,23 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
 
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         char4 src1_data = vload4(0, src1 + src1_index_fix);
         char4 src2_data = vload4(0, src2 + src2_index_fix);
 
-     if(src1_index < 0)
-     {
-        char4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        char4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            char4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            char4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
         char4 dst_data = *((__global char4 *)(dst + dst_index));
         char4 tmp_data = src1_data ^ src2_data;
 
@@ -151,9 +160,9 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
 
 
 __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global ushort *src2, int src2_step, int src2_offset,
+                                     __global ushort *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -163,7 +172,10 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -171,23 +183,23 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
 
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
         ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
 
-     if(src1_index < 0)
-     {
-        ushort4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        ushort4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
         ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
         ushort4 tmp_data = src1_data ^ src2_data;
 
@@ -203,9 +215,9 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
 
 
 __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global short *src2, int src2_step, int src2_offset,
+                                     __global short *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -215,7 +227,10 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -223,25 +238,25 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
         int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
         int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
 
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
         short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
 
         short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
 
-     if(src1_index < 0)
-     {
-        short4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        short4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
 
 
 
@@ -259,9 +274,9 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
 
 
 __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global int *src2, int src2_step, int src2_offset,
+                                     __global int *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -281,9 +296,9 @@ __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1
 }
 
 __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -301,12 +316,11 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src
         *((__global char4 *)((__global char *)dst + dst_index)) = tmp;
     }
 }
-
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
index 4359d860a5..25ed0113a7 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
@@ -43,18 +43,22 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_xor with mask**************************************/
-__kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -64,7 +68,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -91,11 +98,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1
 
 
 
-__kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -105,7 +113,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -132,11 +143,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_
 
 
 
-__kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -146,7 +158,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -171,11 +186,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src
 
 
 
-__kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -185,7 +201,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -198,8 +217,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
         short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
         uchar2  mask_data = vload2(0, mask + mask_index);
 
-    short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-    short2 tmp_data = src1_data ^ src2_data;
+        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
+        short2 tmp_data = src1_data ^ src2_data;
 
         data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
         data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -210,11 +229,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
 
 
 
-__kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -242,11 +262,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int   *src1, int src1
 
 
 
-__kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -273,13 +294,13 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_
 }
 
 
-
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -308,12 +329,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_
 
 
 
-
-__kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -323,7 +344,10 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -347,11 +371,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1
 }
 
 
-__kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -361,7 +386,10 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -384,11 +412,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_
     }
 }
 
-__kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -413,11 +442,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src
         *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -442,11 +472,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1
         *((__global short2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int    *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int    *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -471,11 +502,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int   *src1, int src1
         *((__global int2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -501,11 +533,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -533,12 +566,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_
 #endif
 
 
-
-__kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -548,7 +581,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -596,11 +632,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
 }
 
 
-__kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -610,7 +647,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -657,11 +697,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
     }
 }
 
-__kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -671,7 +712,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -701,23 +745,24 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -727,7 +772,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -757,23 +805,24 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -808,16 +857,17 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int   *src1, int src1
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -852,17 +902,18 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -897,20 +948,20 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
     }
 }
 #endif
 
 
-
-__kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -937,11 +988,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1
 }
 
 
-__kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_
     }
 }
 
-__kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src
         *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1
         *((__global short4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int   *src1, int src1
         *((__global int4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
-                                                  __global char *src2, int src2_step, int src2_offset,
-                                                  __global uchar  *mask, int mask_step, int mask_offset,
-                                                  __global char *dst,  int dst_step,  int dst_offset,
-                                                  int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
index 318432a186..4fe1cc31e8 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
@@ -42,19 +42,21 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //
-#if defined (__ATI__)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (__NVIDIA__)
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************xor with scalar without mask**************************************/
-__kernel void arithm_s_bitwise_xor_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -63,7 +65,10 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global   uchar *src1, int src1_step,
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -86,9 +91,10 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_xor_C1_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -97,7 +103,10 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global   char *src1, int src1_step,
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -119,9 +128,10 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global   char *src1, int src1_step,
     }
 }
 
-__kernel void arithm_s_bitwise_xor_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -131,7 +141,10 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global   ushort *src1, int src1_step
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -150,9 +163,10 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global   ushort *src1, int src1_step
         *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -162,7 +176,10 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global   short *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -181,9 +198,10 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global   short *src1, int src1_step,
         *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_C1_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -202,9 +220,10 @@ __kernel void arithm_s_bitwise_xor_C1_D4 (__global   int *src1, int src1_step, i
         *((__global int *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_C1_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -234,9 +253,10 @@ __kernel void arithm_s_bitwise_xor_C1_D5 (__global   char *src1, int src1_step,
 }
 
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -256,9 +276,10 @@ __kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, i
     }
 }
 #endif
-__kernel void arithm_s_bitwise_xor_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -268,7 +289,10 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global   uchar *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -290,9 +314,10 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_xor_C2_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -302,7 +327,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global   char *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -322,9 +350,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global   char *src1, int src1_step,
     }
 }
 
-__kernel void arithm_s_bitwise_xor_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -343,9 +372,10 @@ __kernel void arithm_s_bitwise_xor_C2_D2 (__global   ushort *src1, int src1_step
         *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -364,9 +394,10 @@ __kernel void arithm_s_bitwise_xor_C2_D3 (__global   short *src1, int src1_step,
         *((__global short2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -384,9 +415,10 @@ __kernel void arithm_s_bitwise_xor_C2_D4 (__global   int *src1, int src1_step, i
         *((__global int2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_C2_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -403,12 +435,13 @@ __kernel void arithm_s_bitwise_xor_C2_D5 (__global   char *src1, int src1_step,
         char8 tmp_data = src1_data ^ src2_data;
 
         *((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
-      }
+    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -428,9 +461,10 @@ __kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, i
     }
 }
 #endif
-__kernel void arithm_s_bitwise_xor_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -440,7 +474,10 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global   uchar *src1, int src1_step,
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -484,9 +521,10 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_xor_C3_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -496,7 +534,10 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global   char *src1, int src1_step,
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -539,9 +580,10 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global   char *src1, int src1_step,
     }
 }
 
-__kernel void arithm_s_bitwise_xor_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -551,7 +593,10 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global   ushort *src1, int src1_step
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -577,21 +622,22 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global   ushort *src1, int src1_step
         data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_xor_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -601,7 +647,10 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global   short *src1, int src1_step,
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -627,21 +676,22 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global   short *src1, int src1_step,
         data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_xor_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -668,14 +718,15 @@ __kernel void arithm_s_bitwise_xor_C3_D4 (__global   int *src1, int src1_step, i
         int tmp_data_1 = src1_data_1 ^ src2_data_1;
         int tmp_data_2 = src1_data_2 ^ src2_data_2;
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
     }
 }
-__kernel void arithm_s_bitwise_xor_C3_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -702,15 +753,16 @@ __kernel void arithm_s_bitwise_xor_C3_D5 (__global   char *src1, int src1_step,
         char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
         char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
 
-       *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -737,15 +789,16 @@ __kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, i
         short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
         short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
 
-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
     }
 }
 #endif
-__kernel void arithm_s_bitwise_xor_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -765,9 +818,10 @@ __kernel void arithm_s_bitwise_xor_C4_D0 (__global   uchar *src1, int src1_step,
 }
 
 
-__kernel void arithm_s_bitwise_xor_C4_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -786,9 +840,10 @@ __kernel void arithm_s_bitwise_xor_C4_D1 (__global   char *src1, int src1_step,
     }
 }
 
-__kernel void arithm_s_bitwise_xor_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -806,9 +861,10 @@ __kernel void arithm_s_bitwise_xor_C4_D2 (__global   ushort *src1, int src1_step
         *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -826,9 +882,10 @@ __kernel void arithm_s_bitwise_xor_C4_D3 (__global   short *src1, int src1_step,
         *((__global short4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -846,9 +903,10 @@ __kernel void arithm_s_bitwise_xor_C4_D4 (__global   int *src1, int src1_step, i
         *((__global int4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_C4_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -869,9 +927,10 @@ __kernel void arithm_s_bitwise_xor_C4_D5 (__global   char *src1, int src1_step,
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -897,11 +956,11 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, i
         short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
         short4 tmp_data_3 = src1_data_3 ^ src2_data_3;
 
-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
-       *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
 
     }
 }
-#endif
+#endif
\ No newline at end of file
diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
index 57ad9ee713..06672b8c37 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
@@ -42,20 +42,23 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined (__ATI__)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (__NVIDIA__)
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_xor with scalar with mask**************************************/
-__kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -65,7 +68,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global   uchar *src1, int
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -90,10 +96,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global   uchar *src1, int
 }
 
 
-__kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -103,7 +110,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global   char *src1, int s
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -127,10 +137,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global   char *src1, int s
     }
 }
 
-__kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar  *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar  *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -140,7 +151,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global   ushort *src1, int
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -161,10 +175,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global   ushort *src1, int
         *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -174,7 +189,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global   short *src1, int
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -195,10 +213,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global   short *src1, int
         *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (__global   int   *src1, int src1_step, int src1_offset,
-                                            __global   int   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (
+        __global   int   *src1, int src1_step, int src1_offset,
+        __global   int   *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -223,10 +242,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (__global   int   *src1, int
     }
 }
 
-__kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
-                                                    __global char *dst,  int dst_step,  int dst_offset,
-                                                    __global   uchar *mask, int mask_step, int mask_offset,
-                                                    char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -252,10 +272,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src
 }
 
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset,
-                                            __global short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -280,10 +301,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (__global short *src1, int sr
     }
 }
 #endif
-__kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -293,7 +315,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global   uchar *src1, int
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -316,10 +341,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global   uchar *src1, int
 }
 
 
-__kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -329,7 +355,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global   char *src1, int s
     {
         x = x << 1;
 
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -351,10 +380,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global   char *src1, int s
     }
 }
 
-__kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -378,10 +408,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (__global   ushort *src1, int
         *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -405,10 +436,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (__global   short *src1, int
         *((__global short2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -432,10 +464,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (__global   int *src1, int sr
         *((__global int2 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global  char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global  char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -461,10 +494,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (__global   char *src1, int s
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -489,10 +523,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (__global short *src1, int sr
     }
 }
 #endif
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -502,7 +537,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global   uchar *src1, int
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -549,10 +587,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global   uchar *src1, int
 }
 
 
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -562,7 +601,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global   char *src1, int s
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
         int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -608,10 +650,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global   char *src1, int s
     }
 }
 
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -621,7 +664,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global   ushort *src1, int
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -650,22 +696,23 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global   ushort *src1, int
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -675,7 +722,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global   short *src1, int
     {
         x = x << 1;
 
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
         int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -704,22 +754,23 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global   short *src1, int
         data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
 
         data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
         data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
 
         data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
 
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -753,15 +804,16 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (__global   int *src1, int sr
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -795,16 +847,17 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (__global   char *src1, int s
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar  *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -838,16 +891,17 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int sr
         data_1 = mask_data ? tmp_data_1 : data_1;
         data_2 = mask_data ? tmp_data_2 : data_2;
 
-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
     }
 }
 #endif
-__kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -872,10 +926,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (__global   uchar *src1, int
 }
 
 
-__kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -899,10 +954,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (__global   char *src1, int s
     }
 }
 
-__kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -925,10 +981,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (__global   ushort *src1, int
         *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -951,10 +1008,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (__global   short *src1, int
         *((__global short4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -977,10 +1035,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (__global   int *src1, int sr
         *((__global int4 *)((__global char *)dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                                    __global   char *dst,  int dst_step,  int dst_offset,
-                                                    __global   uchar *mask, int mask_step, int mask_offset,
-                                                    char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
@@ -1006,10 +1065,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (__global   char *src1, int s
     }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {
 
     int x = get_global_id(0);
diff --git a/modules/ocl/src/opencl/arithm_compare_eq.cl b/modules/ocl/src/opencl/arithm_compare_eq.cl
index f818532ba2..16a56acef3 100644
--- a/modules/ocl/src/opencl/arithm_compare_eq.cl
+++ b/modules/ocl/src/opencl/arithm_compare_eq.cl
@@ -43,7 +43,11 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -51,9 +55,9 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 
 __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global uchar *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -62,7 +66,10 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -102,9 +109,9 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
 
 
 __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global ushort *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -114,7 +121,10 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1)& 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -153,9 +163,9 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
 
 
 __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global short *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -165,7 +175,10 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -207,9 +220,9 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
 
 
 __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global int *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -217,7 +230,10 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -227,7 +243,7 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
         int src1_index_fix = src1_index < 0 ? 0 : src1_index;
         int src2_index_fix = src2_index < 0 ? 0 : src2_index;
 
-         int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
+        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
         int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
         if(src1_index < 0)
         {
@@ -255,9 +271,9 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
 }
 
 __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global float *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -265,7 +281,10 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -275,7 +294,8 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
         int src1_index_fix = src1_index < 0 ? 0 : src1_index;
         int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));       if(src2_index < 0)
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+        if(src2_index < 0)
         {
             float4 tmp;
             tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
@@ -297,9 +317,9 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
 
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int src1_offset,
-                             __global double *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global double *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -307,7 +327,10 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)
         int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
         int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
 
@@ -347,9 +370,9 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
 
 /***********************************Compare GT**************************/
 __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global uchar *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -358,7 +381,10 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -397,9 +423,9 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
 }
 
 __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global ushort *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -409,7 +435,10 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -450,9 +479,9 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
 
 
 __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global short *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -462,7 +491,10 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -501,9 +533,9 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
 }
 
 __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global int *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -511,7 +543,10 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -521,7 +556,7 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
         int src1_index_fix = src1_index < 0 ? 0 : src1_index;
         int src2_index_fix = src2_index < 0 ? 0 : src2_index;
 
-         int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
+        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
         int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
         if(src1_index < 0)
         {
@@ -550,9 +585,9 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
 }
 
 __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global float *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -560,7 +595,10 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -599,9 +637,9 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
 
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int src1_offset,
-                             __global double *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global double *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -609,7 +647,10 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)
         int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
         int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
 
@@ -649,9 +690,9 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
 
 /***********************************Compare GE**************************/
 __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global uchar *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -660,7 +701,10 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -702,9 +746,9 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
 
 
 __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global ushort *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -714,7 +758,10 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -757,9 +804,9 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
 
 
 __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global short *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -769,7 +816,10 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1)& 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -809,9 +859,9 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
 }
 
 __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global int *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -820,7 +870,10 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 2)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2)& 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -845,7 +898,7 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
             tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
             src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
         }
-       uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
         uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
 
         dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@@ -858,9 +911,9 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
 }
 
 __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global float *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -869,7 +922,10 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 2)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2)& 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -909,9 +965,9 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
 
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int src1_offset,
-                             __global double *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global double *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -920,7 +976,10 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 3)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3)& 3)
         int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
         int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
 
@@ -942,7 +1001,8 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
             double4 tmp;
             tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
             src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }               uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+        }
+        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
         uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
 
         dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
diff --git a/modules/ocl/src/opencl/arithm_compare_ne.cl b/modules/ocl/src/opencl/arithm_compare_ne.cl
index 713dc13169..fb5859d3b2 100644
--- a/modules/ocl/src/opencl/arithm_compare_ne.cl
+++ b/modules/ocl/src/opencl/arithm_compare_ne.cl
@@ -43,13 +43,17 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 /***********************************Compare NE*******************************/
 __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global uchar *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -58,7 +62,10 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -98,9 +105,9 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
 
 
 __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global ushort *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -110,7 +117,10 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1)& 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -150,9 +160,9 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
 
 
 __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global short *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -162,7 +172,10 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1)& 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -200,9 +213,9 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
 }
 
 __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global int *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -210,7 +223,10 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 2)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2)& 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -249,9 +265,9 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
 }
 
 __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global float *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -259,7 +275,10 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -269,7 +288,8 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
         int src1_index_fix = src1_index < 0 ? 0 : src1_index;
         int src2_index_fix = src2_index < 0 ? 0 : src2_index;
         float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));       if(src1_index < 0)
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+        if(src1_index < 0)
         {
             float4 tmp;
             tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
@@ -282,7 +302,7 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
             src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
         }
 
-       uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
         uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
 
         dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@@ -296,9 +316,9 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
 
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int src1_offset,
-                             __global double *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global double *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -306,7 +326,10 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)
         int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
         int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
 
@@ -347,9 +370,9 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
 
 /***********************************Compare LT*******************************/
 __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global  uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global uchar *src2, int src2_step, int src2_offset,
+                                    __global  uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -358,7 +381,10 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -398,9 +424,9 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
 
 
 __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global ushort *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -410,7 +436,10 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -451,9 +480,9 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
 
 
 __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global short *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -463,7 +492,10 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -502,9 +534,9 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
 }
 
 __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global int *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -512,7 +544,10 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -554,9 +589,9 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
 }
 
 __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global float *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -564,7 +599,10 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -589,7 +627,7 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
         }
 
 
-       uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
         uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
 
         dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@@ -603,9 +641,9 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
 
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int src1_offset,
-                             __global double *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global double *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -613,7 +651,10 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)
         int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
         int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
 
@@ -638,7 +679,7 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
         }
 
 
-       uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
         uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
 
         dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@@ -653,9 +694,9 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
 
 /***********************************Compare LE*******************************/
 __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global uchar *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -664,7 +705,10 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -705,9 +749,9 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
 
 
 __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global ushort *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -717,7 +761,10 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -758,9 +805,9 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
 
 
 __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global short *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 
 {
     int x = get_global_id(0);
@@ -770,7 +817,10 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -809,9 +859,9 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
 }
 
 __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global int *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -819,7 +869,10 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 2)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2)& 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -857,9 +910,9 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
 }
 
 __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global float *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -867,7 +920,10 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 2)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2)& 3)
         int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
         int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
 
@@ -905,9 +961,9 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
 
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int src1_offset,
-                             __global double *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global double *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -915,7 +971,10 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
     if (x < cols && y < rows)
     {
         x = x << 2;
-        #define dst_align ((dst_offset >> 3)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3)& 3)
         int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
         int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
 
diff --git a/modules/ocl/src/opencl/arithm_div.cl b/modules/ocl/src/opencl/arithm_div.cl
index dcbe303106..1dce3853ff 100644
--- a/modules/ocl/src/opencl/arithm_div.cl
+++ b/modules/ocl/src/opencl/arithm_div.cl
@@ -44,7 +44,11 @@
 //M*/
 
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 typedef double F ;
 typedef double4 F4;
 #define convert_F4 convert_double4
@@ -56,34 +60,24 @@ typedef float4 F4;
 #define convert_F  float
 #endif
 
-uchar round2_uchar(F v){
-
-    uchar v1 = convert_uchar_sat(round(v));
-    //uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
-
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
+inline uchar round2_uchar(F v)
+{
+    return convert_uchar_sat(round(v));
 }
 
-ushort round2_ushort(F v){
-
-    ushort v1 = convert_ushort_sat(round(v));
-    //ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
-
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
+inline ushort round2_ushort(F v)
+{
+    return convert_ushort_sat(round(v));
 }
-short round2_short(F v){
-
-    short v1 = convert_short_sat(round(v));
-    //short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
 
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
+inline short round2_short(F v)
+{
+    return convert_short_sat(round(v));
 }
-int round2_int(F v){
-
-    int v1 = convert_int_sat(round(v));
-    //int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
 
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
+inline int round2_int(F v)
+{
+    return convert_int_sat(round(v));
 }
 ///////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////divide///////////////////////////////////////////////////
@@ -94,39 +88,41 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
                              __global uchar *dst,  int dst_step,  int dst_offset,
                              int rows, int cols, int dst_step1, F scalar)
 {
-    int x = get_global_id(0);
-    int y = get_global_id(1);
+    int2 coor = (int2)(get_global_id(0), get_global_id(1));
 
-    if (x < cols && y < rows)
+    if (coor.x < cols && coor.y < rows)
     {
-        x = x << 2;
+        coor.x = coor.x << 2;
 
-        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
+        int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align),
+                                mad24(coor.y, src2_step, coor.x + src2_offset - dst_align));
 
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+        int4 dst_args  = (int4)(mad24(coor.y, dst_step, dst_offset),
+                                mad24(coor.y, dst_step, dst_offset + dst_step1),
+                                mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc),
+                                0);
 
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-        uchar4 dst_data  = *((__global uchar4 *)(dst + dst_index));
+        uchar4 src1_data = vload4(0, src1 + src_index.x);
+        uchar4 src2_data = vload4(0, src2 + src_index.y);
+        uchar4 dst_data  = *((__global uchar4 *)(dst + dst_args.z));
 
         F4 tmp      = convert_F4(src1_data) * scalar;
-
         uchar4 tmp_data;
-        tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (F)src2_data.x);
-        tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (F)src2_data.y);
-        tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (F)src2_data.z);
-        tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (F)src2_data.w);
+        tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x);
+        tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y);
+        tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z);
+        tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w);
 
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
+        dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x;
+        dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y;
+        dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z;
+        dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? tmp_data.w : dst_data.w;
 
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
+        *((__global uchar4 *)(dst + dst_args.z)) = dst_data;
     }
 }
 
@@ -142,7 +138,10 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -182,7 +181,10 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -297,7 +299,10 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src_index = mad24(y, src_step, x + src_offset - dst_align);
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -333,7 +338,10 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
@@ -368,7 +376,10 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
 
         int dst_start  = mad24(y, dst_step, dst_offset);
diff --git a/modules/ocl/src/opencl/arithm_flip.cl b/modules/ocl/src/opencl/arithm_flip.cl
index f4925244a5..944442b0f1 100644
--- a/modules/ocl/src/opencl/arithm_flip.cl
+++ b/modules/ocl/src/opencl/arithm_flip.cl
@@ -44,7 +44,11 @@
 //M*/
 
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -61,7 +65,10 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src_index_0 = mad24(y,            src_step, x + src_offset - dst_align);
         int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
 
@@ -116,7 +123,10 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src_index_0 = mad24(y,            src_step, x + src_offset - dst_align);
         int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
 
@@ -158,7 +168,10 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset >> 1) & 3) << 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset >> 1) & 3) << 1)
         int src_index_0 = mad24(y,            src_step, (x << 1) + src_offset - dst_align);
         int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
 
@@ -200,7 +213,10 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of
     {
         x = x << 2;
 
-        #define dst_align (((dst_offset >> 1) & 3) << 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset >> 1) & 3) << 1)
         int src_index_0 = mad24(y,            src_step, (x << 1) + src_offset - dst_align);
         int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
 
diff --git a/modules/ocl/src/opencl/arithm_mul.cl b/modules/ocl/src/opencl/arithm_mul.cl
index f9f3936a46..b2a11d710e 100644
--- a/modules/ocl/src/opencl/arithm_mul.cl
+++ b/modules/ocl/src/opencl/arithm_mul.cl
@@ -16,7 +16,6 @@
 //
 // @Authors
 //    Jia Haipeng, jiahaipeng95@gmail.com
-//    Dachuan Zhao, dachuan@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -44,11 +43,16 @@
 //
 //M*/
 
-#if defined DOUBLE_SUPPORT
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
-int4 round_int4(float4 v){
+int4 round_int4(float4 v)
+{
     v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
     v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
     v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
@@ -56,7 +60,8 @@ int4 round_int4(float4 v){
 
     return convert_int4_sat(v);
 }
-uint4 round_uint4(float4 v){
+uint4 round_uint4(float4 v)
+{
     v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
     v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
     v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
@@ -64,7 +69,8 @@ uint4 round_uint4(float4 v){
 
     return convert_uint4_sat(v);
 }
-long round_int(float v){
+long round_int(float v)
+{
     v = v + (v > 0 ? 0.5 : -0.5);
 
     return convert_int_sat(v);
@@ -85,7 +91,10 @@ __kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offse
     {
         x = x << 2;
 
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
         int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
         int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
 
@@ -130,7 +139,10 @@ __kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offs
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -166,7 +178,10 @@ __kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offse
     {
         x = x << 2;
 
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -263,8 +278,8 @@ __kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offs
 #endif
 
 __kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, float scalar)
+                              __global float *dst,  int dst_step,  int dst_offset,
+                              int rows, int cols, int dst_step1, float scalar)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
diff --git a/modules/ocl/src/opencl/stereobp.cl b/modules/ocl/src/opencl/stereobp.cl
new file mode 100644
index 0000000000..3993acae08
--- /dev/null
+++ b/modules/ocl/src/opencl/stereobp.cl
@@ -0,0 +1,380 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Peng Xiao,   pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other GpuMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+
+#endif
+
+#ifdef T_FLOAT
+#define T float
+#else
+#define T short
+#endif
+
+///////////////////////////////////////////////////////////////
+/////////////////common///////////////////////////////////////
+/////////////////////////////////////////////////////////////
+T saturate_cast(float v){
+#ifdef T_SHORT
+    return convert_short_sat_rte(v);
+#else
+    return v;
+#endif
+}
+
+#define FLOAT_MAX 3.402823466e+38f
+typedef struct
+{
+    int   cndisp;
+    float cmax_data_term;
+    float cdata_weight;
+    float cmax_disc_term;
+    float cdisc_single_jump;
+}con_srtuct_t;
+///////////////////////////////////////////////////////////////
+////////////////////////// comp data //////////////////////////
+///////////////////////////////////////////////////////////////
+
+float pix_diff_1(__global const uchar *ls, __global const uchar *rs)
+{
+    return abs((int)(*ls) - *rs);
+}
+
+float pix_diff_3(__global const uchar *ls, __global const uchar *rs)
+{
+    const float tr = 0.299f;
+    const float tg = 0.587f;
+    const float tb = 0.114f;
+
+    float val;
+
+    val =  tb * abs((int)ls[0] - rs[0]);
+    val += tg * abs((int)ls[1] - rs[1]);
+    val += tr * abs((int)ls[2] - rs[2]);
+
+    return val;
+}
+float pix_diff_4(__global const uchar *ls, __global const uchar *rs)
+{
+    uchar4 l, r;
+    l = *((__global uchar4 *)ls);
+    r = *((__global uchar4 *)rs);
+
+    const float tr = 0.299f;
+    const float tg = 0.587f;
+    const float tb = 0.114f;
+
+    float val;
+
+    val  = tb * abs((int)l.x - r.x);
+    val += tg * abs((int)l.y - r.y);
+    val += tr * abs((int)l.z - r.z);
+
+    return val;
+}
+
+
+#ifndef CN
+#define CN 4
+#endif
+
+#define CAT(X,Y) X##Y
+#define CAT2(X,Y) CAT(X,Y)
+
+#define PIX_DIFF CAT2(pix_diff_, CN)
+
+__kernel void comp_data(__global uchar *left,  int left_rows,  int left_cols,  int left_step,
+                        __global uchar *right, int right_step,
+                        __global T *data, int data_step,
+                        __constant con_srtuct_t *con_st)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y > 0 && y < (left_rows - 1) && x > 0 && x < (left_cols - 1))
+    {
+        data_step /= sizeof(T);
+        const __global uchar* ls = left  + y * left_step  + x * CN;
+        const __global uchar* rs = right + y * right_step + x * CN;
+
+        __global T *ds = data + y * data_step + x;
+
+        const unsigned int disp_step = data_step * left_rows;
+
+        for (int disp = 0; disp < con_st -> cndisp; disp++)
+        {
+            if (x - disp >= 1)
+            {
+                float val = 0;
+                val = PIX_DIFF(ls, rs - disp * CN);
+                ds[disp * disp_step] =  saturate_cast(fmin(con_st -> cdata_weight * val,
+                    con_st -> cdata_weight * con_st -> cmax_data_term));
+            }
+            else
+            {
+                ds[disp * disp_step] =  saturate_cast(con_st -> cdata_weight * con_st -> cmax_data_term);
+            }
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////
+//////////////////////// data step down ///////////////////////
+///////////////////////////////////////////////////////////////
+__kernel void data_step_down(__global T *src, int src_rows,
+                             __global T *dst, int dst_rows, int dst_cols,
+                             int src_step, int dst_step,
+                             int cndisp)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (x < dst_cols && y < dst_rows)
+    {
+        src_step /= sizeof(T);
+        dst_step /= sizeof(T);
+        for (int d = 0; d < cndisp; ++d)
+        {
+            float dst_reg;
+            dst_reg  = src[(d * src_rows + (2*y+0)) * src_step + 2*x+0];
+            dst_reg += src[(d * src_rows + (2*y+1)) * src_step + 2*x+0];
+            dst_reg += src[(d * src_rows + (2*y+0)) * src_step + 2*x+1];
+            dst_reg += src[(d * src_rows + (2*y+1)) * src_step + 2*x+1];
+
+            dst[(d * dst_rows + y) * dst_step + x] = saturate_cast(dst_reg);
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////
+/////////////////// level up messages  ////////////////////////
+///////////////////////////////////////////////////////////////
+__kernel void level_up_message(__global T *src, int src_rows, int src_step,
+                               __global T *dst, int dst_rows, int dst_cols, int dst_step,
+                               int cndisp)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (x < dst_cols && y < dst_rows)
+    {
+        src_step /= sizeof(T);
+        dst_step /= sizeof(T);
+
+        const int dst_disp_step = dst_step * dst_rows;
+        const int src_disp_step = src_step * src_rows;
+
+        __global T       *dstr = dst + y * dst_step + x;
+        __global const T *srcr = src + (y / 2 * src_step) + (x / 2);
+
+        for (int d = 0; d < cndisp; ++d)
+            dstr[d * dst_disp_step] = srcr[d * src_disp_step];
+    }
+}
+
+///////////////////////////////////////////////////////////////
+////////////////////  calc all iterations /////////////////////
+///////////////////////////////////////////////////////////////
+void calc_min_linear_penalty(__global T * dst, int disp_step,
+                             int cndisp, float cdisc_single_jump)
+{
+    float prev = dst[0];
+    float cur;
+
+    for (int disp = 1; disp < cndisp; ++disp)
+    {
+        prev += cdisc_single_jump;
+        cur = dst[disp_step * disp];
+
+        if (prev < cur)
+        {
+            cur = prev;
+            dst[disp_step * disp] = saturate_cast(prev);
+        }
+
+        prev = cur;
+    }
+
+    prev = dst[(cndisp - 1) * disp_step];
+    for (int disp = cndisp - 2; disp >= 0; disp--)
+    {
+        prev += cdisc_single_jump;
+        cur = dst[disp_step * disp];
+
+        if (prev < cur)
+        {
+            cur = prev;
+            dst[disp_step * disp] = saturate_cast(prev);
+        }
+        prev = cur;
+    }
+}
+void message(const __global T *msg1, const __global T *msg2,
+             const __global T *msg3, const __global T *data, __global T *dst,
+             int msg_disp_step, int data_disp_step, int cndisp, float cmax_disc_term, float cdisc_single_jump)
+{
+    float minimum = FLOAT_MAX;
+
+    for(int i = 0; i < cndisp; ++i)
+    {
+        float dst_reg;
+        dst_reg  = msg1[msg_disp_step * i];
+        dst_reg += msg2[msg_disp_step * i];
+        dst_reg += msg3[msg_disp_step * i];
+        dst_reg += data[data_disp_step * i];
+
+        if (dst_reg < minimum)
+            minimum = dst_reg;
+
+        dst[msg_disp_step * i] = saturate_cast(dst_reg);
+    }
+
+    calc_min_linear_penalty(dst, msg_disp_step, cndisp, cdisc_single_jump);
+
+    minimum += cmax_disc_term;
+
+    float sum = 0;
+    for(int i = 0; i < cndisp; ++i)
+    {
+        float dst_reg = dst[msg_disp_step * i];
+        if (dst_reg > minimum)
+        {
+            dst_reg = minimum;
+            dst[msg_disp_step * i] = saturate_cast(minimum);
+        }
+        sum += dst_reg;
+    }
+    sum /= cndisp;
+
+    for(int i = 0; i < cndisp; ++i)
+        dst[msg_disp_step * i] -= sum;
+}
+__kernel void one_iteration(__global T *u,    int u_step,
+                            __global T *data, int data_step,
+                            __global T *d,    __global T *l, __global T *r,
+                            int t, int cols, int rows,
+                            int cndisp, float cmax_disc_term, float cdisc_single_jump)
+{
+    const int y = get_global_id(1);
+    const int x = ((get_global_id(0)) << 1) + ((y + t) & 1);
+
+    if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
+    {
+        u_step    /= sizeof(T);
+        data_step /= sizeof(T);
+
+        __global T *us = u + y * u_step + x;
+        __global T *ds = d + y * u_step + x;
+        __global T *ls = l + y * u_step + x;
+        __global T *rs = r + y * u_step + x;
+        const __global  T *dt = data + y * data_step + x;
+
+        int msg_disp_step = u_step * rows;
+        int data_disp_step = data_step * rows;
+
+        message(us + u_step, ls      + 1, rs - 1, dt, us, msg_disp_step, data_disp_step, cndisp,
+            cmax_disc_term, cdisc_single_jump);
+        message(ds - u_step, ls      + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step, cndisp,
+            cmax_disc_term, cdisc_single_jump);
+
+        message(us + u_step, ds - u_step, rs - 1, dt, rs, msg_disp_step, data_disp_step, cndisp,
+            cmax_disc_term, cdisc_single_jump);
+        message(us + u_step, ds - u_step, ls + 1, dt, ls, msg_disp_step, data_disp_step, cndisp,
+            cmax_disc_term, cdisc_single_jump);
+    }
+}
+
+///////////////////////////////////////////////////////////////
+/////////////////////////// output ////////////////////////////
+///////////////////////////////////////////////////////////////
+__kernel void output(const __global T *u, int u_step,
+                     const __global T *d, const __global T *l,
+                     const __global T *r, const __global T *data,
+                     __global T *disp, int disp_rows, int disp_cols, int disp_step,
+                     int cndisp)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (y > 0 && y < disp_rows - 1 && x > 0 && x < disp_cols - 1)
+    {
+        u_step    /= sizeof(T);
+        disp_step /= sizeof(T);
+        const __global T *us = u + (y + 1) * u_step + x;
+        const __global T *ds = d + (y - 1) * u_step + x;
+        const __global T *ls = l + y * u_step + (x + 1);
+        const __global T *rs = r + y * u_step + (x - 1);
+        const __global T *dt = data + y * u_step + x;
+
+        int disp_steps = disp_rows * u_step;
+
+        int best = 0;
+        float best_val = FLOAT_MAX;
+        for (int d = 0; d < cndisp; ++d)
+        {
+            float val;
+            val  = us[d * disp_steps];
+            val += ds[d * disp_steps];
+            val += ls[d * disp_steps];
+            val += rs[d * disp_steps];
+            val += dt[d * disp_steps];
+
+            if (val < best_val)
+            {
+                best_val = val;
+                best = d;
+            }
+        }
+
+        (disp + y * disp_step)[x] = convert_short_sat(best);
+    }
+}
diff --git a/modules/ocl/src/stereobp.cpp b/modules/ocl/src/stereobp.cpp
new file mode 100644
index 0000000000..50072c2d57
--- /dev/null
+++ b/modules/ocl/src/stereobp.cpp
@@ -0,0 +1,517 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Peng Xiao,   pengxiao@outlook.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <vector>
+#include <cstdio>
+
+using namespace cv;
+using namespace cv::ocl;
+
+////////////////////////////////////////////////////////////////////////
+///////////////// stereoBP /////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+namespace cv
+{
+    namespace ocl
+    {
+
+        ///////////////////////////OpenCL kernel strings///////////////////////////
+        extern const char *stereobp;
+    }
+
+}
+namespace cv
+{
+    namespace ocl
+    {
+        namespace stereoBP
+        {
+            //////////////////////////////////////////////////////////////////////////
+            //////////////////////////////common////////////////////////////////////
+            ////////////////////////////////////////////////////////////////////////
+            typedef struct
+            {
+                int   cndisp;
+                float cmax_data_term;
+                float cdata_weight;
+                float cmax_disc_term;
+                float cdisc_single_jump;
+            } con_struct_t;
+
+            cl_mem cl_con_struct =  NULL;
+            static void load_constants(int ndisp, float max_data_term, float data_weight,
+                                float max_disc_term, float disc_single_jump)
+            {
+                con_struct_t *con_struct = new con_struct_t;
+                con_struct -> cndisp            = ndisp;
+                con_struct -> cmax_data_term    = max_data_term;
+                con_struct -> cdata_weight      = data_weight;
+                con_struct -> cmax_disc_term    = max_disc_term;
+                con_struct -> cdisc_single_jump = disc_single_jump;
+
+                cl_con_struct = load_constant(*((cl_context*)getoclContext()), *((cl_command_queue*)getoclCommandQueue()), (void *)con_struct,
+                                              sizeof(con_struct_t));
+
+                delete con_struct;
+            }
+            static void release_constants()
+            {
+                openCLFree(cl_con_struct);
+            }
+            static inline int divUp(int total, int grain)
+            {
+                return (total + grain - 1) / grain;
+            }
+            /////////////////////////////////////////////////////////////////////////////
+            ///////////////////////////comp data////////////////////////////////////////
+            /////////////////////////////////////////////////////////////////////////
+            static void  comp_data_call(const oclMat &left, const oclMat &right, oclMat &data, int /*disp*/,
+                float /*cmax_data_term*/, float /*cdata_weight*/)
+            {
+                Context  *clCxt = left.clCxt;
+                int channels = left.oclchannels();
+                int data_type = data.type();
+
+                String kernelName = "comp_data";
+
+                std::vector<std::pair<size_t , const void *> > args;
+
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&left.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&left.rows));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&left.cols));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&left.step));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&right.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&right.step));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&data.step));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&cl_con_struct));
+
+                size_t gt[3] = {left.cols, left.rows, 1}, lt[3] = {16, 16, 1};
+
+                const int OPT_SIZE = 50;
+                char cn_opt [OPT_SIZE] = "";
+                sprintf( cn_opt, "%s -D CN=%d",
+                    (data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT"),
+                    channels
+                    );
+                openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, cn_opt);
+            }
+            ///////////////////////////////////////////////////////////////////////////////////
+            /////////////////////////data set down////////////////////////////////////////////
+            /////////////////////////////////////////////////////////////////////////////////
+            static void data_step_down_call(int dst_cols, int dst_rows, int src_rows,
+                const oclMat &src, oclMat &dst, int disp)
+            {
+                Context  *clCxt = src.clCxt;
+                int data_type = src.type();
+
+                String kernelName = "data_step_down";
+
+                std::vector<std::pair<size_t , const void *> > args;
+
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_rows));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_rows));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_cols));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.step));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp));
+
+                size_t gt[3] = {dst_cols, dst_rows, 1}, lt[3] = {16, 16, 1};
+                const char* t_opt  = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT";
+                openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt);
+            }
+            /////////////////////////////////////////////////////////////////////////////////
+            ///////////////////////////live up message////////////////////////////////////////
+            /////////////////////////////////////////////////////////////////////////////////
+            static void level_up_message_call(int dst_cols, int dst_rows, int src_rows,
+                oclMat &src, oclMat &dst, int ndisp)
+            {
+                Context  *clCxt = src.clCxt;
+                int data_type = src.type();
+
+                String kernelName = "level_up_message";
+                std::vector<std::pair<size_t , const void *> > args;
+
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_rows));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_rows));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_cols));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.step));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&ndisp));
+
+                size_t gt[3] = {dst_cols, dst_rows, 1}, lt[3] = {16, 16, 1};
+                const char* t_opt  = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT";
+                openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt);
+            }
+            static void level_up_messages_calls(int dst_idx, int dst_cols, int dst_rows, int src_rows,
+                                         oclMat *mus, oclMat *mds, oclMat *mls, oclMat *mrs,
+                                         int ndisp)
+            {
+                int src_idx = (dst_idx + 1) & 1;
+
+                level_up_message_call(dst_cols, dst_rows, src_rows,
+                                      mus[src_idx], mus[dst_idx], ndisp);
+
+                level_up_message_call(dst_cols, dst_rows, src_rows,
+                                      mds[src_idx], mds[dst_idx], ndisp);
+
+                level_up_message_call(dst_cols, dst_rows, src_rows,
+                                      mls[src_idx], mls[dst_idx], ndisp);
+
+                level_up_message_call(dst_cols, dst_rows, src_rows,
+                                      mrs[src_idx], mrs[dst_idx], ndisp);
+            }
+            //////////////////////////////////////////////////////////////////////////////////
+            //////////////////////////////cals_all_iterations_call///////////////////////////
+            /////////////////////////////////////////////////////////////////////////////////
+            static void calc_all_iterations_call(int cols, int rows, oclMat &u, oclMat &d,
+                oclMat &l, oclMat &r, oclMat &data,
+                int t, int cndisp, float cmax_disc_term,
+                float cdisc_single_jump)
+            {
+                Context  *clCxt = l.clCxt;
+                int data_type = u.type();
+
+                String kernelName = "one_iteration";
+
+                std::vector<std::pair<size_t , const void *> > args;
+
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&u.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&u.step));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&data.step));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&d.data));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&l.data));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&r.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&rows));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cndisp));
+                args.push_back( std::make_pair( sizeof(cl_float) , (void *)&cmax_disc_term));
+                args.push_back( std::make_pair( sizeof(cl_float) , (void *)&cdisc_single_jump));
+
+                size_t gt[3] = {cols, rows, 1}, lt[3] = {16, 16, 1};
+                const char* t_opt  = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT";
+                openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt);
+            }
+
+            static void calc_all_iterations_calls(int cols, int rows, int iters, oclMat &u,
+                                           oclMat &d, oclMat &l, oclMat &r,
+                                           oclMat &data, int cndisp, float cmax_disc_term,
+                                           float cdisc_single_jump)
+            {
+                for(int t = 0; t < iters; ++t)
+                    calc_all_iterations_call(cols, rows, u, d, l, r, data, t, cndisp,
+                                             cmax_disc_term, cdisc_single_jump);
+            }
+            ///////////////////////////////////////////////////////////////////////////////
+            ///////////////////////output///////////////////////////////////////////////////
+            ////////////////////////////////////////////////////////////////////////////////
+            static void output_call(const oclMat &u, const oclMat &d, const oclMat l, const oclMat &r,
+                const oclMat &data, oclMat &disp, int ndisp)
+            {
+                Context  *clCxt = u.clCxt;
+                int data_type = u.type();
+
+                String kernelName = "output";
+
+                std::vector<std::pair<size_t , const void *> > args;
+
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&u.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&u.step));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&d.data));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&l.data));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&r.data));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data));
+                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&disp.data));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp.rows));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp.cols));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp.step));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&ndisp));
+
+                size_t gt[3] = {disp.cols, disp.rows, 1}, lt[3] = {16, 16, 1};
+                const char* t_opt  = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT";
+                openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt);
+            }
+        }
+    }
+}
+namespace
+{
+    const float DEFAULT_MAX_DATA_TERM = 10.0f;
+    const float DEFAULT_DATA_WEIGHT = 0.07f;
+    const float DEFAULT_MAX_DISC_TERM = 1.7f;
+    const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;
+}
+
+void cv::ocl::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels)
+{
+    ndisp = width / 4;
+    if ((ndisp & 1) != 0)
+        ndisp++;
+
+    int mm = ::max(width, height);
+    iters = mm / 100 + 2;
+
+    levels = (int)(::log(static_cast<double>(mm)) + 1) * 4 / 5;
+    if (levels == 0) levels++;
+}
+
+cv::ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, int msg_type_)
+    : ndisp(ndisp_), iters(iters_), levels(levels_),
+      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
+      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP),
+      msg_type(msg_type_), datas(levels_)
+{
+}
+
+cv::ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_, int msg_type_)
+    : ndisp(ndisp_), iters(iters_), levels(levels_),
+      max_data_term(max_data_term_), data_weight(data_weight_),
+      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_),
+      msg_type(msg_type_), datas(levels_)
+{
+}
+
+namespace
+{
+    class StereoBeliefPropagationImpl
+    {
+    public:
+        StereoBeliefPropagationImpl(StereoBeliefPropagation &rthis_,
+                                    oclMat &u_, oclMat &d_, oclMat &l_, oclMat &r_,
+                                    oclMat &u2_, oclMat &d2_, oclMat &l2_, oclMat &r2_,
+                                    std::vector<oclMat> &datas_, oclMat &out_)
+            : rthis(rthis_), u(u_), d(d_), l(l_), r(r_), u2(u2_), d2(d2_), l2(l2_), r2(r2_), datas(datas_), out(out_),
+              zero(Scalar::all(0)), scale(rthis_.msg_type == CV_32F ? 1.0f : 10.0f)
+        {
+            CV_Assert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels);
+            CV_Assert(rthis.msg_type == CV_32F || rthis.msg_type == CV_16S);
+            CV_Assert(rthis.msg_type == CV_32F || (1 << (rthis.levels - 1)) * scale * rthis.max_data_term < std::numeric_limits<short>::max());
+        }
+
+        void operator()(const oclMat &left, const oclMat &right, oclMat &disp)
+        {
+            CV_Assert(left.size() == right.size() && left.type() == right.type());
+            CV_Assert(left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4);
+
+            rows = left.rows;
+            cols = left.cols;
+
+            int divisor = (int)pow(2.f, rthis.levels - 1.0f);
+            int lowest_cols = cols / divisor;
+            int lowest_rows = rows / divisor;
+            const int min_image_dim_size = 2;
+            CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size);
+
+            init();
+
+            datas[0].create(rows * rthis.ndisp, cols, rthis.msg_type);
+            datas[0].setTo(Scalar_<short>::all(0));
+
+            cv::ocl::stereoBP::comp_data_call(left, right, datas[0], rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight);
+            calcBP(disp);
+        }
+
+        void operator()(const oclMat &data, oclMat &disp)
+        {
+            CV_Assert((data.type() == rthis.msg_type) && (data.rows % rthis.ndisp == 0));
+
+            rows = data.rows / rthis.ndisp;
+            cols = data.cols;
+
+            int divisor = (int)pow(2.f, rthis.levels - 1.0f);
+            int lowest_cols = cols / divisor;
+            int lowest_rows = rows / divisor;
+            const int min_image_dim_size = 2;
+            CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size);
+
+            init();
+
+            datas[0] = data;
+
+            calcBP(disp);
+        }
+    private:
+        void init()
+        {
+            u.create(rows * rthis.ndisp, cols, rthis.msg_type);
+            d.create(rows * rthis.ndisp, cols, rthis.msg_type);
+            l.create(rows * rthis.ndisp, cols, rthis.msg_type);
+            r.create(rows * rthis.ndisp, cols, rthis.msg_type);
+
+            if (rthis.levels & 1)
+            {
+                //can clear less area
+                u = zero;
+                d = zero;
+                l = zero;
+                r = zero;
+            }
+
+            if (rthis.levels > 1)
+            {
+                int less_rows = (rows + 1) / 2;
+                int less_cols = (cols + 1) / 2;
+
+                u2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
+                d2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
+                l2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
+                r2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
+
+                if ((rthis.levels & 1) == 0)
+                {
+                    u2 = zero;
+                    d2 = zero;
+                    l2 = zero;
+                    r2 = zero;
+                }
+            }
+
+            cv::ocl::stereoBP::load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight,
+                                              scale * rthis.max_disc_term, scale * rthis.disc_single_jump);
+
+            datas.resize(rthis.levels);
+            cols_all.resize(rthis.levels);
+            rows_all.resize(rthis.levels);
+
+            cols_all[0] = cols;
+            rows_all[0] = rows;
+        }
+
+        void calcBP(oclMat &disp)
+        {
+            using namespace cv::ocl::stereoBP;
+
+            for (int i = 1; i < rthis.levels; ++i)
+            {
+                cols_all[i] = (cols_all[i - 1] + 1) / 2;
+                rows_all[i] = (rows_all[i - 1] + 1) / 2;
+
+                datas[i].create(rows_all[i] * rthis.ndisp, cols_all[i], rthis.msg_type);
+                datas[i].setTo(Scalar_<short>::all(0));
+
+                data_step_down_call(cols_all[i], rows_all[i], rows_all[i - 1],
+                                    datas[i - 1], datas[i], rthis.ndisp);
+            }
+
+            oclMat mus[] = {u, u2};
+            oclMat mds[] = {d, d2};
+            oclMat mrs[] = {r, r2};
+            oclMat mls[] = {l, l2};
+
+            int mem_idx = (rthis.levels & 1) ? 0 : 1;
+
+            for (int i = rthis.levels - 1; i >= 0; --i)
+            {
+                // for lower level we have already computed messages by setting to zero
+                if (i != rthis.levels - 1)
+                    level_up_messages_calls(mem_idx, cols_all[i], rows_all[i], rows_all[i + 1],
+                                            mus, mds, mls, mrs, rthis.ndisp);
+
+                calc_all_iterations_calls(cols_all[i], rows_all[i], rthis.iters, mus[mem_idx],
+                                          mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i],
+                                          rthis.ndisp, scale * rthis.max_disc_term,
+                                          scale * rthis.disc_single_jump);
+
+                mem_idx = (mem_idx + 1) & 1;
+            }
+            if (disp.empty())
+                disp.create(rows, cols, CV_16S);
+
+            out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
+            out = zero;
+
+            output_call(u, d, l, r, datas.front(), out, rthis.ndisp);
+
+            if (disp.type() != CV_16S)
+                out.convertTo(disp, disp.type());
+
+            release_constants();
+        }
+        StereoBeliefPropagationImpl& operator=(const StereoBeliefPropagationImpl&);
+
+        StereoBeliefPropagation &rthis;
+
+        oclMat &u;
+        oclMat &d;
+        oclMat &l;
+        oclMat &r;
+
+        oclMat &u2;
+        oclMat &d2;
+        oclMat &l2;
+        oclMat &r2;
+
+        std::vector<oclMat> &datas;
+        oclMat &out;
+
+        const Scalar zero;
+        const float scale;
+
+        int rows, cols;
+
+        std::vector<int> cols_all, rows_all;
+    };
+}
+
+void cv::ocl::StereoBeliefPropagation::operator()(const oclMat &left, const oclMat &right, oclMat &disp)
+{
+    ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
+    impl(left, right, disp);
+}
+
+void cv::ocl::StereoBeliefPropagation::operator()(const oclMat &data, oclMat &disp)
+{
+    ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
+    impl(data, disp);
+}
diff --git a/modules/ocl/test/test_calib3d.cpp b/modules/ocl/test/test_calib3d.cpp
index b429625829..b556e5a3c9 100644
--- a/modules/ocl/test/test_calib3d.cpp
+++ b/modules/ocl/test/test_calib3d.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-
+//     Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -63,12 +63,12 @@ PARAM_TEST_CASE(StereoMatchBM, int, int)
     }
 };
 
-TEST_P(StereoMatchBM, Accuracy)
-        {
+TEST_P(StereoMatchBM, Regression)
+{
 
-    Mat left_image  = readImage(workdir + "../ocl/aloe-L.png", IMREAD_GRAYSCALE);
-    Mat right_image = readImage(workdir + "../ocl/aloe-R.png", IMREAD_GRAYSCALE);
-    Mat disp_gold   = readImage(workdir + "../ocl/aloe-disp.png", IMREAD_GRAYSCALE);
+    Mat left_image  = readImage("stereobm/aloe-L.png", IMREAD_GRAYSCALE);
+    Mat right_image = readImage("stereobm/aloe-R.png", IMREAD_GRAYSCALE);
+    Mat disp_gold   = readImage("stereobm/aloe-disp.png", IMREAD_GRAYSCALE);
     ocl::oclMat d_left, d_right;
     ocl::oclMat d_disp(left_image.size(), CV_8U);
     Mat  disp;
@@ -88,7 +88,50 @@ TEST_P(StereoMatchBM, Accuracy)
     EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-3);
 }
 
-INSTANTIATE_TEST_CASE_P(GPU_Calib3D, StereoMatchBM, testing::Combine(testing::Values(128),
+INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBM, testing::Combine(testing::Values(128),
                                        testing::Values(19)));
 
+PARAM_TEST_CASE(StereoMatchBP, int, int, int, float, float, float, float)
+{
+    int ndisp_;
+    int iters_;
+    int levels_;
+    float max_data_term_;
+    float data_weight_;
+    float max_disc_term_;
+    float disc_single_jump_;
+    virtual void SetUp()
+    {
+        ndisp_          = GET_PARAM(0);
+        iters_          = GET_PARAM(1);
+        levels_         = GET_PARAM(2);
+        max_data_term_  = GET_PARAM(3);
+        data_weight_    = GET_PARAM(4);
+        max_disc_term_     = GET_PARAM(5);
+        disc_single_jump_  = GET_PARAM(6);
+    }
+};
+TEST_P(StereoMatchBP, Regression)
+{
+    Mat left_image  = readImage("stereobp/aloe-L.png");
+    Mat right_image = readImage("stereobp/aloe-R.png");
+    Mat disp_gold   = readImage("stereobp/aloe-disp.png", IMREAD_GRAYSCALE);
+    ocl::oclMat d_left, d_right;
+    ocl::oclMat d_disp;
+    Mat  disp;
+    ASSERT_FALSE(left_image.empty());
+    ASSERT_FALSE(right_image.empty());
+    ASSERT_FALSE(disp_gold.empty());
+    d_left.upload(left_image);
+    d_right.upload(right_image);
+    ocl::StereoBeliefPropagation bp(ndisp_, iters_, levels_, max_data_term_, data_weight_,
+        max_disc_term_, disc_single_jump_, CV_16S);
+    bp(d_left, d_right, d_disp);
+    d_disp.download(disp);
+    disp.convertTo(disp, disp_gold.depth());
+    EXPECT_MAT_NEAR(disp_gold, disp, 0.0, "");
+}
+INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBP, testing::Combine(testing::Values(64),
+    testing::Values(8),testing::Values(2),testing::Values(25.0f),
+    testing::Values(0.1f),testing::Values(15.0f),testing::Values(1.0f)));
 #endif // HAVE_OPENCL
diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp
index 47ba05388b..851309c033 100644
--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@@ -975,7 +975,7 @@ static inline PyObject* pyopencv_from(const Moments& m)
                          "mu20", m.mu20, "mu11", m.mu11, "mu02", m.mu02,
                          "mu30", m.mu30, "mu21", m.mu21, "mu12", m.mu12, "mu03", m.mu03,
                          "nu20", m.nu20, "nu11", m.nu11, "nu02", m.nu02,
-                         "nu30", m.nu30, "nu21", m.nu21, "nu12", m.nu12, "mu03", m.nu03);
+                         "nu30", m.nu30, "nu21", m.nu21, "nu12", m.nu12, "nu03", m.nu03);
 }
 
 static inline PyObject* pyopencv_from(const CvDTreeNode* node)
diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
index 5a6393a69b..693363d59f 100644
--- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
@@ -458,7 +458,7 @@ struct SphericalPortraitProjector : ProjectorBase
 
 // Projects image onto unit sphere with origin at (0, 0, 0).
 // Poles are located NOT at (0, -1, 0) and (0, 1, 0) points, BUT at (1, 0, 0) and (-1, 0, 0) points.
-class SphericalPortraitWarper : public RotationWarperBase<SphericalPortraitProjector>
+class CV_EXPORTS SphericalPortraitWarper : public RotationWarperBase<SphericalPortraitProjector>
 {
 public:
     SphericalPortraitWarper(float scale) { projector_.scale = scale; }
@@ -474,7 +474,7 @@ struct CylindricalPortraitProjector : ProjectorBase
 };
 
 
-class CylindricalPortraitWarper : public RotationWarperBase<CylindricalPortraitProjector>
+class CV_EXPORTS CylindricalPortraitWarper : public RotationWarperBase<CylindricalPortraitProjector>
 {
 public:
     CylindricalPortraitWarper(float scale) { projector_.scale = scale; }
@@ -493,7 +493,7 @@ struct PlanePortraitProjector : ProjectorBase
 };
 
 
-class PlanePortraitWarper : public RotationWarperBase<PlanePortraitProjector>
+class CV_EXPORTS PlanePortraitWarper : public RotationWarperBase<PlanePortraitProjector>
 {
 public:
     PlanePortraitWarper(float scale) { projector_.scale = scale; }
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index 013639fb4e..b564193eb3 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -347,8 +347,15 @@ SurfFeaturesFinder::SurfFeaturesFinder(double hess_thresh, int num_octaves, int
 void SurfFeaturesFinder::find(const Mat &image, ImageFeatures &features)
 {
     Mat gray_image;
-    CV_Assert(image.type() == CV_8UC3);
-    cvtColor(image, gray_image, CV_BGR2GRAY);
+    CV_Assert((image.type() == CV_8UC3) || (image.type() == CV_8UC1));
+    if(image.type() == CV_8UC3)
+    {
+        cvtColor(image, gray_image, CV_BGR2GRAY);
+    }
+    else
+    {
+        gray_image = image;
+    }
     if (surf.empty())
     {
         detector_->detect(gray_image, features.keypoints);
diff --git a/modules/superres/CMakeLists.txt b/modules/superres/CMakeLists.txt
index 5e82629ae8..92ce01c2d2 100644
--- a/modules/superres/CMakeLists.txt
+++ b/modules/superres/CMakeLists.txt
@@ -9,8 +9,6 @@ ocv_module_include_directories()
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef /wd4127)
 
 if(HAVE_CUDA)
-  string(REPLACE "-Wsign-promo" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-
   ocv_source_group("Src\\Cuda" GLOB "src/cuda/*.cu")
   ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include" ${CUDA_INCLUDE_DIRS})
 
diff --git a/modules/video/src/tvl1flow.cpp b/modules/video/src/tvl1flow.cpp
index 7aa7debd45..d6d1ddd5d4 100644
--- a/modules/video/src/tvl1flow.cpp
+++ b/modules/video/src/tvl1flow.cpp
@@ -168,13 +168,12 @@ void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray
     I0.convertTo(I0s[0], I0s[0].depth(), I0.depth() == CV_8U ? 1.0 : 255.0);
     I1.convertTo(I1s[0], I1s[0].depth(), I1.depth() == CV_8U ? 1.0 : 255.0);
 
+    u1s[0].create(I0.size());
+    u2s[0].create(I0.size());
+
     if (useInitialFlow)
     {
-        u1s[0].create(I0.size());
-        u2s[0].create(I0.size());
-
         Mat_<float> mv[] = {u1s[0], u2s[0]};
-
         split(_flow.getMat(), mv);
     }
 
@@ -227,6 +226,17 @@ void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray
             multiply(u1s[s], Scalar::all(0.5), u1s[s]);
             multiply(u2s[s], Scalar::all(0.5), u2s[s]);
         }
+        else
+        {
+            u1s[s].create(I0s[s].size());
+            u2s[s].create(I0s[s].size());
+        }
+    }
+
+    if (!useInitialFlow)
+    {
+        u1s[nscales-1].setTo(Scalar::all(0));
+        u2s[nscales-1].setTo(Scalar::all(0));
     }
 
     // pyramidal structure for computing the optical flow
@@ -792,18 +802,9 @@ void OpticalFlowDual_TVL1::procOneScale(const Mat_<float>& I0, const Mat_<float>
 
     CV_DbgAssert( I1.size() == I0.size() );
     CV_DbgAssert( I1.type() == I0.type() );
-    CV_DbgAssert( u1.empty() || u1.size() == I0.size() );
+    CV_DbgAssert( u1.size() == I0.size() );
     CV_DbgAssert( u2.size() == u1.size() );
 
-    if (u1.empty())
-    {
-        u1.create(I0.size());
-        u1.setTo(Scalar::all(0));
-
-        u2.create(I0.size());
-        u2.setTo(Scalar::all(0));
-    }
-
     Mat_<float> I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
     Mat_<float> I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
     centeredGradient(I1, I1x, I1y);
diff --git a/platforms/winrt/arm.winrt.toolchain.cmake b/platforms/winrt/arm.winrt.toolchain.cmake
new file mode 100644
index 0000000000..b34056cd5e
--- /dev/null
+++ b/platforms/winrt/arm.winrt.toolchain.cmake
@@ -0,0 +1,6 @@
+set(CMAKE_SYSTEM_NAME Windows)
+set(CMAKE_SYSTEM_PROCESSOR "arm-v7a")
+
+set(CMAKE_FIND_ROOT_PATH "${CMAKE_SOURCE_DIR}/platforms/winrt")
+set(CMAKE_REQUIRED_DEFINITIONS -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE)
+add_definitions(-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE)
\ No newline at end of file
diff --git a/platforms/winrt/scripts/cmake_winrt.cmd b/platforms/winrt/scripts/cmake_winrt.cmd
new file mode 100644
index 0000000000..aafed7d09d
--- /dev/null
+++ b/platforms/winrt/scripts/cmake_winrt.cmd
@@ -0,0 +1,6 @@
+mkdir build
+cd build
+
+rem call "C:\Program Files\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+
+cmake.exe -GNinja -DCMAKE_BUILD_TYPE=Release -DWITH_FFMPEG=OFF -DBUILD_opencv_gpu=OFF -DBUILD_opencv_python=OFF -DCMAKE_TOOLCHAIN_FILE=..\..\winrt\arm.winrt.toolchain.cmake ..\..\..
diff --git a/samples/android/native-activity/.classpath b/samples/android/native-activity/.classpath
new file mode 100644
index 0000000000..3f9691c5dd
--- /dev/null
+++ b/samples/android/native-activity/.classpath
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="con" path="com.android.ide.eclipse.adt.ANDROID_FRAMEWORK"/>
+	<classpathentry kind="con" path="com.android.ide.eclipse.adt.LIBRARIES"/>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="src" path="gen"/>
+	<classpathentry kind="output" path="bin/classes"/>
+</classpath>
diff --git a/samples/android/native-activity/.cproject b/samples/android/native-activity/.cproject
new file mode 100644
index 0000000000..09687f3ac0
--- /dev/null
+++ b/samples/android/native-activity/.cproject
@@ -0,0 +1,75 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?>
+
+<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+	<storageModule moduleId="org.eclipse.cdt.core.settings">
+		<cconfiguration id="0.129633445">
+			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="0.129633445" moduleId="org.eclipse.cdt.core.settings" name="Default">
+				<externalSettings/>
+				<extensions>
+					<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+				</extensions>
+			</storageModule>
+			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+				<configuration artifactName="${ProjName}" buildProperties="" description="" id="0.129633445" name="Default" parent="org.eclipse.cdt.build.core.prefbase.cfg">
+					<folderInfo id="0.129633445." name="/" resourcePath="">
+						<toolChain id="org.eclipse.cdt.build.core.prefbase.toolchain.2006441180" name="No ToolChain" resourceTypeBasedDiscovery="false" superClass="org.eclipse.cdt.build.core.prefbase.toolchain">
+							<targetPlatform id="org.eclipse.cdt.build.core.prefbase.toolchain.2006441180.527973180" name=""/>
+							<builder autoBuildTarget="" command="${NDKROOT}/ndk-build.cmd" enableAutoBuild="true" enableCleanBuild="false" id="org.eclipse.cdt.build.core.settings.default.builder.180541221" incrementalBuildTarget="" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="org.eclipse.cdt.build.core.settings.default.builder"/>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.libs.791069665" name="holder for library settings" superClass="org.eclipse.cdt.build.core.settings.holder.libs"/>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.1894181736" name="Assembly" superClass="org.eclipse.cdt.build.core.settings.holder">
+								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.588929884" languageId="org.eclipse.cdt.core.assembly" languageName="Assembly" sourceContentType="org.eclipse.cdt.core.asmSource" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
+							</tool>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.303359177" name="GNU C++" superClass="org.eclipse.cdt.build.core.settings.holder">
+								<option id="org.eclipse.cdt.build.core.settings.holder.incpaths.373249505" name="Include Paths" superClass="org.eclipse.cdt.build.core.settings.holder.incpaths" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/platforms/android-9/arch-arm/usr/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${ProjDirPath}/../../sdk/native/jni/include&quot;"/>
+								</option>
+								<option id="org.eclipse.cdt.build.core.settings.holder.symbols.1424359063" name="Symbols" superClass="org.eclipse.cdt.build.core.settings.holder.symbols" valueType="definedSymbols">
+									<listOptionValue builtIn="false" value="ANDROID=1"/>
+								</option>
+								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.360067880" languageId="org.eclipse.cdt.core.g++" languageName="GNU C++" sourceContentType="org.eclipse.cdt.core.cxxSource,org.eclipse.cdt.core.cxxHeader" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
+							</tool>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.1156172258" name="GNU C" superClass="org.eclipse.cdt.build.core.settings.holder">
+								<option id="org.eclipse.cdt.build.core.settings.holder.incpaths.149918263" name="Include Paths" superClass="org.eclipse.cdt.build.core.settings.holder.incpaths" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/platforms/android-9/arch-arm/usr/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${ProjDirPath}/../../sdk/native/jni/include&quot;"/>
+								</option>
+								<option id="org.eclipse.cdt.build.core.settings.holder.symbols.719752707" name="Symbols" superClass="org.eclipse.cdt.build.core.settings.holder.symbols" valueType="definedSymbols">
+									<listOptionValue builtIn="false" value="ANDROID=1"/>
+								</option>
+								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.232493949" languageId="org.eclipse.cdt.core.gcc" languageName="GNU C" sourceContentType="org.eclipse.cdt.core.cSource,org.eclipse.cdt.core.cHeader" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
+							</tool>
+						</toolChain>
+					</folderInfo>
+					<sourceEntries>
+						<entry flags="VALUE_WORKSPACE_PATH" kind="sourcePath" name="jni"/>
+					</sourceEntries>
+				</configuration>
+			</storageModule>
+			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+		</cconfiguration>
+	</storageModule>
+	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+		<project id="OpenCV Sample - face-detection.null.1639518055" name="OpenCV Sample - face-detection"/>
+	</storageModule>
+	<storageModule moduleId="scannerConfiguration">
+		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		<scannerConfigBuildInfo instanceId="0.129633445">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+	</storageModule>
+	<storageModule moduleId="refreshScope" versionNumber="1">
+		<resource resourceType="PROJECT" workspacePath="/OpenCV Sample - face-detection"/>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
+</cproject>
diff --git a/samples/android/native-activity/.project b/samples/android/native-activity/.project
new file mode 100644
index 0000000000..cf0823c0b3
--- /dev/null
+++ b/samples/android/native-activity/.project
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>OpenCV Sample - native-activity</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>com.android.ide.eclipse.adt.ResourceManagerBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>com.android.ide.eclipse.adt.PreCompilerBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>com.android.ide.eclipse.adt.ApkBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>com.android.ide.eclipse.adt.AndroidNature</nature>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/samples/android/native-activity/.settings/org.eclipse.jdt.core.prefs b/samples/android/native-activity/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000000..b080d2ddc8
--- /dev/null
+++ b/samples/android/native-activity/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,4 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.source=1.6
diff --git a/samples/android/native-activity/CMakeLists.txt b/samples/android/native-activity/CMakeLists.txt
new file mode 100644
index 0000000000..1f6d97439a
--- /dev/null
+++ b/samples/android/native-activity/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(sample example-native-activity)
+
+if(BUILD_FAT_JAVA_LIB)
+  set(native_deps opencv_java)
+else()
+  set(native_deps opencv_highgui opencv_imgproc)
+endif()
+
+add_android_project(${sample} "${CMAKE_CURRENT_SOURCE_DIR}" LIBRARY_DEPS ${OpenCV_BINARY_DIR} SDK_TARGET 9 ${ANDROID_SDK_TARGET} NATIVE_DEPS ${native_deps})
+if(TARGET ${sample})
+  add_dependencies(opencv_android_examples ${sample})
+endif()
diff --git a/samples/android/native-activity/jni/native.cpp b/samples/android/native-activity/jni/native.cpp
index 38dda06038..66bc006db1 100644
--- a/samples/android/native-activity/jni/native.cpp
+++ b/samples/android/native-activity/jni/native.cpp
@@ -28,7 +28,7 @@ struct Engine
     cv::Ptr<cv::VideoCapture> capture;
 };
 
-cv::Size calc_optimal_camera_resolution(const char* supported, int width, int height)
+static cv::Size calc_optimal_camera_resolution(const char* supported, int width, int height)
 {
     int frame_width = 0;
     int frame_height = 0;
diff --git a/samples/android/tutorial-2-mixedprocessing/jni/jni_part.cpp b/samples/android/tutorial-2-mixedprocessing/jni/jni_part.cpp
index e7ed75d213..f8e3ada726 100644
--- a/samples/android/tutorial-2-mixedprocessing/jni/jni_part.cpp
+++ b/samples/android/tutorial-2-mixedprocessing/jni/jni_part.cpp
@@ -8,7 +8,7 @@ using namespace std;
 using namespace cv;
 
 extern "C" {
-JNIEXPORT void JNICALL Java_org_opencv_samples_tutorial2_Tuturial2Activity_FindFeatures(JNIEnv*, jobject, jlong addrGray, jlong addrRgba);
+JNIEXPORT void JNICALL Java_org_opencv_samples_tutorial2_Tutorial2Activity_FindFeatures(JNIEnv*, jobject, jlong addrGray, jlong addrRgba);
 
 JNIEXPORT void JNICALL Java_org_opencv_samples_tutorial2_Tutorial2Activity_FindFeatures(JNIEnv*, jobject, jlong addrGray, jlong addrRgba)
 {
diff --git a/samples/cpp/Qt_sample/main.cpp b/samples/cpp/Qt_sample/main.cpp
index b43473310e..e40d72bd4e 100644
--- a/samples/cpp/Qt_sample/main.cpp
+++ b/samples/cpp/Qt_sample/main.cpp
@@ -33,7 +33,7 @@ static void help()
             "It works off of the video: cube4.avi\n"
             "Using OpenCV version %s\n" << CV_VERSION << "\n\n"
 " 1). This demo is mainly based on work from Javier Barandiaran Martirena\n"
-"     See this page http://opencv.willowgarage.com/wiki/Posit.\n"
+"     See this page http://code.opencv.org/projects/opencv/wiki/Posit.\n"
 " 2). This is a demo to illustrate how to use **OpenGL Callback**.\n"
 " 3). You need Qt binding to compile this sample with OpenGL support enabled.\n"
 " 4). The features' detection is very basic and could highly be improved \n"
diff --git a/samples/cpp/stereo_calib.cpp b/samples/cpp/stereo_calib.cpp
index a6d276f8b4..07621cef8c 100644
--- a/samples/cpp/stereo_calib.cpp
+++ b/samples/cpp/stereo_calib.cpp
@@ -22,7 +22,7 @@
    * An active user group is at:
      http://tech.groups.yahoo.com/group/OpenCV/
    * The minutes of weekly OpenCV development meetings are at:
-     http://pr.willowgarage.com/wiki/OpenCV
+     http://code.opencv.org/projects/opencv/wiki/Meeting_notes
    ************************************************** */
 
 #include "opencv2/calib3d/calib3d.hpp"
diff --git a/samples/gpu/super_resolution.cpp b/samples/gpu/super_resolution.cpp
index bdd4b43c54..dca9e8b017 100644
--- a/samples/gpu/super_resolution.cpp
+++ b/samples/gpu/super_resolution.cpp
@@ -48,8 +48,7 @@ static Ptr<DenseOpticalFlowExt> createOptFlow(const string& name, bool useGpu)
     {
         cerr << "Incorrect Optical Flow algorithm - " << name << endl;
     }
-
-    return Ptr<DenseOpticalFlowExt>();
+    return 0;
 }
 
 int main(int argc, const char* argv[])
diff --git a/samples/ocl/performance.cpp b/samples/ocl/performance.cpp
deleted file mode 100644
index 46f71a429d..0000000000
--- a/samples/ocl/performance.cpp
+++ /dev/null
@@ -1,4397 +0,0 @@
-#include <iomanip>
-#include <stdexcept>
-#include <string>
-#include <iostream>
-#include <cstdio>
-#include <vector>
-#include <numeric>
-#include <opencv2/core/utility.hpp>
-#include "opencv2/imgproc.hpp"
-#include "opencv2/highgui.hpp"
-#include "opencv2/calib3d.hpp"
-#include "opencv2/video.hpp"
-#include "opencv2/nonfree.hpp"
-#include "opencv2/objdetect.hpp"
-#include "opencv2/features2d.hpp"
-#define USE_OPENCL
-#ifdef USE_OPENCL
-#include "opencv2/ocl.hpp"
-#include "opencv2/nonfree/ocl.hpp"
-#endif
-
-#define TAB "    "
-
-using namespace std;
-using namespace cv;
-
-// This program test most of the functions in ocl module and generate data metrix of x-factor in .csv files
-// All images needed in this test are in samples/gpu folder.
-// For haar template, haarcascade_frontalface_alt.xml shouold be in working directory
-
-void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high);
-string abspath(const string &relpath);
-int CV_CDECL cvErrorCallback(int, const char *, const char *, const char *, int, void *);
-typedef struct
-{
-    short x;
-    short y;
-} COOR;
-COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep,
-                  cv::Size size, int sp, int sr, int maxIter, float eps, int *tab);
-void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi,
-                    int sp, int sr, cv::TermCriteria crit);
-
-class Runnable
-{
-public:
-    explicit Runnable(const std::string &runname): name_(runname) {}
-    virtual ~Runnable() {}
-
-    const std::string &name() const
-    {
-        return name_;
-    }
-
-    virtual void run() = 0;
-
-private:
-    std::string name_;
-};
-
-class TestSystem
-{
-public:
-    static TestSystem &instance()
-    {
-        static TestSystem me;
-        return me;
-    }
-
-    void setWorkingDir(const std::string &val)
-    {
-        working_dir_ = val;
-    }
-    const std::string &workingDir() const
-    {
-        return working_dir_;
-    }
-
-    void setTestFilter(const std::string &val)
-    {
-        test_filter_ = val;
-    }
-    const std::string &testFilter() const
-    {
-        return test_filter_;
-    }
-
-    void setNumIters(int num_iters)
-    {
-        num_iters_ = num_iters;
-    }
-    void setGPUWarmupIters(int num_iters)
-    {
-        gpu_warmup_iters_ = num_iters;
-    }
-    void setCPUIters(int num_iters)
-    {
-        cpu_num_iters_ = num_iters;
-    }
-
-    void setTopThreshold(double top)
-    {
-        top_ = top;
-    }
-    void setBottomThreshold(double bottom)
-    {
-        bottom_ = bottom;
-    }
-
-    void addInit(Runnable *init)
-    {
-        inits_.push_back(init);
-    }
-    void addTest(Runnable *test)
-    {
-        tests_.push_back(test);
-    }
-    void run();
-
-    // It's public because OpenCV callback uses it
-    void printError(const std::string &msg);
-
-    std::stringstream &startNewSubtest()
-    {
-        finishCurrentSubtest();
-        return cur_subtest_description_;
-    }
-
-    bool stop() const
-    {
-        return cur_iter_idx_ >= num_iters_;
-    }
-
-    bool cpu_stop() const
-    {
-        return cur_iter_idx_ >= cpu_num_iters_;
-    }
-
-    bool warmupStop()
-    {
-        return cur_warmup_idx_++ >= gpu_warmup_iters_;
-    }
-
-    void warmupComplete()
-    {
-        cur_warmup_idx_ = 0;
-    }
-
-    void cpuOn()
-    {
-        cpu_started_ = cv::getTickCount();
-    }
-    void cpuOff()
-    {
-        int64 delta = cv::getTickCount() - cpu_started_;
-        cpu_times_.push_back(delta);
-        ++cur_iter_idx_;
-    }
-    void cpuComplete()
-    {
-        cpu_elapsed_ += meanTime(cpu_times_);
-        cur_subtest_is_empty_ = false;
-        cur_iter_idx_ = 0;
-    }
-
-    void gpuOn()
-    {
-        gpu_started_ = cv::getTickCount();
-    }
-    void gpuOff()
-    {
-        int64 delta = cv::getTickCount() - gpu_started_;
-        gpu_times_.push_back(delta);
-        ++cur_iter_idx_;
-    }
-    void gpuComplete()
-    {
-        gpu_elapsed_ += meanTime(gpu_times_);
-        cur_subtest_is_empty_ = false;
-        cur_iter_idx_ = 0;
-    }
-
-    void gpufullOn()
-    {
-        gpu_full_started_ = cv::getTickCount();
-    }
-    void gpufullOff()
-    {
-        int64 delta = cv::getTickCount() - gpu_full_started_;
-        gpu_full_times_.push_back(delta);
-        ++cur_iter_idx_;
-    }
-    void gpufullComplete()
-    {
-        gpu_full_elapsed_ += meanTime(gpu_full_times_);
-        cur_subtest_is_empty_ = false;
-        cur_iter_idx_ = 0;
-    }
-
-    bool isListMode() const
-    {
-        return is_list_mode_;
-    }
-    void setListMode(bool value)
-    {
-        is_list_mode_ = value;
-    }
-
-    void setRecordName(const std::string &name)
-    {
-        recordname_ = name;
-    }
-
-    void setCurrentTest(const std::string &name)
-    {
-        itname_ = name;
-        itname_changed_ = true;
-    }
-
-private:
-    TestSystem():
-        cur_subtest_is_empty_(true), cpu_elapsed_(0),
-        gpu_elapsed_(0), gpu_full_elapsed_(0), speedup_total_(0.0),
-        num_subtests_called_(0),
-        speedup_faster_count_(0), speedup_slower_count_(0), speedup_equal_count_(0),
-        speedup_full_faster_count_(0), speedup_full_slower_count_(0), speedup_full_equal_count_(0), is_list_mode_(false),
-        num_iters_(10), cpu_num_iters_(2),
-        gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
-        record_(0), recordname_("performance"), itname_changed_(true)
-    {
-        cpu_times_.reserve(num_iters_);
-        gpu_times_.reserve(num_iters_);
-        gpu_full_times_.reserve(num_iters_);
-    }
-
-    void finishCurrentSubtest();
-    void resetCurrentSubtest()
-    {
-        cpu_elapsed_ = 0;
-        gpu_elapsed_ = 0;
-        gpu_full_elapsed_ = 0;
-        cur_subtest_description_.str("");
-        cur_subtest_is_empty_ = true;
-        cur_iter_idx_ = 0;
-        cpu_times_.clear();
-        gpu_times_.clear();
-        gpu_full_times_.clear();
-    }
-
-    double meanTime(const std::vector<int64> &samples);
-
-    void printHeading();
-    void printSummary();
-    void printMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup);
-
-    void writeHeading();
-    void writeSummary();
-    void writeMetrics(double cpu_time, double gpu_time, double gpu_full_time,
-                      double speedup, double fullspeedup,
-                      double gpu_min, double gpu_max, double std_dev);
-
-    std::string working_dir_;
-    std::string test_filter_;
-
-    std::vector<Runnable *> inits_;
-    std::vector<Runnable *> tests_;
-
-    std::stringstream cur_subtest_description_;
-    bool cur_subtest_is_empty_;
-
-    int64 cpu_started_;
-    int64 gpu_started_;
-    int64 gpu_full_started_;
-    double cpu_elapsed_;
-    double gpu_elapsed_;
-    double gpu_full_elapsed_;
-
-    double speedup_total_;
-    double speedup_full_total_;
-    int num_subtests_called_;
-
-    int speedup_faster_count_;
-    int speedup_slower_count_;
-    int speedup_equal_count_;
-
-    int speedup_full_faster_count_;
-    int speedup_full_slower_count_;
-    int speedup_full_equal_count_;
-
-    bool is_list_mode_;
-
-    double top_;
-    double bottom_;
-
-    int num_iters_;
-    int cpu_num_iters_;		//there's no need to set cpu running same times with gpu
-    int gpu_warmup_iters_;	//gpu warm up times, default is 1
-    int cur_iter_idx_;
-    int cur_warmup_idx_;	//current gpu warm up times
-    std::vector<int64> cpu_times_;
-    std::vector<int64> gpu_times_;
-    std::vector<int64> gpu_full_times_;
-
-    FILE *record_;
-    std::string recordname_;
-    std::string itname_;
-    bool itname_changed_;
-};
-
-
-#define GLOBAL_INIT(name) \
-    struct name##_init: Runnable { \
-        name##_init(): Runnable(#name) { \
-            TestSystem::instance().addInit(this); \
-        } \
-        void run(); \
-    } name##_init_instance; \
-    void name##_init::run()
-
-
-#define TEST(name) \
-    struct name##_test: Runnable { \
-        name##_test(): Runnable(#name) { \
-            TestSystem::instance().addTest(this); \
-        } \
-        void run(); \
-    } name##_test_instance; \
-    void name##_test::run()
-
-#define SUBTEST TestSystem::instance().startNewSubtest()
-
-#define CPU_ON \
-    while (!TestSystem::instance().cpu_stop()) { \
-        TestSystem::instance().cpuOn()
-#define CPU_OFF \
-        TestSystem::instance().cpuOff(); \
-    } TestSystem::instance().cpuComplete()
-
-#define GPU_ON \
-    while (!TestSystem::instance().stop()) { \
-        TestSystem::instance().gpuOn()
-#define GPU_OFF \
-        TestSystem::instance().gpuOff(); \
-    } TestSystem::instance().gpuComplete()
-
-#define GPU_FULL_ON \
-    while (!TestSystem::instance().stop()) { \
-        TestSystem::instance().gpufullOn()
-#define GPU_FULL_OFF \
-        TestSystem::instance().gpufullOff(); \
-    } TestSystem::instance().gpufullComplete()
-
-#define WARMUP_ON \
-    while (!TestSystem::instance().warmupStop()) {
-#define WARMUP_OFF \
-    } TestSystem::instance().warmupComplete()
-
-void TestSystem::run()
-{
-    if (is_list_mode_)
-    {
-        for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
-        {
-            cout << (*it)->name() << endl;
-        }
-
-        return;
-    }
-
-    // Run test initializers
-    for (vector<Runnable *>::iterator it = inits_.begin(); it != inits_.end(); ++it)
-    {
-        if ((*it)->name().find(test_filter_, 0) != string::npos)
-        {
-            (*it)->run();
-        }
-    }
-
-    printHeading();
-    writeHeading();
-
-    // Run tests
-    for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
-    {
-        try
-        {
-            if ((*it)->name().find(test_filter_, 0) != string::npos)
-            {
-                cout << endl << (*it)->name() << ":\n";
-
-                setCurrentTest((*it)->name());
-                //fprintf(record_,"%s\n",(*it)->name().c_str());
-
-                (*it)->run();
-                finishCurrentSubtest();
-            }
-        }
-        catch (const Exception &)
-        {
-            // Message is printed via callback
-            resetCurrentSubtest();
-        }
-        catch (const runtime_error &e)
-        {
-            printError(e.what());
-            resetCurrentSubtest();
-        }
-    }
-
-#ifdef USE_OPENCL
-    printSummary();
-    writeSummary();
-#endif
-}
-
-
-void TestSystem::finishCurrentSubtest()
-{
-    if (cur_subtest_is_empty_)
-        // There is no need to print subtest statistics
-    {
-        return;
-    }
-
-    double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
-    double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
-    double gpu_full_time = gpu_full_elapsed_ / getTickFrequency() * 1000.0;
-
-    double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
-    speedup_total_ += speedup;
-
-    double fullspeedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_full_elapsed_);
-    speedup_full_total_ += fullspeedup;
-
-    if (speedup > top_)
-    {
-        speedup_faster_count_++;
-    }
-    else if (speedup < bottom_)
-    {
-        speedup_slower_count_++;
-    }
-    else
-    {
-        speedup_equal_count_++;
-    }
-
-    if (fullspeedup > top_)
-    {
-        speedup_full_faster_count_++;
-    }
-    else if (fullspeedup < bottom_)
-    {
-        speedup_full_slower_count_++;
-    }
-    else
-    {
-        speedup_full_equal_count_++;
-    }
-
-    // compute min, max and
-    std::sort(gpu_times_.begin(), gpu_times_.end());
-    double gpu_min = gpu_times_.front() / getTickFrequency() * 1000.0;
-    double gpu_max = gpu_times_.back() / getTickFrequency() * 1000.0;
-    double deviation = 0;
-
-    if (gpu_times_.size() > 1)
-    {
-        double sum = 0;
-
-        for (size_t i = 0; i < gpu_times_.size(); i++)
-        {
-            int64 diff = gpu_times_[i] - static_cast<int64>(gpu_elapsed_);
-            double diff_time = diff * 1000 / getTickFrequency();
-            sum += diff_time * diff_time;
-        }
-
-        deviation = std::sqrt(sum / gpu_times_.size());
-    }
-
-    printMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
-    writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
-
-    num_subtests_called_++;
-    resetCurrentSubtest();
-}
-
-
-double TestSystem::meanTime(const vector<int64> &samples)
-{
-    double sum = accumulate(samples.begin(), samples.end(), 0.);
-    return sum / samples.size();
-}
-
-
-void TestSystem::printHeading()
-{
-    cout << endl;
-    cout << setiosflags(ios_base::left);
-#ifdef USE_OPENCL
-    cout << TAB << setw(10) << "CPU, ms" << setw(10) << "GPU, ms"
-         << setw(14) << "SPEEDUP" << setw(14) << "GPUTOTAL, ms" << setw(14) << "TOTALSPEEDUP"
-         << "DESCRIPTION\n";
-#else
-    cout << TAB << setw(10) << "CPU, ms\n";
-#endif
-    cout << resetiosflags(ios_base::left);
-}
-
-void TestSystem::writeHeading()
-{
-    if (!record_)
-    {
-#ifdef USE_OPENCL
-        recordname_ += "_OCL.csv";
-#else
-        recordname_ += "_CPU.csv";
-#endif
-        record_ = fopen(recordname_.c_str(), "w");
-    }
-
-#ifdef USE_OPENCL
-    fprintf(record_, "NAME,DESCRIPTION,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
-#else
-    fprintf(record_, "NAME,DESCRIPTION,CPU (ms)\n");
-#endif
-    fflush(record_);
-}
-
-void TestSystem::printSummary()
-{
-    cout << setiosflags(ios_base::fixed);
-    cout << "\naverage GPU speedup: x"
-         << setprecision(3) << speedup_total_ / std::max(1, num_subtests_called_)
-         << endl;
-    cout << "\nGPU exceeded: "
-         << setprecision(3) << speedup_faster_count_
-         << "\nGPU passed: "
-         << setprecision(3) << speedup_equal_count_
-         << "\nGPU failed: "
-         << setprecision(3) << speedup_slower_count_
-         << endl;
-    cout << "\nGPU exceeded rate: "
-         << setprecision(3) << (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPU passed rate: "
-         << setprecision(3) << (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPU failed rate: "
-         << setprecision(3) << (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << endl;
-    cout << "\naverage GPUTOTAL speedup: x"
-         << setprecision(3) << speedup_full_total_ / std::max(1, num_subtests_called_)
-         << endl;
-    cout << "\nGPUTOTAL exceeded: "
-         << setprecision(3) << speedup_full_faster_count_
-         << "\nGPUTOTAL passed: "
-         << setprecision(3) << speedup_full_equal_count_
-         << "\nGPUTOTAL failed: "
-         << setprecision(3) << speedup_full_slower_count_
-         << endl;
-    cout << "\nGPUTOTAL exceeded rate: "
-         << setprecision(3) << (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPUTOTAL passed rate: "
-         << setprecision(3) << (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPUTOTAL failed rate: "
-         << setprecision(3) << (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << endl;
-    cout << resetiosflags(ios_base::fixed);
-}
-
-
-void TestSystem::printMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup)
-{
-    cout << TAB << setiosflags(ios_base::left);
-    stringstream stream;
-
-    stream << cpu_time;
-    cout << setw(10) << stream.str();
-#ifdef USE_OPENCL
-    stream.str("");
-    stream << gpu_time;
-    cout << setw(10) << stream.str();
-
-    stream.str("");
-    stream << "x" << setprecision(3) << speedup;
-    cout << setw(14) << stream.str();
-
-    stream.str("");
-    stream << gpu_full_time;
-    cout << setw(14) << stream.str();
-
-    stream.str("");
-    stream << "x" << setprecision(3) << fullspeedup;
-    cout << setw(14) << stream.str();
-#endif
-    cout << cur_subtest_description_.str();
-    cout << resetiosflags(ios_base::left) << endl;
-}
-
-void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
-{
-    if (!record_)
-    {
-        recordname_ += ".csv";
-        record_ = fopen(recordname_.c_str(), "w");
-    }
-
-#ifdef USE_OPENCL
-    fprintf(record_, "%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
-            cur_subtest_description_.str().c_str(),
-            cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
-            gpu_min, gpu_max, std_dev);
-#else
-    fprintf(record_, "%s,%s,%.3f\n",
-            itname_changed_ ? itname_.c_str() : "", cur_subtest_description_.str().c_str(), cpu_time);
-#endif
-
-    if (itname_changed_)
-    {
-        itname_changed_ = false;
-    }
-
-    fflush(record_);
-}
-
-void TestSystem::writeSummary()
-{
-    if (!record_)
-    {
-        recordname_ += ".csv";
-        record_ = fopen(recordname_.c_str(), "w");
-    }
-
-    fprintf(record_, "\nAverage GPU speedup: %.3f\n"
-            "exceeded: %d (%.3f%%)\n"
-            "passed: %d (%.3f%%)\n"
-            "failed: %d (%.3f%%)\n"
-            "\nAverage GPUTOTAL speedup: %.3f\n"
-            "exceeded: %d (%.3f%%)\n"
-            "passed: %d (%.3f%%)\n"
-            "failed: %d (%.3f%%)\n",
-            speedup_total_ / std::max(1, num_subtests_called_),
-            speedup_faster_count_, (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_equal_count_, (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_slower_count_, (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_full_total_ / std::max(1, num_subtests_called_),
-            speedup_full_faster_count_, (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_full_equal_count_, (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_full_slower_count_, (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
-           );
-    fflush(record_);
-}
-
-void TestSystem::printError(const std::string &msg)
-{
-    cout << TAB << "[error: " << msg << "] " << cur_subtest_description_.str() << endl;
-}
-
-void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
-{
-    mat.create(rows, cols, type);
-    RNG rng(0);
-    rng.fill(mat, RNG::UNIFORM, low, high);
-}
-
-
-string abspath(const string &relpath)
-{
-    return TestSystem::instance().workingDir() + relpath;
-}
-
-
-int CV_CDECL cvErrorCallback(int /*status*/, const char * /*func_name*/,
-                             const char *err_msg, const char * /*file_name*/,
-                             int /*line*/, void * /*userdata*/)
-{
-    TestSystem::instance().printError(err_msg);
-    return 0;
-}
-
-/////////// matchTemplate ////////////////////////
-//void InitMatchTemplate()
-//{
-//	Mat src; gen(src, 500, 500, CV_32F, 0, 1);
-//	Mat templ; gen(templ, 500, 500, CV_32F, 0, 1);
-//#ifdef USE_OPENCL
-//	ocl::oclMat d_src(src), d_templ(templ), d_dst;
-//	ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
-//#endif
-//}
-TEST(matchTemplate)
-{
-    //InitMatchTemplate();
-
-    Mat src, templ, dst;
-    int templ_size = 5;
-
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        int all_type[] = {CV_32FC1, CV_32FC4};
-        std::string type_name[] = {"CV_32FC1", "CV_32FC4"};
-
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
-            {
-                gen(src, size, size, all_type[j], 0, 1);
-
-                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR";
-
-                gen(templ, templ_size, templ_size, all_type[j], 0, 1);
-
-                matchTemplate(src, templ, dst, CV_TM_CCORR);
-
-                CPU_ON;
-                matchTemplate(src, templ, dst, CV_TM_CCORR);
-                CPU_OFF;
-
-#ifdef USE_OPENCL
-                ocl::oclMat d_src(src), d_templ, d_dst;
-
-                d_templ.upload(templ);
-
-                WARMUP_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
-                WARMUP_OFF;
-
-                GPU_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
-                GPU_OFF;
-
-                GPU_FULL_ON;
-                d_src.upload(src);
-                d_templ.upload(templ);
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
-                d_dst.download(dst);
-                GPU_FULL_OFF;
-#endif
-            }
-        }
-
-        int all_type_8U[] = {CV_8UC1};
-        std::string type_name_8U[] = {"CV_8UC1"};
-
-        for (size_t j = 0; j < sizeof(all_type_8U) / sizeof(int); j++)
-        {
-            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
-            {
-                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name_8U[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR_NORMED";
-
-                gen(src, size, size, all_type_8U[j], 0, 255);
-
-                gen(templ, templ_size, templ_size, all_type_8U[j], 0, 255);
-
-                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
-
-                CPU_ON;
-                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
-                CPU_OFF;
-
-#ifdef USE_OPENCL
-                ocl::oclMat d_src(src);
-                ocl::oclMat d_templ(templ), d_dst;
-
-                WARMUP_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
-                WARMUP_OFF;
-
-                GPU_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
-                GPU_OFF;
-
-                GPU_FULL_ON;
-                d_src.upload(src);
-                d_templ.upload(templ);
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
-                d_dst.download(dst);
-                GPU_FULL_OFF;
-#endif
-            }
-        }
-    }
-}
-
-///////////// PyrLKOpticalFlow ////////////////////////
-TEST(PyrLKOpticalFlow)
-{
-    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};
-    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};
-
-    for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
-    {
-        Mat frame0 = imread(abspath(images1[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);
-
-        if (frame0.empty())
-        {
-            std::string errstr = "can't open " + images1[i];
-            throw runtime_error(errstr);
-        }
-
-        Mat frame1 = imread(abspath(images2[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);
-
-        if (frame1.empty())
-        {
-            std::string errstr = "can't open " + images2[i];
-            throw runtime_error(errstr);
-        }
-
-        Mat gray_frame;
-
-        if (i == 0)
-        {
-            cvtColor(frame0, gray_frame, COLOR_BGR2GRAY);
-        }
-
-        for (int points = 1000; points <= 4000; points *= 2)
-        {
-            if (i == 0)
-                SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
-            else
-                SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
-            Mat nextPts_cpu;
-            Mat status_cpu;
-
-            vector<Point2f> pts;
-            goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0);
-
-            vector<Point2f> nextPts;
-            vector<unsigned char> status;
-
-            vector<float> err;
-
-            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-
-            CPU_ON;
-            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            ocl::PyrLKOpticalFlow d_pyrLK;
-
-            ocl::oclMat d_frame0(frame0);
-            ocl::oclMat d_frame1(frame1);
-
-            ocl::oclMat d_pts;
-            Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void *)&pts[0]);
-            d_pts.upload(pts_mat);
-
-            ocl::oclMat d_nextPts;
-            ocl::oclMat d_status;
-            ocl::oclMat d_err;
-
-            WARMUP_ON;
-            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-            WARMUP_OFF;
-
-            GPU_ON;
-            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_frame0.upload(frame0);
-            d_frame1.upload(frame1);
-            d_pts.upload(pts_mat);
-            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-
-            if (!d_nextPts.empty())
-            {
-                d_nextPts.download(nextPts_cpu);
-            }
-
-            if (!d_status.empty())
-            {
-                d_status.download(status_cpu);
-            }
-
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-
-///////////// pyrDown //////////////////////
-TEST(pyrDown)
-{
-    Mat src, dst;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            pyrDown(src, dst);
-
-            CPU_ON;
-            pyrDown(src, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst;
-
-            WARMUP_ON;
-            ocl::pyrDown(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::pyrDown(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::pyrDown(d_src, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-
-///////////// pyrUp ////////////////////////
-TEST(pyrUp)
-{
-    Mat src, dst;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 500; size <= 2000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            pyrUp(src, dst);
-
-            CPU_ON;
-            pyrUp(src, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst;
-
-            WARMUP_ON;
-            ocl::pyrUp(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::pyrUp(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::pyrUp(d_src, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-
-///////////// Canny ////////////////////////
-TEST(Canny)
-{
-    Mat img = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE);
-
-    if (img.empty())
-    {
-        throw runtime_error("can't open aloeL.jpg");
-    }
-
-    SUBTEST << img.cols << 'x' << img.rows << "; aloeL.jpg" << "; edges" << "; CV_8UC1";
-
-    Mat edges(img.size(), CV_8UC1);
-
-    CPU_ON;
-    Canny(img, edges, 50.0, 100.0);
-    CPU_OFF;
-
-#ifdef USE_OPENCL
-    ocl::oclMat d_img(img);
-    ocl::oclMat d_edges;
-    ocl::CannyBuf d_buf;
-
-    WARMUP_ON;
-    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-    WARMUP_OFF;
-
-    GPU_ON;
-    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-    GPU_OFF;
-
-    GPU_FULL_ON;
-    d_img.upload(img);
-    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-    d_edges.download(edges);
-    GPU_FULL_OFF;
-#endif
-}
-
-///////////// Haar ////////////////////////
-#ifdef USE_OPENCL
-namespace cv
-{
-namespace ocl
-{
-
-struct getRect
-{
-    Rect operator()(const CvAvgComp &e) const
-    {
-        return e.rect;
-    }
-};
-
-class CascadeClassifier_GPU : public OclCascadeClassifier
-{
-public:
-    void detectMultiScale(oclMat &image,
-                          std::vector<cv::Rect>& faces,
-                          double scaleFactor = 1.1,
-                          int minNeighbors = 3, int flags = 0,
-                          Size minSize = Size(),
-                          Size maxSize = Size())
-    {
-        (void)maxSize;
-        MemStorage storage(cvCreateMemStorage(0));
-        //CvMat img=image;
-        CvSeq *objs = oclHaarDetectObjects(image, storage, scaleFactor, minNeighbors, flags, minSize);
-        vector<CvAvgComp> vecAvgComp;
-        Seq<CvAvgComp>(objs).copyTo(vecAvgComp);
-        faces.resize(vecAvgComp.size());
-        std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
-    }
-
-};
-
-}
-}
-#endif
-TEST(Haar)
-{
-    Mat img = imread(abspath("basketball1.png"), CV_LOAD_IMAGE_GRAYSCALE);
-
-    if (img.empty())
-    {
-        throw runtime_error("can't open basketball1.png");
-    }
-
-    CascadeClassifier faceCascadeCPU;
-
-    if (!faceCascadeCPU.load(abspath("haarcascade_frontalface_alt.xml")))
-    {
-        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
-    }
-
-    vector<Rect> faces;
-
-    SUBTEST << img.cols << "x" << img.rows << "; scale image";
-    CPU_ON;
-    faceCascadeCPU.detectMultiScale(img, faces,
-                                    1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
-    CPU_OFF;
-
-#ifdef USE_OPENCL
-    ocl::CascadeClassifier_GPU faceCascade;
-
-    if (!faceCascade.load(abspath("haarcascade_frontalface_alt.xml")))
-    {
-        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
-    }
-
-    ocl::oclMat d_img(img);
-
-    faces.clear();
-
-    WARMUP_ON;
-    faceCascade.detectMultiScale(d_img, faces,
-                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
-    WARMUP_OFF;
-
-    faces.clear();
-
-    GPU_ON;
-    faceCascade.detectMultiScale(d_img, faces,
-                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
-    GPU_OFF;
-
-    GPU_FULL_ON;
-    d_img.upload(img);
-    faceCascade.detectMultiScale(d_img, faces,
-                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
-    GPU_FULL_OFF;
-#endif
-}
-
-///////////// blend ////////////////////////
-template <typename T>
-void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold)
-{
-    result_gold.create(img1.size(), img1.type());
-
-    int cn = img1.channels();
-
-    for (int y = 0; y < img1.rows; ++y)
-    {
-        const float *weights1_row = weights1.ptr<float>(y);
-        const float *weights2_row = weights2.ptr<float>(y);
-        const T *img1_row = img1.ptr<T>(y);
-        const T *img2_row = img2.ptr<T>(y);
-        T *result_gold_row = result_gold.ptr<T>(y);
-
-        for (int x = 0; x < img1.cols * cn; ++x)
-        {
-            float w1 = weights1_row[x / cn];
-            float w2 = weights2_row[x / cn];
-            result_gold_row[x] = static_cast<T>((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f));
-        }
-    }
-}
-TEST(blend)
-{
-    Mat src1, src2, weights1, weights2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_weights1, d_weights2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " and CV_32FC1";
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(weights1, size, size, CV_32FC1, 0, 1);
-            gen(weights2, size, size, CV_32FC1, 0, 1);
-
-            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
-
-            CPU_ON;
-            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            d_weights1.upload(weights1);
-            d_weights2.upload(weights2);
-
-            WARMUP_ON;
-            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            d_weights1.upload(weights1);
-            d_weights2.upload(weights2);
-            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-///////////// columnSum////////////////////////
-TEST(columnSum)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; CV_32FC1";
-
-        gen(src, size, size, CV_32FC1, 0, 256);
-
-        CPU_ON;
-        dst.create(src.size(), src.type());
-
-        for (int i = 1; i < src.rows; ++i)
-        {
-            for (int j = 0; j < src.cols; ++j)
-            {
-                dst.at<float>(i, j) = src.at<float>(i, j) += src.at<float>(i - 1, j);
-            }
-        }
-
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        d_src.upload(src);
-        WARMUP_ON;
-        ocl::columnSum(d_src, d_dst);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::columnSum(d_src, d_dst);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::columnSum(d_src, d_dst);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-
-///////////// HOG////////////////////////
-TEST(HOG)
-{
-    Mat src = imread(abspath("road.png"), cv::IMREAD_GRAYSCALE);
-
-    if (src.empty())
-    {
-        throw runtime_error("can't open road.png");
-    }
-
-
-    cv::HOGDescriptor hog;
-    hog.setSVMDetector(hog.getDefaultPeopleDetector());
-    std::vector<cv::Rect> found_locations;
-
-    SUBTEST << 768 << 'x' << 576 << "; road.png";
-
-    hog.detectMultiScale(src, found_locations);
-
-    CPU_ON;
-    hog.detectMultiScale(src, found_locations);
-    CPU_OFF;
-
-#ifdef USE_OPENCL
-    cv::ocl::HOGDescriptor ocl_hog;
-    ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector());
-    ocl::oclMat d_src;
-    d_src.upload(src);
-
-    WARMUP_ON;
-    ocl_hog.detectMultiScale(d_src, found_locations);
-    WARMUP_OFF;
-
-    GPU_ON;
-    ocl_hog.detectMultiScale(d_src, found_locations);
-    GPU_OFF;
-
-    GPU_FULL_ON;
-    d_src.upload(src);
-    ocl_hog.detectMultiScale(d_src, found_locations);
-    GPU_FULL_OFF;
-#endif
-}
-
-///////////// SURF ////////////////////////
-
-TEST(SURF)
-{
-    Mat keypoints_cpu;
-    Mat descriptors_cpu;
-
-    Mat src = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE);
-
-    if (src.empty())
-    {
-        throw runtime_error("can't open aloeL.jpg");
-    }
-
-    SUBTEST << src.cols << "x" << src.rows << "; aloeL.jpg";
-    SURF surf;
-    vector<KeyPoint> keypoints;
-    Mat descriptors;
-
-    surf(src, Mat(), keypoints, descriptors);
-
-    CPU_ON;
-    keypoints.clear();
-    surf(src, Mat(), keypoints, descriptors);
-    CPU_OFF;
-
-#ifdef USE_OPENCL
-    ocl::SURF_OCL d_surf;
-    ocl::oclMat d_src(src);
-    ocl::oclMat d_keypoints;
-    ocl::oclMat d_descriptors;
-
-    WARMUP_ON;
-    d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);
-    WARMUP_OFF;
-
-    GPU_ON;
-    d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);
-    GPU_OFF;
-
-    GPU_FULL_ON;
-    d_src.upload(src);
-    d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);
-
-    if (!d_keypoints.empty())
-    {
-        d_keypoints.download(keypoints_cpu);
-    }
-
-    if (!d_descriptors.empty())
-    {
-        d_descriptors.download(descriptors_cpu);
-    }
-
-    GPU_FULL_OFF;
-#endif
-}
-//////////////////// BruteForceMatch /////////////////
-TEST(BruteForceMatcher)
-{
-    Mat trainIdx_cpu;
-    Mat distance_cpu;
-    Mat allDist_cpu;
-    Mat nMatches_cpu;
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        // Init CPU matcher
-        int desc_len = 64;
-
-        BFMatcher matcher(NORM_L2);
-
-        Mat query;
-        gen(query, size, desc_len, CV_32F, 0, 1);
-
-        Mat train;
-        gen(train, size, desc_len, CV_32F, 0, 1);
-        // Output
-        vector< vector<DMatch> > matches(2);
-#ifdef USE_OPENCL
-        // Init GPU matcher
-        ocl::BruteForceMatcher_OCL_base d_matcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
-
-        ocl::oclMat d_query(query);
-        ocl::oclMat d_train(train);
-
-        ocl::oclMat d_trainIdx, d_distance, d_allDist, d_nMatches;
-#endif
-        SUBTEST << size << "; match";
-
-        matcher.match(query, train, matches[0]);
-
-        CPU_ON;
-        matcher.match(query, train, matches[0]);
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        WARMUP_ON;
-        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
-        WARMUP_OFF;
-
-        GPU_ON;
-        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_query.upload(query);
-        d_train.upload(train);
-        d_matcher.match(d_query, d_train, matches[0]);
-        GPU_FULL_OFF;
-#endif
-
-        SUBTEST << size << "; knnMatch";
-
-        matcher.knnMatch(query, train, matches, 2);
-
-        CPU_ON;
-        matcher.knnMatch(query, train, matches, 2);
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        WARMUP_ON;
-        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
-        WARMUP_OFF;
-
-        GPU_ON;
-        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_query.upload(query);
-        d_train.upload(train);
-        d_matcher.knnMatch(d_query, d_train, matches, 2);
-        GPU_FULL_OFF;
-#endif
-        SUBTEST << size << "; radiusMatch";
-
-        float max_distance = 2.0f;
-
-        matcher.radiusMatch(query, train, matches, max_distance);
-
-        CPU_ON;
-        matcher.radiusMatch(query, train, matches, max_distance);
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        d_trainIdx.release();
-
-        WARMUP_ON;
-        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
-        WARMUP_OFF;
-
-        GPU_ON;
-        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_query.upload(query);
-        d_train.upload(train);
-        d_matcher.radiusMatch(d_query, d_train, matches, max_distance);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-///////////// Lut ////////////////////////
-TEST(lut)
-{
-    Mat src, lut, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_lut, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC3};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC3"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src, size, size, all_type[j], 0, 256);
-            gen(lut, 1, 256, CV_8UC1, 0, 1);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-            LUT(src, lut, dst);
-
-            CPU_ON;
-            LUT(src, lut, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            d_src.upload(src);
-            d_lut.upload(lut);
-
-            WARMUP_ON;
-            ocl::LUT(d_src, d_lut, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::LUT(d_src, d_lut, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            d_lut.upload(lut);
-            ocl::LUT(d_src, d_lut, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Exp ////////////////////////
-TEST(Exp)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; CV_32FC1";
-
-        gen(src, size, size, CV_32FC1, 0, 256);
-        gen(dst, size, size, CV_32FC1, 0, 256);
-
-        exp(src, dst);
-
-        CPU_ON;
-        exp(src, dst);
-        CPU_OFF;
-#ifdef USE_OPENCL
-        d_src.upload(src);
-
-        WARMUP_ON;
-        ocl::exp(d_src, d_dst);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::exp(d_src, d_dst);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::exp(d_src, d_dst);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-
-///////////// LOG ////////////////////////
-TEST(Log)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; 32F";
-
-        gen(src, size, size, CV_32F, 1, 10);
-
-        log(src, dst);
-
-        CPU_ON;
-        log(src, dst);
-        CPU_OFF;
-#ifdef USE_OPENCL
-        d_src.upload(src);
-
-        WARMUP_ON;
-        ocl::log(d_src, d_dst);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::log(d_src, d_dst);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::log(d_src, d_dst);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-
-///////////// Add ////////////////////////
-
-TEST(Add)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 1);
-            gen(src2, size, size, all_type[j], 0, 1);
-
-            add(src1, src2, dst);
-
-            CPU_ON;
-            add(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::add(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::add(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::add(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Mul ////////////////////////
-TEST(Mul)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            multiply(src1, src2, dst);
-
-            CPU_ON;
-            multiply(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::multiply(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::multiply(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::multiply(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Div ////////////////////////
-TEST(Div)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            divide(src1, src2, dst);
-
-            CPU_ON;
-            divide(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::divide(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::divide(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::divide(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Absdiff ////////////////////////
-TEST(Absdiff)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            absdiff(src1, src2, dst);
-
-            CPU_ON;
-            absdiff(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::absdiff(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::absdiff(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::absdiff(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// CartToPolar ////////////////////////
-TEST(CartToPolar)
-{
-    Mat src1, src2, dst, dst1;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-            gen(dst1, size, size, all_type[j], 0, 256);
-
-
-            cartToPolar(src1, src2, dst, dst1, 1);
-
-            CPU_ON;
-            cartToPolar(src1, src2, dst, dst1, 1);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
-            d_dst.download(dst);
-            d_dst1.download(dst1);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// PolarToCart ////////////////////////
-TEST(PolarToCart)
-{
-    Mat src1, src2, dst, dst1;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-            gen(dst1, size, size, all_type[j], 0, 256);
-
-
-            polarToCart(src1, src2, dst, dst1, 1);
-
-            CPU_ON;
-            polarToCart(src1, src2, dst, dst1, 1);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
-            d_dst.download(dst);
-            d_dst1.download(dst1);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Magnitude ////////////////////////
-TEST(magnitude)
-{
-    Mat x, y, mag;
-#ifdef USE_OPENCL
-    ocl::oclMat d_x, d_y, d_mag;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(x, size, size, all_type[j], 0, 1);
-            gen(y, size, size, all_type[j], 0, 1);
-
-            magnitude(x, y, mag);
-
-            CPU_ON;
-            magnitude(x, y, mag);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_x.upload(x);
-            d_y.upload(y);
-
-            WARMUP_ON;
-            ocl::magnitude(d_x, d_y, d_mag);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::magnitude(d_x, d_y, d_mag);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_x.upload(x);
-            d_y.upload(y);
-            ocl::magnitude(d_x, d_y, d_mag);
-            d_mag.download(mag);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Transpose ////////////////////////
-TEST(Transpose)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-            transpose(src, dst);
-
-            CPU_ON;
-            transpose(src, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::transpose(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::transpose(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::transpose(d_src, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Flip ////////////////////////
-TEST(Flip)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; FLIP_BOTH";
-
-            gen(src, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-            flip(src, dst, 0);
-
-            CPU_ON;
-            flip(src, dst, 0);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::flip(d_src, d_dst, 0);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::flip(d_src, d_dst, 0);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::flip(d_src, d_dst, 0);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// minMax ////////////////////////
-TEST(minMax)
-{
-    Mat src;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src;
-#endif
-    double min_val, max_val;
-    Point min_loc, max_loc;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            CPU_ON;
-            minMaxLoc(src, &min_val, &max_val, &min_loc, &max_loc);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::minMax(d_src, &min_val, &max_val);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::minMax(d_src, &min_val, &max_val);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::minMax(d_src, &min_val, &max_val);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// minMaxLoc ////////////////////////
-TEST(minMaxLoc)
-{
-    Mat src;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src;
-#endif
-    double min_val, max_val;
-    Point min_loc, max_loc;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 1);
-
-            CPU_ON;
-            minMaxLoc(src, &min_val, &max_val, &min_loc, &max_loc);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Sum ////////////////////////
-TEST(Sum)
-{
-    Mat src;
-    Scalar cpures, gpures;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src;
-#endif
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            cpures = sum(src);
-
-            CPU_ON;
-            cpures = sum(src);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            gpures = ocl::sum(d_src);
-            WARMUP_OFF;
-
-            GPU_ON;
-            gpures = ocl::sum(d_src);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            gpures = ocl::sum(d_src);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// countNonZero ////////////////////////
-TEST(countNonZero)
-{
-    Mat src;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src;
-#endif
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            countNonZero(src);
-
-            CPU_ON;
-            countNonZero(src);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::countNonZero(d_src);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::countNonZero(d_src);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::countNonZero(d_src);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Phase ////////////////////////
-TEST(Phase)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            phase(src1, src2, dst, 1);
-
-            CPU_ON;
-            phase(src1, src2, dst, 1);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::phase(d_src1, d_src2, d_dst, 1);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::phase(d_src1, d_src2, d_dst, 1);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::phase(d_src1, d_src2, d_dst, 1);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// bitwise_and////////////////////////
-TEST(bitwise_and)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            bitwise_and(src1, src2, dst);
-
-            CPU_ON;
-            bitwise_and(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::bitwise_and(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::bitwise_and(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::bitwise_and(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// bitwise_or////////////////////////
-TEST(bitwise_or)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            bitwise_or(src1, src2, dst);
-
-            CPU_ON;
-            bitwise_or(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::bitwise_or(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::bitwise_or(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::bitwise_or(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// bitwise_xor////////////////////////
-TEST(bitwise_xor)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            bitwise_xor(src1, src2, dst);
-
-            CPU_ON;
-            bitwise_xor(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::bitwise_xor(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::bitwise_xor(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::bitwise_xor(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// bitwise_not////////////////////////
-TEST(bitwise_not)
-{
-    Mat src1, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            bitwise_not(src1, dst);
-
-            CPU_ON;
-            bitwise_not(src1, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-
-            WARMUP_ON;
-            ocl::bitwise_not(d_src1, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::bitwise_not(d_src1, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            ocl::bitwise_not(d_src1, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// compare////////////////////////
-TEST(compare)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int CMP_EQ = 0;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            compare(src1, src2, dst, CMP_EQ);
-
-            CPU_ON;
-            compare(src1, src2, dst, CMP_EQ);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// pow ////////////////////////
-TEST(pow)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 100);
-            gen(dst, size, size, all_type[j], 0, 100);
-
-            pow(src, -2.0, dst);
-
-            CPU_ON;
-            pow(src, -2.0, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-            d_dst.upload(dst);
-
-            WARMUP_ON;
-            ocl::pow(d_src, -2.0, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::pow(d_src, -2.0, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::pow(d_src, -2.0, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// MagnitudeSqr////////////////////////
-TEST(MagnitudeSqr)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t t = 0; t < sizeof(all_type) / sizeof(int); t++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[t];
-
-            gen(src1, size, size, all_type[t], 0, 256);
-            gen(src2, size, size, all_type[t], 0, 256);
-            gen(dst, size, size, all_type[t], 0, 256);
-
-
-            for (int i = 0; i < src1.rows; ++i)
-
-                for (int j = 0; j < src1.cols; ++j)
-                {
-                    float val1 = src1.at<float>(i, j);
-                    float val2 = src2.at<float>(i, j);
-
-                    ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
-
-                }
-
-            CPU_ON;
-
-            for (int i = 0; i < src1.rows; ++i)
-                for (int j = 0; j < src1.cols; ++j)
-                {
-                    float val1 = src1.at<float>(i, j);
-                    float val2 = src2.at<float>(i, j);
-
-                    ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
-
-                }
-
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// AddWeighted////////////////////////
-TEST(AddWeighted)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    double alpha = 2.0, beta = 1.0, gama = 3.0;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            addWeighted(src1, alpha, src2, beta, gama, dst);
-
-            CPU_ON;
-            addWeighted(src1, alpha, src2, beta, gama, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Blur////////////////////////
-TEST(Blur)
-{
-    Mat src1, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_dst;
-#endif
-    Size ksize = Size(3, 3);
-    int bordertype = BORDER_CONSTANT;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            blur(src1, dst, ksize, Point(-1, -1), bordertype);
-
-            CPU_ON;
-            blur(src1, dst, ksize, Point(-1, -1), bordertype);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-
-            WARMUP_ON;
-            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Laplacian////////////////////////
-TEST(Laplacian)
-{
-    Mat src1, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_dst;
-#endif
-    int ksize = 3;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            Laplacian(src1, dst, -1, ksize, 1);
-
-            CPU_ON;
-            Laplacian(src1, dst, -1, ksize, 1);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-
-            WARMUP_ON;
-            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Erode ////////////////////
-TEST(Erode)
-{
-    Mat src, dst, ker;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(256));
-            ker = getStructuringElement(MORPH_RECT, Size(3, 3));
-
-            erode(src, dst, ker);
-
-            CPU_ON;
-            erode(src, dst, ker);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::erode(d_src, d_dst, ker);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::erode(d_src, d_dst, ker);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::erode(d_src, d_dst, ker);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Sobel ////////////////////////
-TEST(Sobel)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int dx = 1;
-    int dy = 1;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            Sobel(src, dst, -1, dx, dy);
-
-            CPU_ON;
-            Sobel(src, dst, -1, dx, dy);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::Sobel(d_src, d_dst, -1, dx, dy);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::Sobel(d_src, d_dst, -1, dx, dy);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::Sobel(d_src, d_dst, -1, dx, dy);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Scharr ////////////////////////
-TEST(Scharr)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int dx = 1;
-    int dy = 0;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            Scharr(src, dst, -1, dx, dy);
-
-            CPU_ON;
-            Scharr(src, dst, -1, dx, dy);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::Scharr(d_src, d_dst, -1, dx, dy);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::Scharr(d_src, d_dst, -1, dx, dy);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::Scharr(d_src, d_dst, -1, dx, dy);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// GaussianBlur ////////////////////////
-TEST(GaussianBlur)
-{
-    Mat src, dst;
-    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            GaussianBlur(src, dst, Size(9, 9), 0);
-
-            CPU_ON;
-            GaussianBlur(src, dst, Size(9, 9), 0);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst(src.size(), src.type());
-            ocl::oclMat d_buf;
-
-            WARMUP_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// equalizeHist ////////////////////////
-TEST(equalizeHist)
-{
-    Mat src, dst;
-    int all_type[] = {CV_8UC1};
-    std::string type_name[] = {"CV_8UC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            equalizeHist(src, dst);
-
-            CPU_ON;
-            equalizeHist(src, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst;
-            ocl::oclMat d_hist;
-            ocl::oclMat d_buf;
-
-            WARMUP_ON;
-            ocl::equalizeHist(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::equalizeHist(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::equalizeHist(d_src, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-/////////// CopyMakeBorder //////////////////////
-TEST(CopyMakeBorder)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_dst;
-#endif
-    int bordertype = BORDER_CONSTANT;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            copyMakeBorder(src, dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
-
-            CPU_ON;
-            copyMakeBorder(src, dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
-            CPU_OFF;
-#ifdef USE_OPENCL
-            ocl::oclMat d_src(src);
-
-            WARMUP_ON;
-            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// cornerMinEigenVal ////////////////////////
-TEST(cornerMinEigenVal)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_dst;
-#endif
-    int blockSize = 7, apertureSize = 1 + 2 * (rand() % 4);
-    int borderType = BORDER_REFLECT;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
-
-            CPU_ON;
-            cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            ocl::oclMat d_src(src);
-
-            WARMUP_ON;
-            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// cornerHarris ////////////////////////
-TEST(cornerHarris)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; BORDER_REFLECT";
-
-            gen(src, size, size, all_type[j], 0, 1);
-
-            cornerHarris(src, dst, 5, 7, 0.1, BORDER_REFLECT);
-
-            CPU_ON;
-            cornerHarris(src, dst, 5, 7, 0.1, BORDER_REFLECT);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-
-    }
-}
-///////////// integral ////////////////////////
-TEST(integral)
-{
-    Mat src, sum;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_sum, d_buf;
-#endif
-    int all_type[] = {CV_8UC1};
-    std::string type_name[] = {"CV_8UC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j]  ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            integral(src, sum);
-
-            CPU_ON;
-            integral(src, sum);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::integral(d_src, d_sum);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::integral(d_src, d_sum);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::integral(d_src, d_sum);
-            d_sum.download(sum);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// WarpAffine ////////////////////////
-TEST(WarpAffine)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    static const double coeffs[2][3] =
-    {
-        {cos(3.14 / 6), -sin(3.14 / 6), 100.0},
-        {sin(3.14 / 6), cos(3.14 / 6), -100.0}
-    };
-    Mat M(2, 3, CV_64F, (void *)coeffs);
-    int interpolation = INTER_NEAREST;
-
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-            Size size1 = Size(size, size);
-
-            warpAffine(src, dst, M, size1, interpolation);
-
-            CPU_ON;
-            warpAffine(src, dst, M, size1, interpolation);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// WarpPerspective ////////////////////////
-TEST(WarpPerspective)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    static const double coeffs[3][3] =
-    {
-        {cos(3.14 / 6), -sin(3.14 / 6), 100.0},
-        {sin(3.14 / 6), cos(3.14 / 6), -100.0},
-        {0.0, 0.0, 1.0}
-    };
-    Mat M(3, 3, CV_64F, (void *)coeffs);
-    int interpolation = INTER_NEAREST;
-
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-            Size size1 = Size(size, size);
-
-            warpPerspective(src, dst, M, size1, interpolation);
-
-            CPU_ON;
-            warpPerspective(src, dst, M, size1, interpolation);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// resize ////////////////////////
-TEST(resize)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; up";
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            resize(src, dst, Size(), 2.0, 2.0);
-
-            CPU_ON;
-            resize(src, dst, Size(), 2.0, 2.0);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; down";
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            resize(src, dst, Size(), 0.5, 0.5);
-
-            CPU_ON;
-            resize(src, dst, Size(), 0.5, 0.5);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// threshold////////////////////////
-TEST(threshold)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; 8UC1; THRESH_BINARY";
-
-        gen(src, size, size, CV_8U, 0, 100);
-
-        threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
-
-        CPU_ON;
-        threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
-        CPU_OFF;
-#ifdef USE_OPENCL
-        d_src.upload(src);
-
-        WARMUP_ON;
-        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-#endif
-    }
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; 32FC1; THRESH_TRUNC [NPP]";
-
-        gen(src, size, size, CV_32FC1, 0, 100);
-
-        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
-
-        CPU_ON;
-        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
-        CPU_OFF;
-#ifdef USE_OPENCL
-        d_src.upload(src);
-
-        WARMUP_ON;
-        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-///////////// meanShiftFiltering////////////////////////
-TEST(meanShiftFiltering)
-{
-    int sp = 10, sr = 10;
-
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; 8UC3 vs 8UC4";
-
-        gen(src, size, size, CV_8UC3, Scalar::all(0), Scalar::all(256));
-
-        pyrMeanShiftFiltering(src, dst, sp, sr);
-
-        CPU_ON;
-        pyrMeanShiftFiltering(src, dst, sp, sr);
-        CPU_OFF;
-#ifdef USE_OPENCL
-        gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-
-        d_src.upload(src);
-
-        WARMUP_ON;
-        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-///////////// meanShiftProc////////////////////////
-COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size size, int sp, int sr, int maxIter, float eps, int *tab)
-{
-
-    int isr2 = sr * sr;
-    int c0, c1, c2, c3;
-    int iter;
-    uchar *ptr = NULL;
-    uchar *pstart = NULL;
-    int revx = 0, revy = 0;
-    c0 = sptr[0];
-    c1 = sptr[1];
-    c2 = sptr[2];
-    c3 = sptr[3];
-
-    // iterate meanshift procedure
-    for (iter = 0; iter < maxIter; iter++)
-    {
-        int count = 0;
-        int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
-
-        //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
-        int minx = x0 - sp;
-        int miny = y0 - sp;
-        int maxx = x0 + sp;
-        int maxy = y0 + sp;
-
-        //deal with the image boundary
-        if (minx < 0)
-        {
-            minx = 0;
-        }
-
-        if (miny < 0)
-        {
-            miny = 0;
-        }
-
-        if (maxx >= size.width)
-        {
-            maxx = size.width - 1;
-        }
-
-        if (maxy >= size.height)
-        {
-            maxy = size.height - 1;
-        }
-
-        if (iter == 0)
-        {
-            pstart = sptr;
-        }
-        else
-        {
-            pstart = pstart + revy * sstep + (revx << 2); //point to the new position
-        }
-
-        ptr = pstart;
-        ptr = ptr + (miny - y0) * sstep + ((minx - x0) << 2); //point to the start in the row
-
-        for (int y = miny; y <= maxy; y++, ptr += sstep - ((maxx - minx + 1) << 2))
-        {
-            int rowCount = 0;
-            int x = minx;
-#if CV_ENABLE_UNROLLED
-
-            for (; x + 4 <= maxx; x += 4, ptr += 16)
-            {
-                int t0, t1, t2;
-                t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x;
-                    rowCount++;
-                }
-
-                t0 = ptr[4], t1 = ptr[5], t2 = ptr[6];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 1;
-                    rowCount++;
-                }
-
-                t0 = ptr[8], t1 = ptr[9], t2 = ptr[10];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 2;
-                    rowCount++;
-                }
-
-                t0 = ptr[12], t1 = ptr[13], t2 = ptr[14];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 3;
-                    rowCount++;
-                }
-            }
-
-#endif
-
-            for (; x <= maxx; x++, ptr += 4)
-            {
-                int t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x;
-                    rowCount++;
-                }
-            }
-
-            if (rowCount == 0)
-            {
-                continue;
-            }
-
-            count += rowCount;
-            sy += y * rowCount;
-        }
-
-        if (count == 0)
-        {
-            break;
-        }
-
-        int x1 = sx / count;
-        int y1 = sy / count;
-        s0 = s0 / count;
-        s1 = s1 / count;
-        s2 = s2 / count;
-
-        bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1 - x0) + abs(y1 - y0) +
-                        tab[s0 - c0 + 255] + tab[s1 - c1 + 255] + tab[s2 - c2 + 255] <= eps);
-
-        //revise the pointer corresponding to the new (y0,x0)
-        revx = x1 - x0;
-        revy = y1 - y0;
-
-        x0 = x1;
-        y0 = y1;
-        c0 = s0;
-        c1 = s1;
-        c2 = s2;
-
-        if (stopFlag)
-        {
-            break;
-        }
-    } //for iter
-
-    dptr[0] = (uchar)c0;
-    dptr[1] = (uchar)c1;
-    dptr[2] = (uchar)c2;
-    dptr[3] = (uchar)c3;
-
-    COOR coor;
-    coor.x = static_cast<short>(x0);
-    coor.y = static_cast<short>(y0);
-    return coor;
-}
-
-void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp, int sr, cv::TermCriteria crit)
-{
-
-    if (src_roi.empty())
-    {
-        CV_Error(CV_StsBadArg, "The input image is empty");
-    }
-
-    if (src_roi.depth() != CV_8U || src_roi.channels() != 4)
-    {
-        CV_Error(CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported");
-    }
-
-    CV_Assert((src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) &&
-              (src_roi.cols == dstCoor_roi.cols) && (src_roi.rows == dstCoor_roi.rows));
-    CV_Assert(!(dstCoor_roi.step & 0x3));
-
-    if (!(crit.type & cv::TermCriteria::MAX_ITER))
-    {
-        crit.maxCount = 5;
-    }
-
-    int maxIter = std::min(std::max(crit.maxCount, 1), 100);
-    float eps;
-
-    if (!(crit.type & cv::TermCriteria::EPS))
-    {
-        eps = 1.f;
-    }
-
-    eps = (float)std::max(crit.epsilon, 0.0);
-
-    int tab[512];
-
-    for (int i = 0; i < 512; i++)
-    {
-        tab[i] = (i - 255) * (i - 255);
-    }
-
-    uchar *sptr = src_roi.data;
-    uchar *dptr = dst_roi.data;
-    short *dCoorptr = (short *)dstCoor_roi.data;
-    int sstep = (int)src_roi.step;
-    int dstep = (int)dst_roi.step;
-    int dCoorstep = (int)dstCoor_roi.step >> 1;
-    cv::Size size = src_roi.size();
-
-    for (int i = 0; i < size.height; i++, sptr += sstep - (size.width << 2),
-            dptr += dstep - (size.width << 2), dCoorptr += dCoorstep - (size.width << 1))
-    {
-        for (int j = 0; j < size.width; j++, sptr += 4, dptr += 4, dCoorptr += 2)
-        {
-            *((COOR *)dCoorptr) = do_meanShift(j, i, sptr, dptr, sstep, size, sp, sr, maxIter, eps, tab);
-        }
-    }
-
-}
-TEST(meanShiftProc)
-{
-    Mat src, dst, dstCoor_roi;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst, d_dstCoor_roi;
-#endif
-    TermCriteria crit(TermCriteria::COUNT + TermCriteria::EPS, 5, 1);
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; 8UC4 and CV_16SC2 ";
-
-        gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        gen(dst, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        gen(dstCoor_roi, size, size, CV_16SC2, Scalar::all(0), Scalar::all(256));
-
-        meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
-
-        CPU_ON;
-        meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
-        CPU_OFF;
-#ifdef USE_OPENCL
-        d_src.upload(src);
-
-        WARMUP_ON;
-        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
-        d_dst.download(dst);
-        d_dstCoor_roi.download(dstCoor_roi);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-///////////// ConvertTo////////////////////////
-TEST(ConvertTo)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " to 32FC1";
-
-            gen(src, size, size, all_type[j], 0, 256);
-            //gen(dst, size, size, all_type[j], 0, 256);
-
-            //d_dst.upload(dst);
-
-            src.convertTo(dst, CV_32FC1);
-
-            CPU_ON;
-            src.convertTo(dst, CV_32FC1);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            d_src.convertTo(d_dst, CV_32FC1);
-            WARMUP_OFF;
-
-            GPU_ON;
-            d_src.convertTo(d_dst, CV_32FC1);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            d_src.convertTo(d_dst, CV_32FC1);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// copyTo////////////////////////
-TEST(copyTo)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-            //gen(dst, size, size, all_type[j], 0, 256);
-
-            //d_dst.upload(dst);
-
-            src.copyTo(dst);
-
-            CPU_ON;
-            src.copyTo(dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            d_src.copyTo(d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            d_src.copyTo(d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            d_src.copyTo(d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// setTo////////////////////////
-TEST(setTo)
-{
-    Mat src, dst;
-    Scalar val(1, 2, 3, 4);
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            src.setTo(val);
-
-            CPU_ON;
-            src.setTo(val);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            d_src.setTo(val);
-            WARMUP_OFF;
-
-            GPU_ON;
-            d_src.setTo(val);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            d_src.setTo(val);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Merge////////////////////////
-TEST(Merge)
-{
-    Mat dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_dst;
-#endif
-    int channels = 4;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-            Size size1 = Size(size, size);
-            std::vector<Mat> src(channels);
-
-            for (int i = 0; i < channels; ++i)
-            {
-                src[i] = Mat(size1, all_type[j], cv::Scalar::all(i));
-            }
-
-            merge(src, dst);
-
-            CPU_ON;
-            merge(src, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            std::vector<ocl::oclMat> d_src(channels);
-
-            for (int i = 0; i < channels; ++i)
-            {
-                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
-            }
-
-            WARMUP_ON;
-            ocl::merge(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::merge(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-
-            for (int i = 0; i < channels; ++i)
-            {
-                d_src[i] = ocl::oclMat(size1, CV_8U, cv::Scalar::all(i));
-            }
-
-            ocl::merge(d_src, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Split////////////////////////
-TEST(Split)
-{
-    //int channels = 4;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-            Size size1 = Size(size, size);
-
-            Mat src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
-
-            std::vector<cv::Mat> dst;
-
-            split(src, dst);
-
-            CPU_ON;
-            split(src, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            ocl::oclMat d_src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
-            std::vector<cv::ocl::oclMat> d_dst;
-
-            WARMUP_ON;
-            ocl::split(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::split(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::split(d_src, d_dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-
-///////////// norm////////////////////////
-TEST(norm)
-{
-    Mat src, buf;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_buf;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";
-
-        gen(src, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
-        gen(buf, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
-
-        norm(src, NORM_INF);
-
-        CPU_ON;
-        norm(src, NORM_INF);
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        d_src.upload(src);
-        d_buf.upload(buf);
-
-        WARMUP_ON;
-        ocl::norm(d_src, d_buf, NORM_INF);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::norm(d_src, d_buf, NORM_INF);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::norm(d_src, d_buf, NORM_INF);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-///////////// remap////////////////////////
-TEST(remap)
-{
-    Mat src, dst, xmap, ymap;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst, d_xmap, d_ymap;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    int interpolation = INTER_LINEAR;
-    int borderMode = BORDER_CONSTANT;
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t t = 0; t < sizeof(all_type) / sizeof(int); t++)
-        {
-            SUBTEST << size << 'x' << size << "; src " << type_name[t] << "; map CV_32FC1";
-
-            gen(src, size, size, all_type[t], 0, 256);
-
-            xmap.create(size, size, CV_32FC1);
-            dst.create(size, size, CV_32FC1);
-            ymap.create(size, size, CV_32FC1);
-
-            for (int i = 0; i < size; ++i)
-            {
-                float *xmap_row = xmap.ptr<float>(i);
-                float *ymap_row = ymap.ptr<float>(i);
-
-                for (int j = 0; j < size; ++j)
-                {
-                    xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
-                    ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
-                }
-            }
-
-
-            remap(src, dst, xmap, ymap, interpolation, borderMode);
-
-            CPU_ON;
-            remap(src, dst, xmap, ymap, interpolation, borderMode);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            d_src.upload(src);
-            d_dst.upload(dst);
-            d_xmap.upload(xmap);
-            d_ymap.upload(ymap);
-
-            WARMUP_ON;
-            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// cvtColor////////////////////////
-TEST(cvtColor)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_8UC4};
-    std::string type_name[] = {"CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            gen(src, size, size, all_type[j], 0, 256);
-            SUBTEST << size << "x" << size << "; " << type_name[j] << " ; CV_RGBA2GRAY";
-
-            cvtColor(src, dst, CV_RGBA2GRAY, 4);
-
-            CPU_ON;
-            cvtColor(src, dst, CV_RGBA2GRAY, 4);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-
-    }
-
-
-}
-///////////// filter2D////////////////////////
-TEST(filter2D)
-{
-    Mat src;
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        int all_type[] = {CV_8UC1, CV_8UC4};
-        std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            gen(src, size, size, all_type[j], 0, 256);
-
-            for (int ksize = 3; ksize <= 15; ksize = 2*ksize+1)
-            {
-                SUBTEST << "ksize = " << ksize << "; " << size << 'x' << size << "; " << type_name[j] ;
-
-                Mat kernel;
-                gen(kernel, ksize, ksize, CV_32FC1, 0.0, 1.0);
-
-                Mat dst;
-                cv::filter2D(src, dst, -1, kernel);
-
-                CPU_ON;
-                cv::filter2D(src, dst, -1, kernel);
-                CPU_OFF;
-#ifdef USE_OPENCL
-                ocl::oclMat d_src(src);
-                ocl::oclMat d_dst;
-
-                WARMUP_ON;
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                WARMUP_OFF;
-
-                GPU_ON;
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                GPU_OFF;
-
-                GPU_FULL_ON;
-                d_src.upload(src);
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                d_dst.download(dst);
-                GPU_FULL_OFF;
-#endif
-            }
-
-        }
-
-
-    }
-}
-
-
-///////////// dft ////////////////////////
-TEST(dft)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-
-    int all_type[] = {CV_32FC1, CV_32FC2};
-    std::string type_name[] = {"CV_32FC1", "CV_32FC2"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; complex-to-complex";
-
-            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(1));
-
-            dft(src, dst);
-
-            CPU_ON;
-            dft(src, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::dft(d_src, d_dst, Size(size, size));
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::dft(d_src, d_dst, Size(size, size));
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::dft(d_src, d_dst, Size(size, size));
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// gemm ////////////////////////
-TEST(gemm)
-{
-    Mat src1, src2, src3, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_src3, d_dst;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size;
-
-        gen(src1, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
-        gen(src2, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
-        gen(src3, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
-
-        gemm(src1, src2, 1.0, src3, 1.0, dst);
-
-        CPU_ON;
-        gemm(src1, src2, 1.0, src3, 1.0, dst);
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        d_src1.upload(src1);
-        d_src2.upload(src2);
-        d_src3.upload(src3);
-
-        WARMUP_ON;
-        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src1.upload(src1);
-        d_src2.upload(src2);
-        d_src3.upload(src3);
-        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-
-int main(int argc, const char *argv[])
-{
-#ifdef USE_OPENCL
-    vector<ocl::Info> oclinfo;
-    int num_devices = getDevice(oclinfo);
-
-    if (num_devices < 1)
-    {
-        cerr << "no device found\n";
-        return -1;
-    }
-
-    int devidx = 0;
-
-    for (size_t i = 0; i < oclinfo.size(); i++)
-    {
-        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
-        {
-            printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str());
-        }
-    }
-
-#endif
-    redirectError(cvErrorCallback);
-
-    const char *keys =
-        "{ h help    | false | print help message }"
-        "{ f filter  |       | filter for test }"
-        "{ w workdir |       | set working directory }"
-        "{ l list    | false | show all tests }"
-        "{ d device  | 0     | device id }"
-        "{ i iters   | 10    | iteration count }"
-        "{ m warmup  | 1     | gpu warm up iteration count}"
-        "{ t xtop    | 1.1   | xfactor top boundary}"
-        "{ b xbottom | 0.9   | xfactor bottom boundary}"
-        "{ v verify  | false | only run gpu once to verify if problems occur}";
-
-    CommandLineParser cmd(argc, argv, keys);
-
-    if (cmd.get<bool>("help"))
-    {
-        cout << "Avaible options:" << endl;
-        cmd.printMessage();
-        return 0;
-    }
-
-#ifdef USE_OPENCL
-    int device = cmd.get<int>("device");
-
-    if (device < 0 || device >= num_devices)
-    {
-        cerr << "Invalid device ID" << endl;
-        return -1;
-    }
-
-    if (cmd.get<bool>("verify"))
-    {
-        TestSystem::instance().setNumIters(1);
-        TestSystem::instance().setGPUWarmupIters(0);
-        TestSystem::instance().setCPUIters(0);
-    }
-
-    devidx = 0;
-
-    for (size_t i = 0; i < oclinfo.size(); i++)
-    {
-        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
-        {
-            if (device == devidx)
-            {
-                ocl::setDevice(oclinfo[i], (int)j);
-                TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
-                printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
-                goto END_DEV;
-            }
-        }
-    }
-
-END_DEV:
-
-#endif
-    string filter = cmd.get<string>("filter");
-    string workdir = cmd.get<string>("workdir");
-    bool list = cmd.get<bool>("list");
-    int iters = cmd.get<int>("iters");
-    int wu_iters = cmd.get<int>("warmup");
-    double x_top = cmd.get<double>("xtop");
-    double x_bottom = cmd.get<double>("xbottom");
-
-    TestSystem::instance().setTopThreshold(x_top);
-    TestSystem::instance().setBottomThreshold(x_bottom);
-
-    if (!filter.empty())
-    {
-        TestSystem::instance().setTestFilter(filter);
-    }
-
-    if (!workdir.empty())
-    {
-        if (workdir[workdir.size() - 1] != '/' && workdir[workdir.size() - 1] != '\\')
-        {
-            workdir += '/';
-        }
-
-        TestSystem::instance().setWorkingDir(workdir);
-    }
-
-    if (list)
-    {
-        TestSystem::instance().setListMode(true);
-    }
-
-    TestSystem::instance().setNumIters(iters);
-    TestSystem::instance().setGPUWarmupIters(wu_iters);
-
-    TestSystem::instance().run();
-
-    return 0;
-}
diff --git a/samples/python2/dft.py b/samples/python2/dft.py
new file mode 100644
index 0000000000..9aac53a884
--- /dev/null
+++ b/samples/python2/dft.py
@@ -0,0 +1,100 @@
+#/usr/bin/env python
+
+import cv2
+import numpy as np
+import sys
+
+
+def shift_dft(src, dst=None):
+    '''
+        Rearrange the quadrants of Fourier image so that the origin is at
+        the image center. Swaps quadrant 1 with 3, and 2 with 4.
+
+        src and dst arrays must be equal size & type
+    '''
+
+    if dst is None:
+        dst = np.empty(src.shape, src.dtype)
+    elif src.shape != dst.shape:
+        raise ValueError("src and dst must have equal sizes")
+    elif src.dtype != dst.dtype:
+        raise TypeError("src and dst must have equal types")
+
+    if src is dst:
+        ret = np.empty(src.shape, src.dtype)
+    else:
+        ret = dst
+
+    h, w = src.shape[:2]
+
+    cx1 = cx2 = w/2
+    cy1 = cy2 = h/2
+
+    # if the size is odd, then adjust the bottom/right quadrants
+    if w % 2 != 0:
+        cx2 += 1
+    if h % 2 != 0:
+        cy2 += 1
+
+    # swap quadrants
+
+    # swap q1 and q3
+    ret[h-cy1:, w-cx1:] = src[0:cy1 , 0:cx1 ]   # q1 -> q3
+    ret[0:cy2 , 0:cx2 ] = src[h-cy2:, w-cx2:]   # q3 -> q1
+
+    # swap q2 and q4
+    ret[0:cy2 , w-cx2:] = src[h-cy2:, 0:cx2 ]   # q2 -> q4
+    ret[h-cy1:, 0:cx1 ] = src[0:cy1 , w-cx1:]   # q4 -> q2
+
+    if src is dst:
+        dst[:,:] = ret
+
+    return dst
+
+if __name__ == "__main__":
+
+    if len(sys.argv)>1:
+        im = cv2.imread(sys.argv[1])
+    else :
+        im = cv2.imread('../c/baboon.jpg')
+        print "usage : python dft.py <image_file>"
+
+    # convert to grayscale
+    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
+    h, w = im.shape[:2]
+
+    realInput = im.astype(np.float64)
+
+    # perform an optimally sized dft
+    dft_M = cv2.getOptimalDFTSize(w)
+    dft_N = cv2.getOptimalDFTSize(h)
+
+    # copy A to dft_A and pad dft_A with zeros
+    dft_A = np.zeros((dft_N, dft_M, 2), dtype=np.float64)
+    dft_A[:h, :w, 0] = realInput
+
+    # no need to pad bottom part of dft_A with zeros because of
+    # use of nonzeroRows parameter in cv2.dft()
+    cv2.dft(dft_A, dst=dft_A, nonzeroRows=h)
+
+    cv2.imshow("win", im)
+
+    # Split fourier into real and imaginary parts
+    image_Re, image_Im = cv2.split(dft_A)
+
+    # Compute the magnitude of the spectrum Mag = sqrt(Re^2 + Im^2)
+    magnitude = cv2.sqrt(image_Re**2.0 + image_Im**2.0)
+
+    # Compute log(1 + Mag)
+    log_spectrum = cv2.log(1.0 + magnitude)
+
+    # Rearrange the quadrants of Fourier image so that the origin is at
+    # the image center
+    shift_dft(log_spectrum, log_spectrum)
+
+    # normalize and display the results as rgb
+    cv2.normalize(log_spectrum, log_spectrum, 0.0, 1.0, cv2.cv.CV_MINMAX)
+    cv2.imshow("magnitude", log_spectrum)
+
+    cv2.waitKey(0)
+    cv2.destroyAllWindows()