Merge branch 'master' into python2and3

pull/2912/head
Michael Pratt 10 years ago committed by arc
commit c9876c3cd8
Changed files (lines changed in parentheses; BIN = binary file):

  1. 3rdparty/libjpeg/CMakeLists.txt (2)
  2. 3rdparty/readme.txt (10)
  3. CMakeLists.txt (12)
  4. apps/traincascade/CMakeLists.txt (2)
  5. apps/traincascade/imagestorage.cpp (3)
  6. apps/traincascade/imagestorage.h (3)
  7. apps/traincascade/traincascade.cpp (7)
  8. cmake/OpenCVConfig.cmake (2)
  9. cmake/OpenCVFindLibsVideo.cmake (20)
  10. cmake/OpenCVFindOpenNI2.cmake (61)
  11. cmake/OpenCVGenInfoPlist.cmake (3)
  12. cmake/OpenCVModule.cmake (10)
  13. cmake/OpenCVUtils.cmake (3)
  14. cmake/templates/OpenCVConfig.cmake.in (2)
  15. cmake/templates/cvconfig.h.in (3)
  16. doc/CMakeLists.txt (2)
  17. doc/check_docs.py (4)
  18. doc/conf.py (12)
  19. doc/opencv_cheatsheet.tex (6)
  20. doc/py_tutorials/py_gui/py_video_display/py_video_display.rst (2)
  21. doc/tutorials/core/discrete_fourier_transform/discrete_fourier_transform.rst (2)
  22. doc/tutorials/core/how_to_scan_images/how_to_scan_images.rst (12)
  23. doc/tutorials/core/interoperability_with_OpenCV_1/interoperability_with_OpenCV_1.rst (10)
  24. doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst (2)
  25. doc/tutorials/core/table_of_content_core/table_of_content_core.rst (3)
  26. doc/tutorials/features2d/akaze_matching/akaze_matching.rst (161)
  27. doc/tutorials/features2d/akaze_matching/images/graf.png (BIN)
  28. doc/tutorials/features2d/akaze_matching/images/res.png (BIN)
  29. doc/tutorials/features2d/table_of_content_features2d/images/AKAZE_Match_Tutorial_Cover.png (BIN)
  30. doc/tutorials/features2d/table_of_content_features2d/table_of_content_features2d.rst (20)
  31. doc/tutorials/highgui/video-input-psnr-ssim/video-input-psnr-ssim.rst (2)
  32. doc/tutorials/introduction/clojure_dev_intro/clojure_dev_intro.rst (2)
  33. doc/tutorials/introduction/desktop_java/java_dev_intro.rst (6)
  34. doc/tutorials/introduction/display_image/display_image.rst (16)
  35. doc/tutorials/introduction/how_to_write_a_tutorial/how_to_write_a_tutorial.rst (10)
  36. doc/tutorials/introduction/load_save_image/load_save_image.rst (12)
  37. doc/tutorials/introduction/windows_visual_studio_image_watch/windows_visual_studio_image_watch.rst (2)
  38. doc/tutorials/ios/video_processing/video_processing.rst (2)
  39. doc/tutorials/ml/non_linear_svms/non_linear_svms.rst (2)
  40. doc/user_guide/ug_traincascade.rst (6)
  41. include/opencv2/opencv.hpp (2)
  42. modules/calib3d/perf/perf_precomp.hpp (2)
  43. modules/calib3d/test/test_precomp.hpp (2)
  44. modules/core/doc/basic_structures.rst (2)
  45. modules/core/doc/drawing_functions.rst (31)
  46. modules/core/doc/intro.rst (3)
  47. modules/core/include/opencv2/core.hpp (4)
  48. modules/core/include/opencv2/core/cvdef.h (1)
  49. modules/core/include/opencv2/core/mat.hpp (4)
  50. modules/core/include/opencv2/core/mat.inl.hpp (9)
  51. modules/core/perf/opencl/perf_arithm.cpp (17)
  52. modules/core/perf/opencl/perf_dxt.cpp (39)
  53. modules/core/perf/opencl/perf_matop.cpp (1)
  54. modules/core/src/arithm.cpp (313)
  55. modules/core/src/convert.cpp (188)
  56. modules/core/src/cuda_buffer_pool.cpp (31)
  57. modules/core/src/cuda_stream.cpp (16)
  58. modules/core/src/drawing.cpp (18)
  59. modules/core/src/dxt.cpp (412)
  60. modules/core/src/mathfuncs.cpp (38)
  61. modules/core/src/matrix.cpp (160)
  62. modules/core/src/ocl.cpp (34)
  63. modules/core/src/opencl/fft.cl (864)
  64. modules/core/src/opencl/inrange.cl (57)
  65. modules/core/src/opencl/lut.cl (136)
  66. modules/core/src/opencl/minmaxloc.cl (86)
  67. modules/core/src/opencl/reduce.cl (2)
  68. modules/core/src/opencl/reduce2.cl (60)
  69. modules/core/src/opencl/set_identity.cl (38)
  70. modules/core/src/opencl/transpose.cl (24)
  71. modules/core/src/stat.cpp (23)
  72. modules/core/test/ocl/test_dft.cpp (65)
  73. modules/cudabgsegm/perf/perf_bgsegm.cpp (8)
  74. modules/cudabgsegm/test/test_bgsegm.cpp (6)
  75. modules/cudacodec/CMakeLists.txt (2)
  76. modules/cudastereo/src/cuda/disparity_bilateral_filter.cu (46)
  77. modules/cudastereo/src/cuda/disparity_bilateral_filter.hpp (8)
  78. modules/cudastereo/src/cuda/stereocsbp.cu (442)
  79. modules/cudastereo/src/cuda/stereocsbp.hpp (29)
  80. modules/cudastereo/src/disparity_bilateral_filter.cpp (15)
  81. modules/cudastereo/src/stereocsbp.cpp (64)
  82. modules/cudev/test/CMakeLists.txt (2)
  83. modules/features2d/doc/feature_detection_and_description.rst (44)
  84. modules/features2d/include/opencv2/features2d.hpp (39)
  85. modules/features2d/perf/perf_precomp.hpp (2)
  86. modules/features2d/src/akaze.cpp (52)
  87. modules/features2d/src/akaze/AKAZEFeatures.cpp (1941)
  88. modules/features2d/src/akaze/AKAZEFeatures.h (65)
  89. modules/features2d/src/kaze.cpp (17)
  90. modules/features2d/src/kaze/AKAZEConfig.h (45)
  91. modules/features2d/src/kaze/AKAZEFeatures.cpp (1880)
  92. modules/features2d/src/kaze/AKAZEFeatures.h (62)
  93. modules/features2d/src/kaze/KAZEConfig.h (43)
  94. modules/features2d/src/kaze/KAZEFeatures.cpp (804)
  95. modules/features2d/src/kaze/KAZEFeatures.h (69)
  96. modules/features2d/src/kaze/TEvolution.h (35)
  97. modules/features2d/src/kaze/fed.h (6)
  98. modules/features2d/src/kaze/nldiffusion_functions.h (4)
  99. modules/features2d/src/kaze/utils.h (77)
  100. modules/features2d/test/test_descriptors_regression.cpp (41)

Some files were not shown because too many files have changed in this diff.

@ -9,7 +9,7 @@ ocv_include_directories(${CMAKE_CURRENT_SOURCE_DIR})
file(GLOB lib_srcs *.c)
file(GLOB lib_hdrs *.h)
if(ANDROID OR IOS)
if(ANDROID OR IOS OR APPLE)
ocv_list_filterout(lib_srcs jmemansi.c)
else()
ocv_list_filterout(lib_srcs jmemnobs.c)

@ -1,5 +1,5 @@
This folder contains libraries and headers of a few very popular still image codecs
used by highgui module.
used by imgcodecs module.
The libraries and headers are preferably to build Win32 and Win64 versions of OpenCV.
On UNIX systems all the libraries are automatically detected by configure script.
In order to use these versions of libraries instead of system ones on UNIX systems you
@ -11,7 +11,7 @@ libjpeg 8d (8.4) - The Independent JPEG Group's JPEG software.
See IGJ home page http://www.ijg.org
for details and links to the source code
HAVE_JPEG preprocessor flag must be set to make highgui use libjpeg.
HAVE_JPEG preprocessor flag must be set to make imgcodecs use libjpeg.
On UNIX systems configure script takes care of it.
------------------------------------------------------------------------------------
libpng 1.5.12 - Portable Network Graphics library.
@ -19,7 +19,7 @@ libpng 1.5.12 - Portable Network Graphics library.
See libpng home page http://www.libpng.org
for details and links to the source code
HAVE_PNG preprocessor flag must be set to make highgui use libpng.
HAVE_PNG preprocessor flag must be set to make imgcodecs use libpng.
On UNIX systems configure script takes care of it.
------------------------------------------------------------------------------------
libtiff 4.0.2 - Tag Image File Format (TIFF) Software
@ -28,7 +28,7 @@ libtiff 4.0.2 - Tag Image File Format (TIFF) Software
See libtiff home page http://www.remotesensing.org/libtiff/
for details and links to the source code
HAVE_TIFF preprocessor flag must be set to make highgui use libtiff.
HAVE_TIFF preprocessor flag must be set to make imgcodecs use libtiff.
On UNIX systems configure script takes care of it.
In this build support for ZIP (LZ77 compression) is turned on.
------------------------------------------------------------------------------------
@ -37,7 +37,7 @@ zlib 1.2.7 - General purpose LZ77 compression library
See zlib home page http://www.zlib.net
for details and links to the source code
No preprocessor definition is needed to make highgui use this library -
No preprocessor definition is needed to make imgcodecs use this library -
it is included automatically if either libpng or libtiff are used.
------------------------------------------------------------------------------------
jasper-1.900.1 - JasPer is a collection of software

@ -6,6 +6,8 @@
#
# ----------------------------------------------------------------------------
include(cmake/OpenCVMinDepVersions.cmake)
if(CMAKE_GENERATOR MATCHES Xcode AND XCODE_VERSION VERSION_GREATER 4.3)
@ -135,6 +137,7 @@ OCV_OPTION(WITH_WEBP "Include WebP support" ON
OCV_OPTION(WITH_OPENEXR "Include ILM support via OpenEXR" ON IF (NOT IOS) )
OCV_OPTION(WITH_OPENGL "Include OpenGL support" OFF IF (NOT ANDROID) )
OCV_OPTION(WITH_OPENNI "Include OpenNI support" OFF IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_OPENNI2 "Include OpenNI2 support" OFF IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_PNG "Include PNG support" ON)
OCV_OPTION(WITH_PVAPI "Include Prosilica GigE support" ON IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_GIGEAPI "Include Smartek GigE support" ON IF (NOT ANDROID AND NOT IOS) )
@ -148,8 +151,8 @@ OCV_OPTION(WITH_TIFF "Include TIFF support" ON
OCV_OPTION(WITH_UNICAP "Include Unicap support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_V4L "Include Video 4 Linux support" ON IF (UNIX AND NOT ANDROID) )
OCV_OPTION(WITH_LIBV4L "Use libv4l for Video 4 Linux support" ON IF (UNIX AND NOT ANDROID) )
OCV_OPTION(WITH_DSHOW "Build HighGUI with DirectShow support" ON IF (WIN32 AND NOT ARM) )
OCV_OPTION(WITH_MSMF "Build HighGUI with Media Foundation support" OFF IF WIN32 )
OCV_OPTION(WITH_DSHOW "Build VideoIO with DirectShow support" ON IF (WIN32 AND NOT ARM) )
OCV_OPTION(WITH_MSMF "Build VideoIO with Media Foundation support" OFF IF WIN32 )
OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) )
OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_CLP "Include Clp support (EPL)" OFF)
@ -865,6 +868,11 @@ if(DEFINED WITH_OPENNI)
THEN "YES (${OPENNI_PRIME_SENSOR_MODULE})" ELSE NO)
endif(DEFINED WITH_OPENNI)
if(DEFINED WITH_OPENNI2)
status(" OpenNI2:" HAVE_OPENNI2 THEN "YES (ver ${OPENNI2_VERSION_STRING}, build ${OPENNI2_VERSION_BUILD})"
ELSE NO)
endif(DEFINED WITH_OPENNI2)
if(DEFINED WITH_PVAPI)
status(" PvAPI:" HAVE_PVAPI THEN YES ELSE NO)
endif(DEFINED WITH_PVAPI)

@ -1,4 +1,4 @@
set(OPENCV_TRAINCASCADE_DEPS opencv_core opencv_ml opencv_imgproc opencv_photo opencv_objdetect opencv_highgui opencv_calib3d opencv_video opencv_features2d)
set(OPENCV_TRAINCASCADE_DEPS opencv_core opencv_ml opencv_imgproc opencv_photo opencv_objdetect opencv_imgcodecs opencv_videoio opencv_highgui opencv_calib3d opencv_video opencv_features2d)
ocv_check_dependencies(${OPENCV_TRAINCASCADE_DEPS})
if(NOT OCV_DEPENDENCIES_FOUND)

@ -1,6 +1,7 @@
#include "opencv2/core.hpp"
#include "opencv2/core/core_c.h"
#include "opencv2/imgproc.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgcodecs.hpp"
#include "imagestorage.h"
#include <stdio.h>

@ -1,9 +1,6 @@
#ifndef _OPENCV_IMAGESTORAGE_H_
#define _OPENCV_IMAGESTORAGE_H_
#include "highgui.h"
class CvCascadeImageReader
{

@ -13,6 +13,7 @@ int main( int argc, char* argv[] )
int numPos = 2000;
int numNeg = 1000;
int numStages = 20;
int numThreads = getNumThreads();
int precalcValBufSize = 256,
precalcIdxBufSize = 256;
bool baseFormatSave = false;
@ -36,6 +37,7 @@ int main( int argc, char* argv[] )
cout << " [-precalcValBufSize <precalculated_vals_buffer_size_in_Mb = " << precalcValBufSize << ">]" << endl;
cout << " [-precalcIdxBufSize <precalculated_idxs_buffer_size_in_Mb = " << precalcIdxBufSize << ">]" << endl;
cout << " [-baseFormatSave]" << endl;
cout << " [-numThreads <max_number_of_threads = " << numThreads << ">]" << endl;
cascadeParams.printDefaults();
stageParams.printDefaults();
for( int fi = 0; fi < fc; fi++ )
@ -82,6 +84,10 @@ int main( int argc, char* argv[] )
{
baseFormatSave = true;
}
else if( !strcmp( argv[i], "-numThreads" ) )
{
numThreads = atoi(argv[++i]);
}
else if ( cascadeParams.scanAttr( argv[i], argv[i+1] ) ) { i++; }
else if ( stageParams.scanAttr( argv[i], argv[i+1] ) ) { i++; }
else if ( !set )
@ -98,6 +104,7 @@ int main( int argc, char* argv[] )
}
}
setNumThreads( numThreads );
classifier.train( cascadeDirName,
vecName,
bgName,

@ -11,7 +11,7 @@
#
# Or you can search for specific OpenCV modules:
#
# FIND_PACKAGE(OpenCV REQUIRED core highgui)
# FIND_PACKAGE(OpenCV REQUIRED core imgcodecs)
#
# If the module is found then OPENCV_<MODULE>_FOUND is set to TRUE.
#

@ -131,7 +131,7 @@ if(WITH_1394)
if(HAVE_DC1394_2)
ocv_parse_pkg("libdc1394-2" "${DC1394_2_LIB_DIR}/pkgconfig" "")
ocv_include_directories(${DC1394_2_INCLUDE_PATH})
set(HIGHGUI_LIBRARIES ${HIGHGUI_LIBRARIES}
set(VIDEOIO_LIBRARIES ${VIDEOIO_LIBRARIES}
"${DC1394_2_LIB_DIR}/libdc1394.a"
"${CMU1394_LIB_DIR}/lib1394camera.a")
endif(HAVE_DC1394_2)
@ -166,6 +166,11 @@ if(WITH_OPENNI)
include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindOpenNI.cmake")
endif(WITH_OPENNI)
ocv_clear_vars(HAVE_OPENNI2)
if(WITH_OPENNI2)
include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindOpenNI2.cmake")
endif(WITH_OPENNI2)
# --- XIMEA ---
ocv_clear_vars(HAVE_XIMEA)
if(WITH_XIMEA)
@ -234,7 +239,7 @@ if(WITH_FFMPEG)
endif()
endif(FFMPEG_INCLUDE_DIR)
if(HAVE_FFMPEG)
set(HIGHGUI_LIBRARIES ${HIGHGUI_LIBRARIES} "${FFMPEG_LIB_DIR}/libavcodec.a"
set(VIDEOIO_LIBRARIES ${VIDEOIO_LIBRARIES} "${FFMPEG_LIB_DIR}/libavcodec.a"
"${FFMPEG_LIB_DIR}/libavformat.a" "${FFMPEG_LIB_DIR}/libavutil.a"
"${FFMPEG_LIB_DIR}/libswscale.a")
ocv_include_directories(${FFMPEG_INCLUDE_DIR})
@ -253,14 +258,15 @@ if(WITH_MSMF)
check_include_file(Mfapi.h HAVE_MSMF)
endif(WITH_MSMF)
# --- Extra HighGUI libs on Windows ---
# --- Extra HighGUI and VideoIO libs on Windows ---
if(WIN32)
list(APPEND HIGHGUI_LIBRARIES comctl32 gdi32 ole32 setupapi ws2_32 vfw32)
list(APPEND HIGHGUI_LIBRARIES comctl32 gdi32 ole32 setupapi ws2_32)
list(APPEND VIDEOIO_LIBRARIES vfw32)
if(MINGW64)
list(APPEND HIGHGUI_LIBRARIES avifil32 avicap32 winmm msvfw32)
list(REMOVE_ITEM HIGHGUI_LIBRARIES vfw32)
list(APPEND VIDEOIO_LIBRARIES avifil32 avicap32 winmm msvfw32)
list(REMOVE_ITEM VIDEOIO_LIBRARIES vfw32)
elseif(MINGW)
list(APPEND HIGHGUI_LIBRARIES winmm)
list(APPEND VIDEOIO_LIBRARIES winmm)
endif()
endif(WIN32)

@ -0,0 +1,61 @@
# Main variables:
# OPENNI2_LIBRARY and OPENNI2_INCLUDES to link OpenCV modules with OpenNI2
# HAVE_OPENNI2 for conditional compilation OpenCV with/without OpenNI2
if(NOT "${OPENNI2_LIB_DIR}" STREQUAL "${OPENNI2_LIB_DIR_INTERNAL}")
unset(OPENNI2_LIBRARY CACHE)
unset(OPENNI2_LIB_DIR CACHE)
endif()
if(NOT "${OPENNI2_INCLUDE_DIR}" STREQUAL "${OPENNI2_INCLUDE_DIR_INTERNAL}")
unset(OPENNI2_INCLUDES CACHE)
unset(OPENNI2_INCLUDE_DIR CACHE)
endif()
if(WIN32)
if(NOT (MSVC64 OR MINGW64))
find_file(OPENNI2_INCLUDES "OpenNI.h" PATHS "$ENV{OPEN_NI_INSTALL_PATH}Include" DOC "OpenNI2 c++ interface header")
find_library(OPENNI2_LIBRARY "OpenNI2" PATHS $ENV{OPENNI2_LIB} DOC "OpenNI2 library")
else()
find_file(OPENNI2_INCLUDES "OpenNI.h" PATHS "$ENV{OPEN_NI_INSTALL_PATH64}Include" DOC "OpenNI2 c++ interface header")
find_library(OPENNI2_LIBRARY "OpenNI2" PATHS $ENV{OPENNI2_LIB64} DOC "OpenNI2 library")
endif()
elseif(UNIX OR APPLE)
find_file(OPENNI2_INCLUDES "OpenNI.h" PATHS "/usr/include/ni2" "/usr/include/openni2" DOC "OpenNI2 c++ interface header")
find_library(OPENNI2_LIBRARY "OpenNI2" PATHS "/usr/lib" DOC "OpenNI2 library")
endif()
if(OPENNI2_LIBRARY AND OPENNI2_INCLUDES)
set(HAVE_OPENNI2 TRUE)
endif() #if(OPENNI2_LIBRARY AND OPENNI2_INCLUDES)
get_filename_component(OPENNI2_LIB_DIR "${OPENNI2_LIBRARY}" PATH)
get_filename_component(OPENNI2_INCLUDE_DIR ${OPENNI2_INCLUDES} PATH)
if(HAVE_OPENNI2)
set(OPENNI2_LIB_DIR "${OPENNI2_LIB_DIR}" CACHE PATH "Path to OpenNI2 libraries" FORCE)
set(OPENNI2_INCLUDE_DIR "${OPENNI2_INCLUDE_DIR}" CACHE PATH "Path to OpenNI2 headers" FORCE)
endif()
if(OPENNI2_LIBRARY)
set(OPENNI2_LIB_DIR_INTERNAL "${OPENNI2_LIB_DIR}" CACHE INTERNAL "This is the value of the last time OPENNI_LIB_DIR was set successfully." FORCE)
else()
message( WARNING, " OpenNI2 library directory (set by OPENNI2_LIB_DIR variable) is not found or does not have OpenNI2 libraries." )
endif()
if(OPENNI2_INCLUDES)
set(OPENNI2_INCLUDE_DIR_INTERNAL "${OPENNI2_INCLUDE_DIR}" CACHE INTERNAL "This is the value of the last time OPENNI2_INCLUDE_DIR was set successfully." FORCE)
else()
message( WARNING, " OpenNI2 include directory (set by OPENNI2_INCLUDE_DIR variable) is not found or does not have OpenNI2 include files." )
endif()
mark_as_advanced(FORCE OPENNI2_LIBRARY)
mark_as_advanced(FORCE OPENNI2_INCLUDES)
if(HAVE_OPENNI2)
ocv_parse_header("${OPENNI2_INCLUDE_DIR}/OniVersion.h" ONI_VERSION_LINE ONI_VERSION_MAJOR ONI_VERSION_MINOR ONI_VERSION_MAINTENANCE ONI_VERSION_BUILD)
if(ONI_VERSION_MAJOR)
set(OPENNI2_VERSION_STRING ${ONI_VERSION_MAJOR}.${ONI_VERSION_MINOR}.${ONI_VERSION_MAINTENANCE} CACHE INTERNAL "OpenNI2 version")
set(OPENNI2_VERSION_BUILD ${ONI_VERSION_BUILD} CACHE INTERNAL "OpenNI2 build version")
endif()
endif()

@ -1,4 +1,7 @@
if(IOS)
configure_file("${OpenCV_SOURCE_DIR}/platforms/ios/Info.plist.in"
"${CMAKE_BINARY_DIR}/ios/Info.plist")
elseif(APPLE)
configure_file("${OpenCV_SOURCE_DIR}/platforms/osx/Info.plist.in"
"${CMAKE_BINARY_DIR}/osx/Info.plist")
endif()

@ -704,8 +704,8 @@ function(ocv_add_perf_tests)
if(BUILD_PERF_TESTS AND EXISTS "${perf_path}")
__ocv_parse_test_sources(PERF ${ARGN})
# opencv_highgui is required for imread/imwrite
set(perf_deps ${the_module} opencv_ts opencv_highgui ${OPENCV_PERF_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
# opencv_imgcodecs is required for imread/imwrite
set(perf_deps ${the_module} opencv_ts opencv_imgcodecs ${OPENCV_PERF_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
ocv_check_dependencies(${perf_deps})
if(OCV_DEPENDENCIES_FOUND)
@ -757,8 +757,8 @@ function(ocv_add_accuracy_tests)
if(BUILD_TESTS AND EXISTS "${test_path}")
__ocv_parse_test_sources(TEST ${ARGN})
# opencv_highgui is required for imread/imwrite
set(test_deps ${the_module} opencv_ts opencv_highgui ${OPENCV_TEST_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
# opencv_imgcodecs is required for imread/imwrite
set(test_deps ${the_module} opencv_ts opencv_imgcodecs opencv_videoio ${OPENCV_TEST_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
ocv_check_dependencies(${test_deps})
if(OCV_DEPENDENCIES_FOUND)
@ -811,7 +811,7 @@ function(ocv_add_samples)
string(REGEX REPLACE "^opencv_" "" module_id ${the_module})
if(BUILD_EXAMPLES AND EXISTS "${samples_path}")
set(samples_deps ${the_module} ${OPENCV_MODULE_${the_module}_DEPS} opencv_highgui ${ARGN})
set(samples_deps ${the_module} ${OPENCV_MODULE_${the_module}_DEPS} opencv_imgcodecs opencv_videoio opencv_highgui ${ARGN})
ocv_check_dependencies(${samples_deps})
if(OCV_DEPENDENCIES_FOUND)

@ -265,16 +265,19 @@ macro(CHECK_MODULE module_name define)
set(${define} 1)
foreach(P "${ALIAS_INCLUDE_DIRS}")
if(${P})
list(APPEND VIDEOIO_INCLUDE_DIRS ${${P}})
list(APPEND HIGHGUI_INCLUDE_DIRS ${${P}})
endif()
endforeach()
foreach(P "${ALIAS_LIBRARY_DIRS}")
if(${P})
list(APPEND VIDEOIO_LIBRARY_DIRS ${${P}})
list(APPEND HIGHGUI_LIBRARY_DIRS ${${P}})
endif()
endforeach()
list(APPEND VIDEOIO_LIBRARIES ${${ALIAS_LIBRARIES}})
list(APPEND HIGHGUI_LIBRARIES ${${ALIAS_LIBRARIES}})
endif()
endif()

@ -12,7 +12,7 @@
#
# Or you can search for specific OpenCV modules:
#
# find_package(OpenCV REQUIRED core highgui)
# find_package(OpenCV REQUIRED core videoio)
#
# If the module is found then OPENCV_<MODULE>_FOUND is set to TRUE.
#

@ -129,6 +129,9 @@
/* OpenNI library */
#cmakedefine HAVE_OPENNI
/* OpenNI library */
#cmakedefine HAVE_OPENNI2
/* PNG codec */
#cmakedefine HAVE_PNG

@ -33,7 +33,7 @@ if(BUILD_DOCS AND HAVE_SPHINX)
endif()
endforeach()
set(FIXED_ORDER_MODULES core imgproc highgui video calib3d features2d objdetect ml flann photo stitching nonfree contrib legacy)
set(FIXED_ORDER_MODULES core imgproc imgcodecs videoio highgui video calib3d features2d objdetect ml flann photo stitching nonfree contrib legacy)
list(REMOVE_ITEM BASE_MODULES ${FIXED_ORDER_MODULES})

@ -14,6 +14,8 @@ opencv_hdr_list = [
"../modules/video/include/opencv2/video/tracking.hpp",
"../modules/video/include/opencv2/video/background_segm.hpp",
"../modules/objdetect/include/opencv2/objdetect.hpp",
"../modules/imgcodecs/include/opencv2/imgcodecs.hpp",
"../modules/videoio/include/opencv2/videoio.hpp",
"../modules/highgui/include/opencv2/highgui.hpp",
]
@ -24,6 +26,8 @@ opencv_module_list = [
"features2d",
"video",
"objdetect",
"imgcodecs",
"videoio",
"highgui",
"ml"
]

@ -302,14 +302,16 @@ man_pages = [
extlinks = {
'basicstructures' : ('http://docs.opencv.org/modules/core/doc/basic_structures.html#%s', None),
'oldbasicstructures' : ('http://docs.opencv.org/modules/core/doc/old_basic_structures.html#%s', None),
'readwriteimagevideo' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html#%s', None),
'readwriteimage' : ('http://docs.opencv.org/modules/imgcodecs/doc/reading_and_writing_images.html#%s', None),
'readwritevideo' : ('http://docs.opencv.org/modules/videoio/doc/reading_and_writing_video.html#%s', None),
'operationsonarrays' : ('http://docs.opencv.org/modules/core/doc/operations_on_arrays.html#%s', None),
'utilitysystemfunctions' : ('http://docs.opencv.org/modules/core/doc/utility_and_system_functions_and_macros.html#%s', None),
'imgprocfilter' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html#%s', None),
'svms' : ('http://docs.opencv.org/modules/ml/doc/support_vector_machines.html#%s', None),
'drawingfunc' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#%s', None),
'xmlymlpers' : ('http://docs.opencv.org/modules/core/doc/xml_yaml_persistence.html#%s', None),
'hgvideo' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html#%s', None),
'rwimg' : ('http://docs.opencv.org/modules/imgcodecs/doc/reading_and_writing_images.html#%s', None),
'hgvideo' : ('http://docs.opencv.org/modules/videoio/doc/reading_and_writing_video.html#%s', None),
'gpuinit' : ('http://docs.opencv.org/modules/gpu/doc/initalization_and_information.html#%s', None),
'gpudatastructure' : ('http://docs.opencv.org/modules/gpu/doc/data_structures.html#%s', None),
'gpuopmatrices' : ('http://docs.opencv.org/modules/gpu/doc/operations_on_matrices.html#%s', None),
@ -329,8 +331,8 @@ extlinks = {
'how_to_contribute' : ('http://code.opencv.org/projects/opencv/wiki/How_to_contribute/%s', None),
'cvt_color' : ('http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=cvtcolor#cvtcolor%s', None),
'imread' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html?highlight=imread#imread%s', None),
'imwrite' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html?highlight=imwrite#imwrite%s', None),
'imread' : ('http://docs.opencv.org/modules/imgcodecs/doc/reading_and_writing_images.html?highlight=imread#imread%s', None),
'imwrite' : ('http://docs.opencv.org/modules/imgcodecs/doc/reading_and_writing_images.html?highlight=imwrite#imwrite%s', None),
'imshow' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=imshow#imshow%s', None),
'named_window' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=namedwindow#namedwindow%s', None),
'wait_key' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=waitkey#waitkey%s', None),
@ -418,7 +420,7 @@ extlinks = {
'background_subtractor' : ('http://docs.opencv.org/modules/video/doc/motion_analysis_and_object_tracking.html?highlight=backgroundsubtractor#backgroundsubtractor%s', None),
'background_subtractor_mog' : ('http://docs.opencv.org/modules/video/doc/motion_analysis_and_object_tracking.html?highlight=backgroundsubtractorMOG#backgroundsubtractormog%s', None),
'background_subtractor_mog_two' : ('http://docs.opencv.org/modules/video/doc/motion_analysis_and_object_tracking.html?highlight=backgroundsubtractorMOG2#backgroundsubtractormog2%s', None),
'video_capture' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html?highlight=videocapture#videocapture%s', None),
'video_capture' : ('http://docs.opencv.org/modules/videoio/doc/reading_and_writing_video.html?highlight=videocapture#videocapture%s', None),
'ippa_convert': ('http://docs.opencv.org/modules/core/doc/ipp_async_converters.html#%s', None),
'ptr':('http://docs.opencv.org/modules/core/doc/basic_structures.html?highlight=Ptr#Ptr%s', None)
}

@ -522,9 +522,9 @@ samples on what are the contours and how to use them.
\begin{tabbing}
\textbf{Wr}\=\textbf{iting and reading raster images}\\
\texttt{\href{http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html\#imwrite}{imwrite}("myimage.jpg", image);}\\
\texttt{Mat image\_color\_copy = \href{http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html\#imread}{imread}("myimage.jpg", 1);}\\
\texttt{Mat image\_grayscale\_copy = \href{http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html\#imread}{imread}("myimage.jpg", 0);}\\
\texttt{\href{http://docs.opencv.org/modules/imgcodecs/doc/reading_and_writing_images.html\#imwrite}{imwrite}("myimage.jpg", image);}\\
\texttt{Mat image\_color\_copy = \href{http://docs.opencv.org/modules/imgcodecs/doc/reading_and_writing_images.html\#imread}{imread}("myimage.jpg", 1);}\\
\texttt{Mat image\_grayscale\_copy = \href{http://docs.opencv.org/modules/imgcodecs/doc/reading_and_writing_images.html\#imread}{imread}("myimage.jpg", 0);}\\
\end{tabbing}
\emph{The functions can read/write images in the following formats: \textbf{BMP (.bmp), JPEG (.jpg, .jpeg), TIFF (.tif, .tiff), PNG (.png), PBM/PGM/PPM (.p?m), Sun Raster (.sr), JPEG 2000 (.jp2)}. Every format supports 8-bit, 1- or 3-channel images. Some formats (PNG, JPEG 2000) support 16 bits per channel.}

@ -46,7 +46,7 @@ To capture a video, you need to create a **VideoCapture** object. Its argument c
Sometimes, ``cap`` may not have initialized the capture. In that case, this code shows an error. You can check whether it is initialized with the method **cap.isOpened()**. If it returns True, everything is fine; otherwise open the capture with **cap.open()**.
You can also access some of the features of this video using **cap.get(propId)** method where propId is a number from 0 to 18. Each number denotes a property of the video (if it is applicable to that video) and full details can be seen here: `Property Identifier <http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html#videocapture-get>`_. Some of these values can be modified using **cap.set(propId, value)**. Value is the new value you want.
You can also access some of the features of this video using **cap.get(propId)** method where propId is a number from 0 to 18. Each number denotes a property of the video (if it is applicable to that video) and full details can be seen here: `Property Identifier <http://docs.opencv.org/modules/highgui/doc/reading_and_writing_video.html#videocapture-get>`_. Some of these values can be modified using **cap.set(propId, value)**. Value is the new value you want.
For example, I can check the frame width and height by ``cap.get(3)`` and ``cap.get(4)``. It gives me 640x480 by default. But I want to modify it to 320x240. Just use ``ret = cap.set(3,320)`` and ``ret = cap.set(4,240)``.
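As an aside (this is the Python tutorial, but the same properties are exposed through the C++ ``VideoCapture`` class), here is a minimal C++ sketch of the get/set calls described above, assuming the OpenCV 3.x constant names ``CAP_PROP_FRAME_WIDTH`` and ``CAP_PROP_FRAME_HEIGHT``:

.. code-block:: cpp

    #include <opencv2/videoio.hpp>
    #include <iostream>

    int main()
    {
        cv::VideoCapture cap(0);                               // open the default camera
        if (!cap.isOpened())
            return -1;

        // CAP_PROP_FRAME_WIDTH and CAP_PROP_FRAME_HEIGHT correspond to propId 3 and 4.
        double w = cap.get(cv::CAP_PROP_FRAME_WIDTH);
        double h = cap.get(cv::CAP_PROP_FRAME_HEIGHT);
        std::cout << "default size: " << w << "x" << h << std::endl;

        // Ask for 320x240; the backend may refuse, so check the return values.
        bool ok = cap.set(cv::CAP_PROP_FRAME_WIDTH, 320)
               && cap.set(cv::CAP_PROP_FRAME_HEIGHT, 240);
        std::cout << "resize accepted: " << std::boolalpha << ok << std::endl;
        return 0;
    }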

@ -25,7 +25,7 @@ Here's a sample usage of :operationsonarrays:`dft() <dft>` :
:language: cpp
:linenos:
:tab-width: 4
:lines: 1-3, 5, 19-20, 23-78
:lines: 1-4, 6, 20-21, 24-79
Explanation
===========

@ -45,7 +45,7 @@ The final argument is optional. If given the image will be loaded in gray scale
.. literalinclude:: ../../../../samples/cpp/tutorial_code/core/how_to_scan_images/how_to_scan_images.cpp
:language: cpp
:tab-width: 4
:lines: 48-60
:lines: 49-61
Here we first use the C++ *stringstream* class to convert the third command line argument from text to an integer. Then we use a simple loop and the formula above to calculate the lookup table. No OpenCV-specific code here.
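For reference, a minimal sketch of that parsing and table computation; the variable names ``divideWith`` and ``table`` and the reduction formula ``(I / divideWith) * divideWith`` are taken as assumptions from the full sample this tutorial refers to:

.. code-block:: cpp

    #include <opencv2/core.hpp>   // for the uchar typedef
    #include <sstream>

    // Inside main(int argc, char* argv[]), after checking argc:
    int divideWith = 0;
    std::stringstream s;
    s << argv[3];                 // third command line argument
    s >> divideWith;              // text -> integer
    if (!s || !divideWith)
        return -1;                // the given number was not valid

    uchar table[256];
    for (int i = 0; i < 256; ++i) // precompute the reduced value of every possible intensity
        table[i] = (uchar)(divideWith * (i / divideWith));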
@ -99,7 +99,7 @@ When it comes to performance you cannot beat the classic C style operator[] (poi
.. literalinclude:: ../../../../samples/cpp/tutorial_code/core/how_to_scan_images/how_to_scan_images.cpp
:language: cpp
:tab-width: 4
:lines: 125-152
:lines: 126-153
Here we basically just acquire a pointer to the start of each row and go through it until it ends. In the special case that the matrix is stored in a continuous manner, we only need to request the pointer a single time and go all the way to the end. We need to look out for color images: with three channels we have to pass through three times more items in each row.
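A compact sketch of that pointer-based scan, assuming an 8-bit image and the 256-entry lookup table ``table`` from above (the function name is illustrative):

.. code-block:: cpp

    #include <opencv2/core.hpp>
    using namespace cv;

    Mat& scanImageAndReduceC(Mat& I, const uchar* const table)
    {
        CV_Assert(I.depth() == CV_8U);       // only 8-bit elements are handled here
        int nRows = I.rows;
        int nCols = I.cols * I.channels();   // every channel value counts as a column
        if (I.isContinuous())                // no gaps between rows: treat it as one long row
        {
            nCols *= nRows;
            nRows = 1;
        }
        for (int i = 0; i < nRows; ++i)
        {
            uchar* p = I.ptr<uchar>(i);      // raw pointer to the start of row i
            for (int j = 0; j < nCols; ++j)
                p[j] = table[p[j]];          // apply the lookup table in place
        }
        return I;
    }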
@ -122,7 +122,7 @@ In case of the efficient way making sure that you pass through the right amount
.. literalinclude:: ../../../../samples/cpp/tutorial_code/core/how_to_scan_images/how_to_scan_images.cpp
:language: cpp
:tab-width: 4
:lines: 154-182
:lines: 155-183
In case of color images we have three uchar items per column. This may be considered a short vector of uchar items, that has been baptized in OpenCV with the *Vec3b* name. To access the n-th sub column we use simple operator[] access. It's important to remember that OpenCV iterators go through the columns and automatically skip to the next row. Therefore in case of color images if you use a simple *uchar* iterator you'll be able to access only the blue channel values.
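An iterator-based sketch of the same operation for a 3-channel image, following the description above (again, the function name is illustrative):

.. code-block:: cpp

    #include <opencv2/core.hpp>
    using namespace cv;

    Mat& scanImageAndReduceIterator(Mat& I, const uchar* const table)
    {
        CV_Assert(I.depth() == CV_8U && I.channels() == 3);
        MatIterator_<Vec3b> it  = I.begin<Vec3b>();
        MatIterator_<Vec3b> end = I.end<Vec3b>();
        for (; it != end; ++it)              // the iterator walks the columns and skips to the next row itself
        {
            (*it)[0] = table[(*it)[0]];      // blue
            (*it)[1] = table[(*it)[1]];      // green
            (*it)[2] = table[(*it)[2]];      // red
        }
        return I;
    }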
@ -134,7 +134,7 @@ The final method isn't recommended for scanning. It was made to acquire or modif
.. literalinclude:: ../../../../samples/cpp/tutorial_code/core/how_to_scan_images/how_to_scan_images.cpp
:language: cpp
:tab-width: 4
:lines: 184-216
:lines: 185-217
The function takes your input type and coordinates and calculates the address of the queried item on the fly, then returns a reference to it. This may be constant when you *get* the value and non-constant when you *set* it. As a safety step, in **debug mode only** there is a check that your input coordinates are valid and actually exist. If they are not, you'll get a helpful message on the standard error output stream. Compared to the efficient way, in release mode the only difference is that for every element of the image you get a new row pointer, on which the C operator[] is used to acquire the column element.
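The on-the-fly address calculation described above corresponds to ``Mat::at``; a short sketch for the 3-channel case (the function name is illustrative):

.. code-block:: cpp

    #include <opencv2/core.hpp>
    using namespace cv;

    Mat& scanImageAndReduceRandomAccess(Mat& I, const uchar* const table)
    {
        CV_Assert(I.depth() == CV_8U && I.channels() == 3);
        for (int i = 0; i < I.rows; ++i)
            for (int j = 0; j < I.cols; ++j)
            {
                Vec3b& pixel = I.at<Vec3b>(i, j);  // reference to element (i, j); range-checked in debug builds
                pixel[0] = table[pixel[0]];
                pixel[1] = table[pixel[1]];
                pixel[2] = table[pixel[2]];
            }
        return I;
    }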
@ -148,14 +148,14 @@ This is a bonus method of achieving lookup table modification in an image. Becau
.. literalinclude:: ../../../../samples/cpp/tutorial_code/core/how_to_scan_images/how_to_scan_images.cpp
:language: cpp
:tab-width: 4
:lines: 107-110
:lines: 108-111
Finally call the function (I is our input image and J the output one):
.. literalinclude:: ../../../../samples/cpp/tutorial_code/core/how_to_scan_images/how_to_scan_images.cpp
:language: cpp
:tab-width: 4
:lines: 115
:lines: 116
Performance Difference
======================

@ -77,7 +77,7 @@ Now that you have the basics done :download:`here's <../../../../samples/cpp/tut
:language: cpp
:linenos:
:tab-width: 4
:lines: 1-9, 22-25, 27-44
:lines: 1-10, 23-26, 29-46
Here you can observe that with the new structure we have no pointer problems, although it is possible to use the old functions and in the end just transform the result to a *Mat* object.
@ -85,7 +85,7 @@ Here you can observe that with the new structure we have no pointer problems, al
:language: cpp
:linenos:
:tab-width: 4
:lines: 46-51
:lines: 48-53
Because we want to play with the image's luma component, we first convert from the default RGB to the YUV color space and then split the result up into separate planes. Here the program branches: in the first example it processes each plane using one of the three major image scanning techniques in OpenCV (C operator[], iterator, individual element access). In a second variant we add some Gaussian noise to the image and then mix the channels together according to a formula.
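The conversion and plane split mentioned here use the standard ``cvtColor`` and ``split`` calls; a minimal sketch, assuming the loaded image is in BGR order and the helper name is illustrative:

.. code-block:: cpp

    #include <opencv2/core.hpp>
    #include <opencv2/imgproc.hpp>
    #include <vector>
    using namespace cv;

    void splitLuma(const Mat& I, std::vector<Mat>& planes)
    {
        Mat I_YUV;
        cvtColor(I, I_YUV, COLOR_BGR2YUV);   // convert so the luma component becomes a separate channel
        split(I_YUV, planes);                // planes[0] is Y (luma); planes[1] and planes[2] are chroma
    }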
@ -95,7 +95,7 @@ The scanning version looks like:
:language: cpp
:linenos:
:tab-width: 4
:lines: 55-75
:lines: 57-77
Here you can observe that we may go through all the pixels of an image in three fashions: an iterator, a C pointer and an individual element access style. You can read a more in-depth description of these in the :ref:`howToScanImagesOpenCV` tutorial. Converting from the old function names is easy. Just remove the cv prefix and use the new *Mat* data structure. Here's an example of this by using the weighted addition function:
@ -103,7 +103,7 @@ Here you can observe that we may go through all the pixels of an image in three
:language: cpp
:linenos:
:tab-width: 4
:lines: 79-112
:lines: 81-113
As you may observe the *planes* variable is of type *Mat*. However, converting from *Mat* to *IplImage* is easy and made automatically with a simple assignment operator.
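A sketch of that implicit conversion, assuming this branch still ships the *Mat* to *IplImage* conversion operator the paragraph refers to:

.. code-block:: cpp

    #include <opencv2/core.hpp>
    #include <opencv2/core/core_c.h>   // brings in the legacy IplImage structure
    using namespace cv;

    Mat plane = Mat::zeros(480, 640, CV_8UC1);
    IplImage iplPlane = plane;         // header-only conversion: no pixel data is copied
    // &iplPlane can now be handed to legacy C-API functions expecting an IplImage*.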
@ -111,7 +111,7 @@ As you may observe the *planes* variable is of type *Mat*. However, converting f
:language: cpp
:linenos:
:tab-width: 4
:lines: 115-127
:lines: 117-129
The new *imshow* highgui function accepts both the *Mat* and *IplImage* data structures. Compile and run the program and if the first image below is your input you may get either the first or second as output:

@ -86,7 +86,7 @@ Each of the building components has their own valid domains. This leads to the d
Creating a *Mat* object explicitly
==================================
In the :ref:`Load_Save_Image` tutorial you have already learned how to write a matrix to an image file by using the :readwriteimagevideo:`imwrite() <imwrite>` function. However, for debugging purposes it's much more convenient to see the actual values. You can do this using the << operator of *Mat*. Be aware that this only works for two dimensional matrices.
In the :ref:`Load_Save_Image` tutorial you have already learned how to write a matrix to an image file by using the :readwriteimage:`imwrite() <imwrite>` function. However, for debugging purposes it's much more convenient to see the actual values. You can do this using the << operator of *Mat*. Be aware that this only works for two dimensional matrices.
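For instance, a tiny sketch of such debug printing:

.. code-block:: cpp

    #include <opencv2/core.hpp>
    #include <iostream>
    using namespace cv;

    int main()
    {
        Mat M = Mat::eye(3, 3, CV_64F);                       // a small two-dimensional matrix
        std::cout << "M = " << std::endl << M << std::endl;   // formatted output via operator<<
        return 0;
    }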
Although *Mat* works really well as an image container, it is also a general matrix class. Therefore, it is possible to create and manipulate multidimensional matrices. You can create a Mat object in multiple ways:

@ -200,7 +200,6 @@ Here you will learn the about the basic building blocks of the library. A must r
:height: 90pt
:width: 90pt
=============== ======================================================
+
.. tabularcolumns:: m{100pt} m{300pt}
.. cssclass:: toctableopencv
@ -221,8 +220,6 @@ Here you will learn the about the basic building blocks of the library. A must r
:width: 90pt
.. |Author_ElenaG| unicode:: Elena U+0020 Gvozdeva
=============== ======================================================
.. raw:: latex
\pagebreak

@ -0,0 +1,161 @@
.. _akazeMatching:
AKAZE local features matching
******************************
Introduction
------------------
In this tutorial we will learn how to use [AKAZE]_ local features to detect and match keypoints on two images.
We will find keypoints on a pair of images related by a given homography matrix,
match them and count the number of inliers (i.e. matches that fit the given homography).
You can find an expanded version of this example here: https://github.com/pablofdezalc/test_kaze_akaze_opencv
.. [AKAZE] Fast Explicit Diffusion for Accelerated Features in Nonlinear Scale Spaces. Pablo F. Alcantarilla, Jesús Nuevo and Adrien Bartoli. In British Machine Vision Conference (BMVC), Bristol, UK, September 2013.
Data
------------------
We are going to use images 1 and 3 from the *Graffiti* sequence of the Oxford dataset.
.. image:: images/graf.png
:height: 200pt
:width: 320pt
:alt: Graffity
:align: center
Homography is given by a 3 by 3 matrix:
.. code-block:: none
7.6285898e-01 -2.9922929e-01 2.2567123e+02
3.3443473e-01 1.0143901e+00 -7.6999973e+01
3.4663091e-04 -1.4364524e-05 1.0000000e+00
You can find the images (*graf1.png*, *graf3.png*) and homography (*H1to3p.xml*) in *opencv/samples/cpp*.
Source Code
===========
.. literalinclude:: ../../../../samples/cpp/tutorial_code/features2D/AKAZE_match.cpp
:language: cpp
:linenos:
:tab-width: 4
Explanation
===========
1. **Load images and homography**
.. code-block:: cpp
Mat img1 = imread("graf1.png", IMREAD_GRAYSCALE);
Mat img2 = imread("graf3.png", IMREAD_GRAYSCALE);
Mat homography;
FileStorage fs("H1to3p.xml", FileStorage::READ);
fs.getFirstTopLevelNode() >> homography;
We are loading grayscale images here. The homography is stored in the XML file created with FileStorage.
2. **Detect keypoints and compute descriptors using AKAZE**
.. code-block:: cpp
vector<KeyPoint> kpts1, kpts2;
Mat desc1, desc2;
AKAZE akaze;
akaze(img1, noArray(), kpts1, desc1);
akaze(img2, noArray(), kpts2, desc2);
We create an AKAZE object and use its *operator()* functionality. Since we don't need the *mask* parameter, *noArray()* is used.
3. **Use brute-force matcher to find 2-nn matches**
.. code-block:: cpp
BFMatcher matcher(NORM_HAMMING);
vector< vector<DMatch> > nn_matches;
matcher.knnMatch(desc1, desc2, nn_matches, 2);
We use the Hamming distance, because AKAZE uses a binary descriptor by default.
4. **Use 2-nn matches to find correct keypoint matches**
.. code-block:: cpp
for(size_t i = 0; i < nn_matches.size(); i++) {
DMatch first = nn_matches[i][0];
float dist1 = nn_matches[i][0].distance;
float dist2 = nn_matches[i][1].distance;
if(dist1 < nn_match_ratio * dist2) {
matched1.push_back(kpts1[first.queryIdx]);
matched2.push_back(kpts2[first.trainIdx]);
}
}
If the distance to the closest match is less than *nn_match_ratio* times the distance to the second closest one, then the match is considered correct.
5. **Check if our matches fit in the homography model**
.. code-block:: cpp
for(int i = 0; i < matched1.size(); i++) {
Mat col = Mat::ones(3, 1, CV_64F);
col.at<double>(0) = matched1[i].pt.x;
col.at<double>(1) = matched1[i].pt.y;
col = homography * col;
col /= col.at<double>(2);
float dist = sqrt( pow(col.at<double>(0) - matched2[i].pt.x, 2) +
pow(col.at<double>(1) - matched2[i].pt.y, 2));
if(dist < inlier_threshold) {
int new_i = inliers1.size();
inliers1.push_back(matched1[i]);
inliers2.push_back(matched2[i]);
good_matches.push_back(DMatch(new_i, new_i, 0));
}
}
If the distance from the first keypoint's projection to the second keypoint is less than the threshold, then it fits the homography model.
We create a new set of matches for the inliers, because it is required by the drawing function.
6. **Output results**
.. code-block:: cpp
Mat res;
drawMatches(img1, inliers1, img2, inliers2, good_matches, res);
imwrite("res.png", res);
...
Here we save the resulting image and print some statistics.
Results
=======
Found matches
--------------
.. image:: images/res.png
:height: 200pt
:width: 320pt
:alt: Matches
:align: center
A-KAZE Matching Results
--------------------------
Keypoints 1: 2943
Keypoints 2: 3511
Matches: 447
Inliers: 308
Inliers Ratio: 0.689038

Binary file added (not shown): doc/tutorials/features2d/akaze_matching/images/graf.png (2.0 MiB)

Binary file added (not shown): doc/tutorials/features2d/akaze_matching/images/res.png (1.8 MiB)

Binary file added (not shown): doc/tutorials/features2d/table_of_content_features2d/images/AKAZE_Match_Tutorial_Cover.png (63 KiB)

@ -183,6 +183,25 @@ Learn about how to use the feature points detectors, descriptors and matching f
:height: 90pt
:width: 90pt
+
.. tabularcolumns:: m{100pt} m{300pt}
.. cssclass:: toctableopencv
===================== ==============================================
|AkazeMatch| **Title:** :ref:`akazeMatching`
*Compatibility:* > OpenCV 3.0
*Author:* Fedor Morozov
Use *AKAZE* local features to find correspondence between two images.
===================== ==============================================
.. |AkazeMatch| image:: images/AKAZE_Match_Tutorial_Cover.png
:height: 90pt
:width: 90pt
.. raw:: latex
\pagebreak
@ -201,3 +220,4 @@ Learn about how to use the feature points detectors, descriptors and matching f
../feature_flann_matcher/feature_flann_matcher
../feature_homography/feature_homography
../detection_of_planar_objects/detection_of_planar_objects
../akaze_matching/akaze_matching

@ -22,7 +22,7 @@ As a test case where to show off these using OpenCV I've created a small program
:language: cpp
:linenos:
:tab-width: 4
:lines: 1-14, 28-29, 31-205
:lines: 1-15, 29-31, 33-208
How to read a video stream (online-camera or offline-file)?
===========================================================

@ -656,7 +656,7 @@ classes we're going to use:
Results: Stored in vars *1, *2, *3, an exception in *e
user=> (import '[org.opencv.core Mat Size CvType]
'[org.opencv.highgui Highgui]
'[org.opencv.imgcodecs Imgcodecs]
'[org.opencv.imgproc Imgproc])
org.opencv.imgproc.Imgproc

@ -373,7 +373,7 @@ Now modify src/main/java/HelloOpenCV.java so it contains the following Java code
import org.opencv.core.Point;
import org.opencv.core.Rect;
import org.opencv.core.Scalar;
import org.opencv.highgui.Highgui;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.objdetect.CascadeClassifier;
//
@ -387,7 +387,7 @@ Now modify src/main/java/HelloOpenCV.java so it contains the following Java code
// Create a face detector from the cascade file in the resources
// directory.
CascadeClassifier faceDetector = new CascadeClassifier(getClass().getResource("/lbpcascade_frontalface.xml").getPath());
Mat image = Highgui.imread(getClass().getResource("/lena.png").getPath());
Mat image = Imgcodecs.imread(getClass().getResource("/lena.png").getPath());
// Detect faces in the image.
// MatOfRect is a special container class for Rect.
@ -404,7 +404,7 @@ Now modify src/main/java/HelloOpenCV.java so it contains the following Java code
// Save the visualized detection.
String filename = "faceDetection.png";
System.out.println(String.format("Writing %s", filename));
Highgui.imwrite(filename, image);
Imgcodecs.imwrite(filename, image);
}
}

@ -39,28 +39,28 @@ You'll almost always end up using the:
.. literalinclude:: ../../../../samples/cpp/tutorial_code/introduction/display_image/display_image.cpp
:language: cpp
:tab-width: 4
:lines: 1-3
:lines: 1-4
We also include *iostream* to facilitate console input and output. To avoid data structure and function name conflicts with other libraries, OpenCV has its own namespace: *cv*. To avoid having to prepend the *cv::* specifier to each of these, you can import the namespace for the whole file by using the lines:
.. literalinclude:: ../../../../samples/cpp/tutorial_code/introduction/display_image/display_image.cpp
:language: cpp
:tab-width: 4
:lines: 5-6
:lines: 6-7
This is true for the STL library too (used for console I/O). Now, let's analyze the *main* function. We start by making sure that we acquire a valid image name argument from the command line.
.. literalinclude:: ../../../../samples/cpp/tutorial_code/introduction/display_image/display_image.cpp
:language: cpp
:tab-width: 4
:lines: 10-14
:lines: 11-15
Then create a *Mat* object that will store the data of the loaded image.
.. literalinclude:: ../../../../samples/cpp/tutorial_code/introduction/display_image/display_image.cpp
:language: cpp
:tab-width: 4
:lines: 16
:lines: 17
Now we call the :imread:`imread <>` function which loads the image name specified by the first argument (*argv[1]*). The second argument specifies the format in which we want the image. This may be:
@ -73,7 +73,7 @@ Now we call the :imread:`imread <>` function which loads the image name specifie
.. literalinclude:: ../../../../samples/cpp/tutorial_code/introduction/display_image/display_image.cpp
:language: cpp
:tab-width: 4
:lines: 17
:lines: 18
.. note::
@ -88,21 +88,21 @@ After checking that the image data was loaded correctly, we want to display our
.. literalinclude:: ../../../../samples/cpp/tutorial_code/introduction/display_image/display_image.cpp
:language: cpp
:lines: 25
:lines: 26
:tab-width: 4
Finally, to update the content of the OpenCV window with a new image use the :imshow:`imshow <>` function. Specify the OpenCV window name to update and the image to use during this operation:
.. literalinclude:: ../../../../samples/cpp/tutorial_code/introduction/display_image/display_image.cpp
:language: cpp
:lines: 26
:lines: 27
:tab-width: 4
Because we want our window to be displayed until the user presses a key (otherwise the program would end far too quickly), we use the :wait_key:`waitKey <>` function whose only parameter is how long it should wait for user input (measured in milliseconds). Zero means to wait forever.
.. literalinclude:: ../../../../samples/cpp/tutorial_code/introduction/display_image/display_image.cpp
:language: cpp
:lines: 28
:lines: 29
:tab-width: 4
Result

@ -349,7 +349,7 @@ Now here's our recommendation for the structure of the tutorial (although, remem
:language: cpp
:linenos:
:tab-width: 4
:lines: 1-8, 21-22, 24-
:lines: 1-8, 21-23, 25-
After the directive you specify a relative path to the file from which to import. It has four options: the language to use; if you add ``:linenos:`` the line numbers will be shown; you can specify the tab size with ``:tab-width:``; and you do not need to load the whole file, you can show just the important lines. Use the *lines* option to avoid showing redundant information (such as the *help* function). Here you basically specify ranges; if the second line number of a range is missing, it means everything until the end of the file. The ranges specified here do not need to be in ascending order; you may even reorganize the structure of how you want to show your sample inside the tutorial.
@ -361,16 +361,16 @@ Now here's our recommendation for the structure of the tutorial (although, remem
# ---- External links for tutorials -----------------
extlinks = {
'hgvideo' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html#%s', None)
'rwimg' : ('http://docs.opencv.org/modules/imgcodecs/doc/reading_and_writing_images.html#%s', None)
}
In short here we defined a new **hgvideo** directive that refers to an external webpage link. Its usage is:
In short here we defined a new **rwimg** directive that refers to an external webpage link. Its usage is:
.. code-block:: rst
A sample function of the highgui modules image write and read page is the :hgvideo:`imread() function <imread>`.
A sample function of the highgui modules image write and read page is the :rwimg:`imread() function <imread>`.
Which turns to: A sample function of the highgui modules image write and read page is the :hgvideo:`imread() function <imread>`. The argument you give between the <> will be put in place of the ``%s`` in the upper definition, and as the link will anchor to the correct function. To find out the anchor of a given function just open up a web page, search for the function and click on it. In the address bar it should appear like: ``http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html#imread`` . Look here for the name of the directives for each page of the OpenCV reference manual. If none present for one of them feel free to add one for it.
Which turns to: A sample function of the highgui modules image write and read page is the :rwimg:`imread() function <imread>`. The argument you give between the <> will be put in place of the ``%s`` in the upper definition, and as the link will anchor to the correct function. To find out the anchor of a given function just open up a web page, search for the function and click on it. In the address bar it should appear like: ``http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images.html#imread`` . Look here for the name of the directives for each page of the OpenCV reference manual. If none present for one of them feel free to add one for it.
For formulas you can add LATEX code that will translate in the web pages into images. You do this by using the *math* directive. A usage tip:

@ -5,7 +5,7 @@ Load, Modify, and Save an Image
.. note::
We assume that by now you know how to load an image using :readwriteimagevideo:`imread <imread>` and to display it in a window (using :user_interface:`imshow <imshow>`). Read the :ref:`Display_Image` tutorial otherwise.
We assume that by now you know how to load an image using :readwriteimage:`imread <imread>` and to display it in a window (using :user_interface:`imshow <imshow>`). Read the :ref:`Display_Image` tutorial otherwise.
Goals
======
@ -14,9 +14,9 @@ In this tutorial you will learn how to:
.. container:: enumeratevisibleitemswithsquare
* Load an image using :readwriteimagevideo:`imread <imread>`
* Load an image using :readwriteimage:`imread <imread>`
* Transform an image from BGR to Grayscale format by using :miscellaneous_transformations:`cvtColor <cvtcolor>`
* Save your transformed image in a file on disk (using :readwriteimagevideo:`imwrite <imwrite>`)
* Save your transformed image in a file on disk (using :readwriteimage:`imwrite <imwrite>`)
Code
======
@ -62,7 +62,7 @@ Here it is:
Explanation
============
#. We begin by loading an image using :readwriteimagevideo:`imread <imread>`, located in the path given by *imageName*. For this example, assume you are loading an RGB image.
#. We begin by loading an image using :readwriteimage:`imread <imread>`, located in the path given by *imageName*. For this example, assume you are loading an RGB image.
#. Now we are going to convert our image from BGR to Grayscale format. OpenCV has a really nice function to do this kind of transformations:
@ -76,9 +76,9 @@ Explanation
* a source image (*image*)
* a destination image (*gray_image*), in which we will save the converted image.
* an additional parameter that indicates what kind of transformation will be performed. In this case we use **CV_BGR2GRAY** (because :readwriteimagevideo:`imread <imread>` uses BGR channel order by default for color images).
* an additional parameter that indicates what kind of transformation will be performed. In this case we use **CV_BGR2GRAY** (because :readwriteimage:`imread <imread>` uses BGR channel order by default for color images).
#. So now we have our new *gray_image* and want to save it on disk (otherwise it will get lost after the program ends). To save it, we will use a function analogous to :readwriteimagevideo:`imread <imread>`: :readwriteimagevideo:`imwrite <imwrite>`
#. So now we have our new *gray_image* and want to save it on disk (otherwise it will get lost after the program ends). To save it, we will use a function analogous to :readwriteimage:`imread <imread>`: :readwriteimage:`imwrite <imwrite>`
.. code-block:: cpp

@ -32,7 +32,7 @@ Image Watch works with any existing project that uses OpenCV image objects (for
#include <iostream> // std::cout
#include <opencv2/core/core.hpp> // cv::Mat
#include <opencv2/highgui/highgui.hpp> // cv::imread()
#include <opencv2/imgcodecs/imgcodecs.hpp> // cv::imread()
#include <opencv2/imgproc/imgproc.hpp> // cv::Canny()
using namespace std;

@ -80,7 +80,7 @@ We add a camera controller to the view controller and initialize it when the vie
.. code-block:: objc
:linenos:
#import <opencv2/highgui/cap_ios.h>
#import <opencv2/videoio/cap_ios.h>
using namespace cv;

@ -73,7 +73,7 @@ You may also find the source code and these video file in the :file:`samples/cpp
:language: cpp
:linenos:
:tab-width: 4
:lines: 1-11, 22-23, 26-
:lines: 1-12, 23-24, 27-
Explanation
===========

@ -200,6 +200,12 @@ Command line arguments of ``opencv_traincascade`` application grouped by purpose
This argument is relevant only for Haar-like features. If it is specified, the cascade will be saved in the old format.
* ``-numThreads <max_number_of_threads>``
Maximum number of threads to use during training. Notice that
the actual number of used threads may be lower, depending on
your machine and compilation options.
#.
Cascade parameters:

@ -50,6 +50,8 @@
#include "opencv2/features2d.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/videoio.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/ml.hpp"

@ -11,7 +11,7 @@
#include "opencv2/ts.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY

@ -13,7 +13,7 @@
#include "opencv2/ts.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgcodecs.hpp"
namespace cvtest
{

@ -2981,7 +2981,7 @@ The class provides the following features for all derived classes:
* a so-called "virtual constructor". That is, each Algorithm derivative is registered at program start and you can get the list of registered algorithms and create an instance of a particular algorithm by its name (see ``Algorithm::create``). If you plan to add your own algorithms, it is good practice to add a unique prefix to them to distinguish them from other algorithms.
* setting/retrieving algorithm parameters by name. If you used video capturing functionality from the OpenCV highgui module, you are probably familiar with ``cvSetCaptureProperty()``, ``cvGetCaptureProperty()``, ``VideoCapture::set()`` and ``VideoCapture::get()``. ``Algorithm`` provides similar methods where, instead of integer ids, you specify the parameter names as text strings. See ``Algorithm::set`` and ``Algorithm::get`` for details.
* setting/retrieving algorithm parameters by name. If you used video capturing functionality from the OpenCV videoio module, you are probably familiar with ``cvSetCaptureProperty()``, ``cvGetCaptureProperty()``, ``VideoCapture::set()`` and ``VideoCapture::get()``. ``Algorithm`` provides similar methods where, instead of integer ids, you specify the parameter names as text strings. See ``Algorithm::set`` and ``Algorithm::get`` for details.
* reading and writing parameters from/to XML or YAML files. Every Algorithm derivative can store all its parameters and then read them back. There is no need to re-implement it each time.

@ -361,6 +361,37 @@ The function ``line`` draws the line segment between ``pt1`` and ``pt2`` points
Antialiased lines are drawn using Gaussian filtering.
arrowedLine
----------------
Draws an arrow segment pointing from the first point to the second one.
.. ocv:function:: void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color, int thickness=1, int lineType=8, int shift=0, double tipLength=0.1)
:param img: Image.
:param pt1: The point the arrow starts from.
:param pt2: The point the arrow points to.
:param color: Line color.
:param thickness: Line thickness.
:param lineType: Type of the line:
* **8** (or omitted) - 8-connected line.
* **4** - 4-connected line.
* **CV_AA** - antialiased line.
:param shift: Number of fractional bits in the point coordinates.
:param tipLength: The length of the arrow tip in relation to the arrow length
The function ``arrowedLine`` draws an arrow between ``pt1`` and ``pt2`` points in the image. See also :ocv:func:`line`.
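A short usage sketch based on the signature above (the canvas and coordinates are arbitrary illustration values; in this branch ``arrowedLine`` is declared in ``opencv2/core.hpp``):

.. code-block:: cpp

    #include <opencv2/core.hpp>
    using namespace cv;

    Mat canvas = Mat::zeros(400, 400, CV_8UC3);
    // Red arrow from the bottom-left toward the top-right, 2 px thick, antialiased,
    // with the default tip length of 10% of the arrow length.
    arrowedLine(canvas, Point(50, 350), Point(350, 50), Scalar(0, 0, 255), 2, LINE_AA);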
LineIterator
------------
.. ocv:class:: LineIterator

@ -14,7 +14,8 @@ OpenCV has a modular structure, which means that the package includes several sh
* **calib3d** - basic multiple-view geometry algorithms, single and stereo camera calibration, object pose estimation, stereo correspondence algorithms, and elements of 3D reconstruction.
* **features2d** - salient feature detectors, descriptors, and descriptor matchers.
* **objdetect** - detection of objects and instances of the predefined classes (for example, faces, eyes, mugs, people, cars, and so on).
* **highgui** - an easy-to-use interface to video capturing, image and video codecs, as well as simple UI capabilities.
* **highgui** - an easy-to-use interface to simple UI capabilities.
* **videoio** - an easy-to-use interface to video capturing and video codecs.
* **gpu** - GPU-accelerated algorithms from different OpenCV modules.
* ... some other helper modules, such as FLANN and Google test wrappers, Python bindings, and others.

@ -510,6 +510,10 @@ CV_EXPORTS_W void randShuffle(InputOutputArray dst, double iterFactor = 1., RNG*
CV_EXPORTS_W void line(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
int thickness = 1, int lineType = LINE_8, int shift = 0);
//! draws an arrow from pt1 to pt2 in the image
CV_EXPORTS_W void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
int thickness=1, int line_type=8, int shift=0, double tipLength=0.1);
//! draws the rectangle outline or a solid rectangle with the opposite corners pt1 and pt2 in the image
CV_EXPORTS_W void rectangle(InputOutputArray img, Point pt1, Point pt2,
const Scalar& color, int thickness = 1,

@ -244,6 +244,7 @@ typedef signed char schar;
/* fundamental constants */
#define CV_PI 3.1415926535897932384626433832795
#define CV_2PI 6.283185307179586476925286766559
#define CV_LOG2 0.69314718055994530941723212145818
/****************************************************************************************\

@ -360,7 +360,7 @@ struct CV_EXPORTS UMatData
{
enum { COPY_ON_MAP=1, HOST_COPY_OBSOLETE=2,
DEVICE_COPY_OBSOLETE=4, TEMP_UMAT=8, TEMP_COPIED_UMAT=24,
USER_ALLOCATED=32 };
USER_ALLOCATED=32, DEVICE_MEM_MAPPED=64};
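// DEVICE_MEM_MAPPED is set while u->data points at a device buffer mapped into host memory; the OpenCL unmap path can then release it with clEnqueueUnmapMemObject instead of copying the data back.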
UMatData(const MatAllocator* allocator);
~UMatData();
@ -370,11 +370,13 @@ struct CV_EXPORTS UMatData
bool hostCopyObsolete() const;
bool deviceCopyObsolete() const;
bool deviceMemMapped() const;
bool copyOnMap() const;
bool tempUMat() const;
bool tempCopiedUMat() const;
void markHostCopyObsolete(bool flag);
void markDeviceCopyObsolete(bool flag);
void markDeviceMemMapped(bool flag);
const MatAllocator* prevAllocator;
const MatAllocator* currAllocator;

@ -3350,10 +3350,19 @@ size_t UMat::total() const
inline bool UMatData::hostCopyObsolete() const { return (flags & HOST_COPY_OBSOLETE) != 0; }
inline bool UMatData::deviceCopyObsolete() const { return (flags & DEVICE_COPY_OBSOLETE) != 0; }
inline bool UMatData::deviceMemMapped() const { return (flags & DEVICE_MEM_MAPPED) != 0; }
inline bool UMatData::copyOnMap() const { return (flags & COPY_ON_MAP) != 0; }
inline bool UMatData::tempUMat() const { return (flags & TEMP_UMAT) != 0; }
inline bool UMatData::tempCopiedUMat() const { return (flags & TEMP_COPIED_UMAT) == TEMP_COPIED_UMAT; }
inline void UMatData::markDeviceMemMapped(bool flag)
{
if(flag)
flags |= DEVICE_MEM_MAPPED;
else
flags &= ~DEVICE_MEM_MAPPED;
}
inline void UMatData::markHostCopyObsolete(bool flag)
{
if(flag)

@ -308,6 +308,23 @@ OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine(
SANITY_CHECK(dst);
}
OCL_PERF_TEST_P(TransposeFixture, TransposeInplace, ::testing::Combine(
OCL_PERF_ENUM(Size(640, 640), Size(1280, 1280), Size(2160, 2160)), OCL_TEST_TYPES_134))
{
const Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params);
const int type = get<1>(params);
checkDeviceMaxMemoryAllocSize(srcSize, type);
UMat src(srcSize, type);
declare.in(src, WARMUP_RNG).out(src, WARMUP_NONE);
OCL_TEST_CYCLE() cv::transpose(src, src);
SANITY_CHECK_NOTHING();
}
///////////// Flip ////////////////////////
enum

@ -54,23 +54,42 @@ namespace ocl {
///////////// dft ////////////////////////
typedef tuple<Size, int> DftParams;
enum OCL_FFT_TYPE
{
R2R = 0,
C2R = 1,
R2C = 2,
C2C = 3
};
typedef tuple<OCL_FFT_TYPE, Size, int> DftParams;
typedef TestBaseWithParam<DftParams> DftFixture;
OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
Values((int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE,
(int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE)))
OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C),
Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(512, 512), Size(1024, 1024), Size(2048, 2048)),
Values((int) 0, (int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE,
(int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE)))
{
const DftParams params = GetParam();
const Size srcSize = get<0>(params);
const int flags = get<1>(params);
UMat src(srcSize, CV_32FC2), dst(srcSize, CV_32FC2);
const int dft_type = get<0>(params);
const Size srcSize = get<1>(params);
int flags = get<2>(params);
int in_cn, out_cn;
switch (dft_type)
{
case R2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 1; out_cn = 1; break;
case C2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 2; out_cn = 2; break;
case R2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 1; out_cn = 2; break;
case C2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 2; out_cn = 2; break;
}
UMat src(srcSize, CV_MAKE_TYPE(CV_32F, in_cn)), dst(srcSize, CV_MAKE_TYPE(CV_32F, out_cn));
declare.in(src, WARMUP_RNG).out(dst);
OCL_TEST_CYCLE() cv::dft(src, dst, flags | DFT_COMPLEX_OUTPUT);
OCL_TEST_CYCLE() cv::dft(src, dst, flags);
SANITY_CHECK(dst, 1e-3);
SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE);
}
///////////// MulSpectrums ////////////////////////

@ -139,6 +139,7 @@ OCL_PERF_TEST_P(CopyToFixture, CopyToWithMaskUninit,
dst.release();
startTimer();
src.copyTo(dst, mask);
cv::ocl::finish();
stopTimer();
}

@ -54,21 +54,23 @@ namespace cv
struct NOP {};
#if CV_SSE2
#if CV_SSE2 || CV_NEON
#define FUNCTOR_TEMPLATE(name) \
template<typename T> struct name {}
FUNCTOR_TEMPLATE(VLoadStore128);
#if CV_SSE2
FUNCTOR_TEMPLATE(VLoadStore64);
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
#endif
#endif
template<typename T, class Op, class VOp>
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
{
#if CV_SSE2
#if CV_SSE2 || CV_NEON
VOp vop;
#endif
Op op;
@ -79,9 +81,11 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si
{
int x = 0;
#if CV_NEON || CV_SSE2
#if CV_SSE2
if( USE_SSE2 )
{
#endif
for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
{
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
@ -91,8 +95,10 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si
VLoadStore128<T>::store(dst + x , r0);
VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
}
#if CV_SSE2
}
#endif
#endif
#if CV_SSE2
if( USE_SSE2 )
{
@ -125,7 +131,7 @@ template<typename T, class Op, class Op32>
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, Size sz)
{
#if CV_SSE2
#if CV_SSE2 || CV_NEON
Op32 op32;
#endif
Op op;
@ -153,9 +159,11 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
}
}
#endif
#if CV_NEON || CV_SSE2
#if CV_SSE2
if( USE_SSE2 )
{
#endif
for( ; x <= sz.width - 8; x += 8 )
{
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
@ -165,8 +173,10 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
VLoadStore128<T>::store(dst + x , r0);
VLoadStore128<T>::store(dst + x + 4, r1);
}
#if CV_SSE2
}
#endif
#endif
#if CV_ENABLE_UNROLLED
for( ; x <= sz.width - 4; x += 4 )
{
@ -383,7 +393,98 @@ FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
#endif
#if CV_SSE2
#if CV_NEON
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
template <> \
struct name<template_arg>{ \
typedef register_type reg_type; \
static reg_type load(const template_arg * p) { return load_body (p);}; \
static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
}
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
template<> \
struct name<template_arg> \
{ \
VLoadStore128<template_arg>::reg_type operator()( \
VLoadStore128<template_arg>::reg_type a, \
VLoadStore128<template_arg>::reg_type b) const \
{ \
return body; \
}; \
}
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
template<> \
struct name<template_arg> \
{ \
VLoadStore128<template_arg>::reg_type operator()( \
VLoadStore128<template_arg>::reg_type a, \
VLoadStore128<template_arg>::reg_type ) const \
{ \
return body; \
}; \
}
FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 );
FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 );
FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16);
FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16);
FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32);
FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32);
FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b));
FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b));
FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b));
FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b));
FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b));
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a ));
#endif
#if CV_SSE2 || CV_NEON
#define IF_SIMD(op) op
#else
#define IF_SIMD(op) NOP
@ -2980,8 +3081,187 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
namespace cv
{
template<typename T> static void
inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
template <typename T>
struct InRange_SSE
{
int operator () (const T *, const T *, const T *, uchar *, int) const
{
return 0;
}
};
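// Generic fallback: reports 0 elements handled, so the scalar loop in inRange_() below processes the whole row.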
#if CV_SSE2
template <>
struct InRange_SSE<uchar>
{
int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_full = _mm_set1_epi8(-1), v_128 = _mm_set1_epi8(-128);
for ( ; x <= len - 16; x += 16 )
{
__m128i v_src = _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), v_128);
__m128i v_mask1 = _mm_cmpgt_epi8(_mm_add_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_128), v_src);
__m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src3 + x)), v_128));
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
}
}
return x;
}
};
template <>
struct InRange_SSE<schar>
{
int operator () (const schar * src1, const schar * src2, const schar * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_full = _mm_set1_epi8(-1);
for ( ; x <= len - 16; x += 16 )
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_mask1 = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
__m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
}
}
return x;
}
};
template <>
struct InRange_SSE<ushort>
{
int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1), v_32768 = _mm_set1_epi16(-32768);
for ( ; x <= len - 8; x += 8 )
{
__m128i v_src = _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src1 + x)), v_32768);
__m128i v_mask1 = _mm_cmpgt_epi16(_mm_add_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_32768), v_src);
__m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src3 + x)), v_32768));
__m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
}
}
return x;
}
};
template <>
struct InRange_SSE<short>
{
int operator () (const short * src1, const short * src2, const short * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1);
for ( ; x <= len - 8; x += 8 )
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_mask1 = _mm_cmpgt_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
__m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
__m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
}
}
return x;
}
};
template <>
struct InRange_SSE<int>
{
int operator () (const int * src1, const int * src2, const int * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi32(-1);
for ( ; x <= len - 8; x += 8 )
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_res1 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src),
_mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x))));
v_src = _mm_loadu_si128((const __m128i *)(src1 + x + 4));
__m128i v_res2 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x + 4)), v_src),
_mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x + 4))));
__m128i v_res = _mm_packs_epi32(_mm_srli_epi32(_mm_andnot_si128(v_res1, v_full), 16),
_mm_srli_epi32(_mm_andnot_si128(v_res2, v_full), 16));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
}
}
return x;
}
};
template <>
struct InRange_SSE<float>
{
int operator () (const float * src1, const float * src2, const float * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_zero = _mm_setzero_si128();
for ( ; x <= len - 8; x += 8 )
{
__m128 v_src = _mm_loadu_ps(src1 + x);
__m128 v_res1 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x), v_src),
_mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x)));
v_src = _mm_loadu_ps(src1 + x + 4);
__m128 v_res2 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x + 4), v_src),
_mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x + 4)));
__m128i v_res1i = _mm_cvtps_epi32(v_res1), v_res2i = _mm_cvtps_epi32(v_res2);
__m128i v_res = _mm_packs_epi32(_mm_srli_epi32(v_res1i, 16), _mm_srli_epi32(v_res2i, 16));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
}
}
return x;
}
};
#endif
template <typename T>
static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
const T* src3, size_t step3, uchar* dst, size_t step,
Size size)
{
@ -2989,9 +3269,11 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
step2 /= sizeof(src2[0]);
step3 /= sizeof(src3[0]);
InRange_SSE<T> vop;
for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
{
int x = 0;
int x = vop(src1, src2, src3, dst, size.width);
#if CV_ENABLE_UNROLLED
for( ; x <= size.width - 4; x += 4 )
{
@ -3132,9 +3414,16 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
(!haveScalar && (sdepth != ldepth || sdepth != udepth)) )
return false;
ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc,
format("%s-D cn=%d -D T=%s%s", haveScalar ? "-D HAVE_SCALAR " : "",
cn, ocl::typeToStr(sdepth), doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
int kercn = haveScalar ? cn : std::max(std::min(ocl::predictOptimalVectorWidth(_src, _lowerb, _upperb, _dst), 4), cn);
if (kercn % cn != 0)
kercn = cn;
int colsPerWI = kercn / cn;
String opts = format("%s-D cn=%d -D srcT=%s -D srcT1=%s -D dstT=%s -D kercn=%d -D depth=%d%s -D colsPerWI=%d",
haveScalar ? "-D HAVE_SCALAR " : "", cn, ocl::typeToStr(CV_MAKE_TYPE(sdepth, kercn)),
ocl::typeToStr(sdepth), ocl::typeToStr(CV_8UC(colsPerWI)), kercn, sdepth,
doubleSupport ? " -D DOUBLE_SUPPORT" : "", colsPerWI);
ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc, opts);
if (ker.empty())
return false;
@ -3182,7 +3471,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
}
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
dstarg = ocl::KernelArg::WriteOnly(dst);
dstarg = ocl::KernelArg::WriteOnly(dst, 1, colsPerWI);
if (haveScalar)
{
@ -3196,7 +3485,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI);
size_t globalsize[2] = { ssize.width, (ssize.height + rowsPerWI - 1) / rowsPerWI };
size_t globalsize[2] = { ssize.width / colsPerWI, (ssize.height + rowsPerWI - 1) / rowsPerWI };
return ker.run(2, globalsize, NULL, false);
}

@ -851,6 +851,175 @@ void cv::insertChannel(InputArray _src, InputOutputArray _dst, int coi)
namespace cv
{
template<typename T, typename DT, typename WT>
struct cvtScaleAbs_SSE2
{
int operator () (const T *, DT *, int, WT, WT) const
{
return 0;
}
};
#if CV_SSE2
template <>
struct cvtScaleAbs_SSE2<uchar, uchar, float>
{
int operator () (const uchar * src, uchar * dst, int width,
float scale, float shift) const
{
int x = 0;
if (USE_SSE2)
{
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
v_zero_f = _mm_setzero_ps();
__m128i v_zero_i = _mm_setzero_si128();
for ( ; x <= width - 16; x += 16)
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
__m128i v_src12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i);
__m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src12, v_zero_i)), v_scale), v_shift);
v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
__m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src12, v_zero_i)), v_scale), v_shift);
v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
__m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
__m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
__m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
_mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
_mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
}
}
return x;
}
};
template <>
struct cvtScaleAbs_SSE2<ushort, uchar, float>
{
int operator () (const ushort * src, uchar * dst, int width,
float scale, float shift) const
{
int x = 0;
if (USE_SSE2)
{
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
v_zero_f = _mm_setzero_ps();
__m128i v_zero_i = _mm_setzero_si128();
for ( ; x <= width - 8; x += 8)
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
__m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift);
v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
__m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift);
v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
__m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
_mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
}
}
return x;
}
};
template <>
struct cvtScaleAbs_SSE2<short, uchar, float>
{
int operator () (const short * src, uchar * dst, int width,
float scale, float shift) const
{
int x = 0;
if (USE_SSE2)
{
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
v_zero_f = _mm_setzero_ps();
__m128i v_zero_i = _mm_setzero_si128();
for ( ; x <= width - 8; x += 8)
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
__m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift);
v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
__m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift);
v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
__m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
_mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
}
}
return x;
}
};
template <>
struct cvtScaleAbs_SSE2<int, uchar, float>
{
int operator () (const int * src, uchar * dst, int width,
float scale, float shift) const
{
int x = 0;
if (USE_SSE2)
{
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
v_zero_f = _mm_setzero_ps();
__m128i v_zero_i = _mm_setzero_si128();
for ( ; x <= width - 8; x += 4)
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
__m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
__m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i);
_mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
}
}
return x;
}
};
template <>
struct cvtScaleAbs_SSE2<float, uchar, float>
{
int operator () (const float * src, uchar * dst, int width,
float scale, float shift) const
{
int x = 0;
if (USE_SSE2)
{
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
v_zero_f = _mm_setzero_ps();
__m128i v_zero_i = _mm_setzero_si128();
for ( ; x <= width - 8; x += 4)
{
__m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift);
v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst);
__m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i);
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
}
}
return x;
}
};
#endif
template<typename T, typename DT, typename WT> static void
cvtScaleAbs_( const T* src, size_t sstep,
DT* dst, size_t dstep, Size size,
@ -858,10 +1027,12 @@ cvtScaleAbs_( const T* src, size_t sstep,
{
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
cvtScaleAbs_SSE2<T, DT, WT> vop;
for( ; size.height--; src += sstep, dst += dstep )
{
int x = 0;
int x = vop(src, dst, size.width, scale, shift);
#if CV_ENABLE_UNROLLED
for( ; x <= size.width - 4; x += 4 )
{
@ -879,7 +1050,6 @@ cvtScaleAbs_( const T* src, size_t sstep,
}
}
template<typename T, typename DT, typename WT> static void
cvtScale_( const T* src, size_t sstep,
DT* dst, size_t dstep, Size size,
@ -1559,22 +1729,18 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
UMat src = _src.getUMat(), lut = _lut.getUMat();
_dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
UMat dst = _dst.getUMat();
bool bAligned = (1 == lcn) && (0 == (src.offset % 4)) && (0 == ((dcn * src.cols) % 4));
// dst.cols == src.cols by params of dst.create
int kercn = lcn == 1 ? std::min(4, ocl::predictOptimalVectorWidth(_dst)) : dcn;
ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", bAligned ? 4 : dcn, lcn,
ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)
));
format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", kercn, lcn,
ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
if (k.empty())
return false;
int cols = bAligned ? dcn * dst.cols / 4 : dst.cols;
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
ocl::KernelArg::WriteOnlyNoSize(dst), dst.rows, cols);
ocl::KernelArg::WriteOnly(dst, dcn, kercn));
size_t globalSize[2] = { cols, (dst.rows + 3) / 4 };
size_t globalSize[2] = { dst.cols * dcn / kercn, (dst.rows + 3) / 4 };
return k.run(2, globalSize, NULL, false);
}

@ -207,7 +207,6 @@ namespace
MemoryStack* MemoryPool::getFreeMemStack()
{
AutoLock lock(mtx_);
if (!initialized_)
initilizeImpl();
@ -256,22 +255,31 @@ namespace
namespace
{
Mutex mtx_;
bool memory_pool_manager_initialized;
class MemoryPoolManager
{
public:
MemoryPoolManager();
~MemoryPoolManager();
void Init();
MemoryPool* getPool(int deviceId);
private:
std::vector<MemoryPool> pools_;
};
} manager;
//MemoryPoolManager ;
MemoryPoolManager::MemoryPoolManager()
{
int deviceCount = getCudaEnabledDeviceCount();
}
void MemoryPoolManager::Init()
{
int deviceCount = getCudaEnabledDeviceCount();
if (deviceCount > 0)
pools_.resize(deviceCount);
}
@ -280,7 +288,7 @@ namespace
{
for (size_t i = 0; i < pools_.size(); ++i)
{
cudaSetDevice(i);
cudaSetDevice(static_cast<int>(i));
pools_[i].release();
}
}
@ -293,7 +301,14 @@ namespace
MemoryPool* memPool(int deviceId)
{
static MemoryPoolManager manager;
{
AutoLock lock(mtx_);
if (!memory_pool_manager_initialized)
{
memory_pool_manager_initialized = true;
manager.Init();
}
}
return manager.getPool(deviceId);
}
}
@ -311,8 +326,10 @@ cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream),
if (enableMemoryPool)
{
const int deviceId = getDevice();
memStack_ = memPool(deviceId)->getFreeMemStack();
{
AutoLock lock(mtx_);
memStack_ = memPool(deviceId)->getFreeMemStack();
}
DeviceInfo devInfo(deviceId);
alignment_ = devInfo.textureAlignment();
}

@ -190,10 +190,22 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa
#endif
}
namespace
{
bool default_stream_is_initialized;
Mutex mtx;
Ptr<Stream> default_stream;
}
Stream& cv::cuda::Stream::Null()
{
static Stream s(Ptr<Impl>(new Impl(0)));
return s;
AutoLock lock(mtx);
if (!default_stream_is_initialized)
{
default_stream = Ptr<Stream>(new Stream(Ptr<Impl>(new Impl(0))));
default_stream_is_initialized = true;
}
return *default_stream;
}
cv::cuda::Stream::operator bool_type() const

@ -1584,6 +1584,24 @@ void line( InputOutputArray _img, Point pt1, Point pt2, const Scalar& color,
ThickLine( img, pt1, pt2, buf, thickness, line_type, 3, shift );
}
void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
int thickness, int line_type, int shift, double tipLength)
{
const double tipSize = norm(pt1-pt2)*tipLength; // Factor to normalize the size of the tip depending on the length of the arrow
line(img, pt1, pt2, color, thickness, line_type, shift);
const double angle = atan2( (double) pt1.y - pt2.y, (double) pt1.x - pt2.x );
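// The two tip strokes are drawn back from pt2 at +/- 45 degrees (CV_PI/4) around the direction towards pt1.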
Point p(cvRound(pt2.x + tipSize * cos(angle + CV_PI / 4)),
cvRound(pt2.y + tipSize * sin(angle + CV_PI / 4)));
line(img, p, pt2, color, thickness, line_type, shift);
p.x = cvRound(pt2.x + tipSize * cos(angle - CV_PI / 4));
p.y = cvRound(pt2.y + tipSize * sin(angle - CV_PI / 4));
line(img, p, pt2, color, thickness, line_type, shift);
}
void rectangle( InputOutputArray _img, Point pt1, Point pt2,
const Scalar& color, int thickness,
int lineType, int shift )

@ -43,6 +43,7 @@
#include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
#include "opencl_kernels.hpp"
#include <map>
namespace cv
{
@ -1781,6 +1782,375 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
#endif
}
#ifdef HAVE_OPENCL
namespace cv
{
enum FftType
{
R2R = 0, // real to CCS in case forward transform, CCS to real otherwise
C2R = 1, // complex to real in case inverse transform
R2C = 2, // real to complex in case forward transform
C2C = 3 // complex to complex
};
struct OCL_FftPlan
{
private:
UMat twiddles;
String buildOptions;
int thread_count;
bool status;
int dft_size;
public:
OCL_FftPlan(int _size): dft_size(_size), status(true)
{
int min_radix;
std::vector<int> radixes, blocks;
ocl_getRadixes(dft_size, radixes, blocks, min_radix);
thread_count = dft_size / min_radix;
if (thread_count > (int) ocl::Device::getDefault().maxWorkGroupSize())
{
status = false;
return;
}
// generate string with radix calls
String radix_processing;
int n = 1, twiddle_size = 0;
for (size_t i=0; i<radixes.size(); i++)
{
int radix = radixes[i], block = blocks[i];
if (block > 1)
radix_processing += format("fft_radix%d_B%d(smem,twiddles+%d,ind,%d,%d);", radix, block, twiddle_size, n, dft_size/radix);
else
radix_processing += format("fft_radix%d(smem,twiddles+%d,ind,%d,%d);", radix, twiddle_size, n, dft_size/radix);
twiddle_size += (radix-1)*n;
n *= radix;
}
Mat tw(1, twiddle_size, CV_32FC2);
float* ptr = tw.ptr<float>();
int ptr_index = 0;
n = 1;
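// Fill the twiddle table stage by stage: for stage size n, rows j = 1..radix-1 store exp(-2*pi*i*j*k/n) for k = 0..n/radix-1, interleaved as (cos, sin) pairs.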
for (size_t i=0; i<radixes.size(); i++)
{
int radix = radixes[i];
n *= radix;
for (int j=1; j<radix; j++)
{
double theta = -CV_2PI*j/n;
for (int k=0; k<(n/radix); k++)
{
ptr[ptr_index++] = (float) cos(k*theta);
ptr[ptr_index++] = (float) sin(k*theta);
}
}
}
twiddles = tw.getUMat(ACCESS_READ);
buildOptions = format("-D LOCAL_SIZE=%d -D kercn=%d -D RADIX_PROCESS=%s",
dft_size, min_radix, radix_processing.c_str());
}
bool enqueueTransform(InputArray _src, OutputArray _dst, int num_dfts, int flags, int fftType, bool rows = true) const
{
if (!status)
return false;
UMat src = _src.getUMat();
UMat dst = _dst.getUMat();
size_t globalsize[2];
size_t localsize[2];
String kernel_name;
bool is1d = (flags & DFT_ROWS) != 0 || num_dfts == 1;
bool inv = (flags & DFT_INVERSE) != 0;
String options = buildOptions;
if (rows)
{
globalsize[0] = thread_count; globalsize[1] = src.rows;
localsize[0] = thread_count; localsize[1] = 1;
kernel_name = !inv ? "fft_multi_radix_rows" : "ifft_multi_radix_rows";
if ((is1d || inv) && (flags & DFT_SCALE))
options += " -D DFT_SCALE";
}
else
{
globalsize[0] = num_dfts; globalsize[1] = thread_count;
localsize[0] = 1; localsize[1] = thread_count;
kernel_name = !inv ? "fft_multi_radix_cols" : "ifft_multi_radix_cols";
if (flags & DFT_SCALE)
options += " -D DFT_SCALE";
}
options += src.channels() == 1 ? " -D REAL_INPUT" : " -D COMPLEX_INPUT";
options += dst.channels() == 1 ? " -D REAL_OUTPUT" : " -D COMPLEX_OUTPUT";
options += is1d ? " -D IS_1D" : "";
if (!inv)
{
if ((is1d && src.channels() == 1) || (rows && (fftType == R2R)))
options += " -D NO_CONJUGATE";
}
else
{
if (rows && (fftType == C2R || fftType == R2R))
options += " -D NO_CONJUGATE";
if (dst.cols % 2 == 0)
options += " -D EVEN";
}
ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options);
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(twiddles), thread_count, num_dfts);
return k.run(2, globalsize, localsize, false);
}
private:
static void ocl_getRadixes(int cols, std::vector<int>& radixes, std::vector<int>& blocks, int& min_radix)
{
int factors[34];
int nf = DFTFactorize(cols, factors);
int n = 1;
int factor_index = 0;
min_radix = INT_MAX;
// 2^n transforms
if ((factors[factor_index] & 1) == 0)
{
for( ; n < factors[factor_index];)
{
int radix = 2, block = 1;
if (8*n <= factors[0])
radix = 8;
else if (4*n <= factors[0])
{
radix = 4;
if (cols % 12 == 0)
block = 3;
else if (cols % 8 == 0)
block = 2;
}
else
{
if (cols % 10 == 0)
block = 5;
else if (cols % 8 == 0)
block = 4;
else if (cols % 6 == 0)
block = 3;
else if (cols % 4 == 0)
block = 2;
}
radixes.push_back(radix);
blocks.push_back(block);
min_radix = min(min_radix, block*radix);
n *= radix;
}
factor_index++;
}
// all the other transforms
for( ; factor_index < nf; factor_index++)
{
int radix = factors[factor_index], block = 1;
if (radix == 3)
{
if (cols % 12 == 0)
block = 4;
else if (cols % 9 == 0)
block = 3;
else if (cols % 6 == 0)
block = 2;
}
else if (radix == 5)
{
if (cols % 10 == 0)
block = 2;
}
radixes.push_back(radix);
blocks.push_back(block);
min_radix = min(min_radix, block*radix);
}
}
};
class OCL_FftPlanCache
{
public:
static OCL_FftPlanCache & getInstance()
{
static OCL_FftPlanCache planCache;
return planCache;
}
Ptr<OCL_FftPlan> getFftPlan(int dft_size)
{
std::map<int, Ptr<OCL_FftPlan> >::iterator f = planStorage.find(dft_size);
if (f != planStorage.end())
{
return f->second;
}
else
{
Ptr<OCL_FftPlan> newPlan = Ptr<OCL_FftPlan>(new OCL_FftPlan(dft_size));
planStorage[dft_size] = newPlan;
return newPlan;
}
}
~OCL_FftPlanCache()
{
planStorage.clear();
}
protected:
OCL_FftPlanCache() :
planStorage()
{
}
std::map<int, Ptr<OCL_FftPlan> > planStorage;
};
static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)
{
Ptr<OCL_FftPlan> plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols());
return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true);
}
static bool ocl_dft_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType)
{
Ptr<OCL_FftPlan> plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows());
return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false);
}
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows)
{
int type = _src.type(), cn = CV_MAT_CN(type);
Size ssize = _src.size();
if ( !(type == CV_32FC1 || type == CV_32FC2) )
return false;
// bail out if the size is not a product of the primes { 2, 3, 5 }
if (ssize.area() != getOptimalDFTSize(ssize.area()))
return false;
UMat src = _src.getUMat();
int complex_input = cn == 2 ? 1 : 0;
int complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0;
int real_input = cn == 1 ? 1 : 0;
int real_output = (flags & DFT_REAL_OUTPUT) != 0;
bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0;
if( nonzero_rows <= 0 || nonzero_rows > _src.rows() )
nonzero_rows = _src.rows();
bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1;
// if output format is not specified
if (complex_output + real_output == 0)
{
if (real_input)
real_output = 1;
else
complex_output = 1;
}
FftType fftType = (FftType)(complex_input << 0 | complex_output << 1);
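// bit 0: complex input, bit 1: complex output -- matches the FftType enum above (R2R=0, C2R=1, R2C=2, C2C=3)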
// Forward Complex to CCS not supported
if (fftType == C2R && !inv)
fftType = C2C;
// Inverse CCS to Complex not supported
if (fftType == R2C && inv)
fftType = R2R;
UMat output;
if (fftType == C2C || fftType == R2C)
{
// complex output
_dst.create(src.size(), CV_32FC2);
output = _dst.getUMat();
}
else
{
// real output
if (is1d)
{
_dst.create(src.size(), CV_32FC1);
output = _dst.getUMat();
}
else
{
_dst.create(src.size(), CV_32FC1);
output.create(src.size(), CV_32FC2);
}
}
if (!inv)
{
if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
return false;
if (!is1d)
{
int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols;
if (!ocl_dft_cols(output, _dst, nonzero_cols, flags, fftType))
return false;
}
}
else
{
if (fftType == C2C)
{
// complex output
if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
return false;
if (!is1d)
{
if (!ocl_dft_cols(output, output, output.cols, flags, fftType))
return false;
}
}
else
{
if (is1d)
{
if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
return false;
}
else
{
int nonzero_cols = src.cols/2 + 1;
if (!ocl_dft_cols(src, output, nonzero_cols, flags, fftType))
return false;
if (!ocl_dft_rows(output, _dst, nonzero_rows, flags, fftType))
return false;
}
}
}
return true;
}
} // namespace cv;
#endif
#ifdef HAVE_CLAMDFFT
namespace cv {
@ -1791,14 +2161,6 @@ namespace cv {
CV_Assert(s == CLFFT_SUCCESS); \
}
enum FftType
{
R2R = 0, // real to real
C2R = 1, // opencl HERMITIAN_INTERLEAVED to real
R2C = 2, // real to opencl HERMITIAN_INTERLEAVED
C2C = 3 // complex to complex
};
class PlanCache
{
struct FftPlan
@ -1923,7 +2285,7 @@ public:
}
// no baked plan is found, so let's create a new one
FftPlan * newPlan = new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType);
Ptr<FftPlan> newPlan = Ptr<FftPlan>(new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType));
planStorage.push_back(newPlan);
return newPlan->plHandle;
@ -1931,8 +2293,6 @@ public:
~PlanCache()
{
for (std::vector<FftPlan *>::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i)
delete (*i);
planStorage.clear();
}
@ -1942,7 +2302,7 @@ protected:
{
}
std::vector<FftPlan *> planStorage;
std::vector<Ptr<FftPlan> > planStorage;
};
extern "C" {
@ -1960,7 +2320,7 @@ static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
}
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags)
static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags)
{
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
Size ssize = _src.size();
@ -2019,7 +2379,6 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags)
tmpBuffer.addref();
clSetEventCallback(e, CL_COMPLETE, oclCleanupCallback, tmpBuffer.u);
return true;
}
@ -2034,7 +2393,12 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
#ifdef HAVE_CLAMDFFT
CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
_dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,
ocl_dft(_src0, _dst, flags))
ocl_dft_amdfft(_src0, _dst, flags))
#endif
#ifdef HAVE_OPENCL
CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
ocl_dft(_src0, _dst, flags, nonzero_rows))
#endif
static DFTFunc dft_tbl[6] =
@ -2046,10 +2410,8 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
(DFTFunc)RealDFT_64f,
(DFTFunc)CCSIDFT_64f
};
AutoBuffer<uchar> buf;
void *spec = 0;
Mat src0 = _src0.getMat(), src = src0;
int prev_len = 0, stage = 0;
bool inv = (flags & DFT_INVERSE) != 0;
@ -2080,32 +2442,32 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
{
if ((flags & DFT_ROWS) == 0)
{
if (!real_transform)
if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
{
if (ippi_DFT_C_32F(src,dst, inv, ipp_norm_flag))
if (ippi_DFT_C_32F(src, dst, inv, ipp_norm_flag))
return;
setIppErrorStatus();
}
else if (inv || !(flags & DFT_COMPLEX_OUTPUT))
if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
{
if (ippi_DFT_R_32F(src,dst, inv, ipp_norm_flag))
if (ippi_DFT_R_32F(src, dst, inv, ipp_norm_flag))
return;
setIppErrorStatus();
}
}
else
{
if (!real_transform)
if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
{
ippiDFT_C_Func ippiFunc = inv ? (ippiDFT_C_Func)ippiDFTInv_CToC_32fc_C1R : (ippiDFT_C_Func)ippiDFTFwd_CToC_32fc_C1R;
if (Dft_C_IPPLoop(src,dst, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
if (Dft_C_IPPLoop(src, dst, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
return;
setIppErrorStatus();
}
else if (inv || !(flags & DFT_COMPLEX_OUTPUT))
if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
{
ippiDFT_R_Func ippiFunc = inv ? (ippiDFT_R_Func)ippiDFTInv_PackToR_32f_C1R : (ippiDFT_R_Func)ippiDFTFwd_RToPack_32f_C1R;
if (Dft_R_IPPLoop(src,dst, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
if (Dft_R_IPPLoop(src, dst, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
return;
setIppErrorStatus();
}

@ -348,7 +348,18 @@ static void InvSqrt_32f(const float* src, float* dst, int len)
static void InvSqrt_64f(const double* src, double* dst, int len)
{
for( int i = 0; i < len; i++ )
int i = 0;
#if CV_SSE2
if (USE_SSE2)
{
__m128d v_1 = _mm_set1_pd(1.0);
for ( ; i <= len - 2; i += 2)
_mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i))));
}
#endif
for( ; i < len; i++ )
dst[i] = 1/std::sqrt(src[i]);
}
@ -2543,12 +2554,33 @@ void patchNaNs( InputOutputArray _a, double _val )
NAryMatIterator it(arrays, (uchar**)ptrs);
size_t len = it.size*a.channels();
Cv32suf val;
val.f = (float)_val;
float fval = (float)_val;
val.f = fval;
#if CV_SSE2
__m128i v_mask1 = _mm_set1_epi32(0x7fffffff), v_mask2 = _mm_set1_epi32(0x7f800000);
__m128i v_val = _mm_set1_epi32(val.i);
#endif
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
int* tptr = ptrs[0];
for( size_t j = 0; j < len; j++ )
size_t j = 0;
#if CV_SSE2
if (USE_SSE2)
{
for ( ; j < len; j += 4)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)(tptr + j));
__m128i v_cmp_mask = _mm_cmplt_epi32(v_mask2, _mm_and_si128(v_src, v_mask1));
__m128i v_res = _mm_or_si128(_mm_andnot_si128(v_cmp_mask, v_src), _mm_and_si128(v_cmp_mask, v_val));
_mm_storeu_si128((__m128i *)(tptr + j), v_res);
}
}
#endif
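// A 32-bit float is NaN exactly when its magnitude bits exceed 0x7f800000 (all-ones exponent, non-zero mantissa); such elements are replaced with val.i in both the SSE and scalar paths.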
for( ; j < len; j++ )
if( (tptr[j] & 0x7fffffff) > 0x7f800000 )
tptr[j] = val.i;
}

@ -2758,21 +2758,30 @@ namespace cv {
static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
{
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn),
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = cn;
if (cn == 1)
{
kercn = std::min(ocl::predictOptimalVectorWidth(_m), 4);
if (kercn != 4)
kercn = 1;
}
int sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn),
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type),
ocl::memopTypeToStr(depth), cn, ocl::memopTypeToStr(sctype)));
format("-D T=%s -D T1=%s -D cn=%d -D ST=%s -D kercn=%d -D rowsPerWI=%d",
ocl::memopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
ocl::memopTypeToStr(depth), cn,
ocl::memopTypeToStr(sctype),
kercn, rowsPerWI));
if (k.empty())
return false;
UMat m = _m.getUMat();
k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s)),
rowsPerWI);
k.args(ocl::KernelArg::WriteOnly(m, cn, kercn),
ocl::KernelArg::Constant(Mat(1, 1, sctype, s)));
size_t globalsize[2] = { m.cols, (m.rows + rowsPerWI - 1) / rowsPerWI };
size_t globalsize[2] = { m.cols * cn / kercn, (m.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
@ -2973,8 +2982,10 @@ static inline int divUp(int a, int b)
static bool ocl_transpose( InputArray _src, OutputArray _dst )
{
const ocl::Device & dev = ocl::Device::getDefault();
const int TILE_DIM = 32, BLOCK_ROWS = 8;
int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type),
rowsPerWI = dev.isIntel() ? 4 : 1;
UMat src = _src.getUMat();
_dst.create(src.cols, src.rows, type);
@ -2990,9 +3001,9 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst )
}
ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc,
format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d",
format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d -D rowsPerWI=%d",
ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth),
cn, TILE_DIM, BLOCK_ROWS));
cn, TILE_DIM, BLOCK_ROWS, rowsPerWI));
if (k.empty())
return false;
@ -3002,8 +3013,14 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst )
k.args(ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::WriteOnlyNoSize(dst));
size_t localsize[3] = { TILE_DIM, BLOCK_ROWS, 1 };
size_t globalsize[3] = { src.cols, inplace ? src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS, 1 };
size_t localsize[2] = { TILE_DIM, BLOCK_ROWS };
size_t globalsize[2] = { src.cols, inplace ? (src.rows + rowsPerWI - 1) / rowsPerWI : (divUp(src.rows, TILE_DIM) * BLOCK_ROWS) };
if (inplace && dev.isIntel())
{
localsize[0] = 16;
localsize[1] = dev.maxWorkGroupSize() / localsize[0];
}
return k.run(2, globalsize, localsize, false);
}
@ -3433,8 +3450,11 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
const int min_opt_cols = 128, buf_cols = 32;
int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
ddepth = CV_MAT_DEPTH(dtype), ddepth0 = ddepth;
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
useOptimized = 1 == dim && _src.cols() > min_opt_cols;
const ocl::Device &defDev = ocl::Device::getDefault();
bool doubleSupport = defDev.doubleFPConfig() > 0;
size_t wgs = defDev.maxWorkGroupSize();
bool useOptimized = 1 == dim && _src.cols() > min_opt_cols && (wgs >= buf_cols);
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
return false;
@ -3447,78 +3467,80 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
"OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
char cvt[2][40];
int wdepth = std::max(ddepth, CV_32F);
cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
" -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
" -D convertToDT=%s -D convertToDT0=%s%s",
ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
if (useOptimized)
{
cv::String build_opt_pre = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D %s -D dim=1"
" -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre);
if (kpre.empty())
return false;
ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt);
if (kmain.empty())
size_t tileHeight = (size_t)(wgs / buf_cols);
if (defDev.isIntel())
{
static const size_t maxItemInGroupCount = 16;
tileHeight = min(tileHeight, defDev.localMemSize() / buf_cols / CV_ELEM_SIZE(CV_MAKETYPE(wdepth, cn)) / maxItemInGroupCount);
}
char cvt[3][40];
cv::String build_opt = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D TILE_HEIGHT=%d -D %s -D dim=1"
" -D cn=%d -D ddepth=%d"
" -D srcT=%s -D bufT=%s -D dstT=%s"
" -D convertToWT=%s -D convertToBufT=%s -D convertToDT=%s%s",
buf_cols, tileHeight, ops[op], cn, ddepth,
ocl::typeToStr(sdepth),
ocl::typeToStr(ddepth),
ocl::typeToStr(ddepth0),
ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[1]),
ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[2]),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("reduce_horz_opt", ocl::core::reduce2_oclsrc, build_opt);
if (k.empty())
return false;
UMat src = _src.getUMat();
Size dsize(1, src.rows);
_dst.create(dsize, dtype);
UMat dst = _dst.getUMat();
UMat buf(src.rows, buf_cols, dst.type());
kpre.args(ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::WriteOnlyNoSize(buf));
size_t globalSize[2] = { buf_cols, src.rows };
if (!kpre.run(2, globalSize, NULL, false))
return false;
if (op0 == CV_REDUCE_AVG)
kmain.args(ocl::KernelArg::ReadOnly(buf),
ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
k.args(ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
else
kmain.args(ocl::KernelArg::ReadOnly(buf),
ocl::KernelArg::WriteOnlyNoSize(dst));
k.args(ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::WriteOnlyNoSize(dst));
globalSize[0] = src.rows;
return kmain.run(1, globalSize, NULL, false);
size_t localSize[2] = { buf_cols, tileHeight};
size_t globalSize[2] = { buf_cols, src.rows };
return k.run(2, globalSize, localSize, false);
}
else
{
char cvt[2][40];
cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
" -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
" -D convertToDT=%s -D convertToDT0=%s%s",
ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
if (k.empty())
return false;
ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
if (k.empty())
return false;
UMat src = _src.getUMat();
Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
_dst.create(dsize, dtype);
UMat dst = _dst.getUMat();
UMat src = _src.getUMat();
Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
_dst.create(dsize, dtype);
UMat dst = _dst.getUMat();
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
temparg = ocl::KernelArg::WriteOnlyNoSize(dst);
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
temparg = ocl::KernelArg::WriteOnlyNoSize(dst);
if (op0 == CV_REDUCE_AVG)
k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
else
k.args(srcarg, temparg);
if (op0 == CV_REDUCE_AVG)
k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
else
k.args(srcarg, temparg);
size_t globalsize = std::max(dsize.width, dsize.height);
return k.run(1, &globalsize, NULL, false);
size_t globalsize = std::max(dsize.width, dsize.height);
return k.run(1, &globalsize, NULL, false);
}
}
}

@ -1416,7 +1416,16 @@ bool useOpenCL()
{
CoreTLSData* data = coreTlsData.get();
if( data->useOpenCL < 0 )
data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() != NULL;
{
try
{
data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() != NULL;
}
catch (...)
{
data->useOpenCL = 0;
}
}
return data->useOpenCL > 0;
}
@ -2228,7 +2237,8 @@ static cl_device_id selectOpenCLDevice()
if (!isID)
{
deviceTypes.push_back("GPU");
deviceTypes.push_back("CPU");
if (configuration)
deviceTypes.push_back("CPU");
}
else
deviceTypes.push_back("ALL");
@ -3484,9 +3494,8 @@ public:
OpenCLBufferPoolImpl()
: currentReservedSize(0), maxReservedSize(0)
{
// Note: Buffer pool is disabled by default,
// because we didn't receive significant performance improvement
maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", 0);
int poolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", poolSize);
}
virtual ~OpenCLBufferPoolImpl()
{
@ -3729,6 +3738,7 @@ public:
u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
u->size, u->origdata, &retval);
tempUMatFlags = UMatData::TEMP_COPIED_UMAT;
}
if(!u->handle || retval != CL_SUCCESS)
return false;
@ -3870,6 +3880,7 @@ public:
if(u->data && retval == CL_SUCCESS)
{
u->markHostCopyObsolete(false);
u->markDeviceMemMapped(true);
return;
}
@ -3898,6 +3909,7 @@ public:
if(!u)
return;
CV_Assert(u->handle != 0);
UMatDataAutoLock autolock(u);
@ -3908,8 +3920,10 @@ public:
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
cl_int retval = 0;
if( !u->copyOnMap() && u->data )
if( !u->copyOnMap() && u->deviceMemMapped() )
{
CV_Assert(u->data != NULL);
u->markDeviceMemMapped(false);
CV_Assert( (retval = clEnqueueUnmapMemObject(q,
(cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
@ -4427,11 +4441,13 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
d.preferredVectorWidthDouble(), -1 }, kercn = vectorWidths[depth];
if (d.isIntel())
// if the device says don't use vectors
if (vectorWidths[0] == 1)
{
// it's heuristic
int vectorWidthsIntel[] = { 16, 16, 8, 8, 1, 1, 1, -1 };
kercn = vectorWidthsIntel[depth];
int vectorWidthsOthers[] = { 16, 16, 8, 8, 1, 1, 1, -1 };
kercn = vectorWidthsOthers[depth];
}
if (ssize.width * cn < kercn || kercn <= 0)

@ -0,0 +1,864 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2014, Itseez, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#define SQRT_2 0.707106781188f
#define sin_120 0.866025403784f
#define fft5_2 0.559016994374f
#define fft5_3 -0.951056516295f
#define fft5_4 -1.538841768587f
#define fft5_5 0.363271264002f
__attribute__((always_inline))
float2 mul_float2(float2 a, float2 b) {
return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x));
}
__attribute__((always_inline))
float2 twiddle(float2 a) {
return (float2)(a.y, -a.x);
}
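// mul_float2() is a complex multiplication written with fma; twiddle(a) returns a * (-i), i.e. (x + iy)*(-i) = y - ix, a 90-degree clockwise rotation.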
__attribute__((always_inline))
void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size)
{
const int k = x & (block_size - 1);
a1 = mul_float2(twiddles[k], a1);
const int dst_ind = (x << 1) - k;
smem[dst_ind] = a0 + a1;
smem[dst_ind+block_size] = a0 - a1;
}
__attribute__((always_inline))
void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size)
{
const int k = x & (block_size - 1);
a1 = mul_float2(twiddles[k], a1);
a2 = mul_float2(twiddles[k + block_size], a2);
a3 = mul_float2(twiddles[k + 2*block_size], a3);
const int dst_ind = ((x - k) << 2) + k;
float2 b0 = a0 + a2;
a2 = a0 - a2;
float2 b1 = a1 + a3;
a3 = twiddle(a1 - a3);
smem[dst_ind] = b0 + b1;
smem[dst_ind + block_size] = a2 + a3;
smem[dst_ind + 2*block_size] = b0 - b1;
smem[dst_ind + 3*block_size] = a2 - a3;
}
__attribute__((always_inline))
void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size)
{
const int k = x % block_size;
a1 = mul_float2(twiddles[k], a1);
a2 = mul_float2(twiddles[k+block_size], a2);
const int dst_ind = ((x - k) * 3) + k;
float2 b1 = a1 + a2;
a2 = twiddle(sin_120*(a1 - a2));
float2 b0 = a0 - (float2)(0.5f)*b1;
smem[dst_ind] = a0 + b1;
smem[dst_ind + block_size] = b0 + a2;
smem[dst_ind + 2*block_size] = b0 - a2;
}
__attribute__((always_inline))
void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size)
{
const int k = x % block_size;
a1 = mul_float2(twiddles[k], a1);
a2 = mul_float2(twiddles[k + block_size], a2);
a3 = mul_float2(twiddles[k+2*block_size], a3);
a4 = mul_float2(twiddles[k+3*block_size], a4);
const int dst_ind = ((x - k) * 5) + k;
__local float2* dst = smem + dst_ind;
float2 b0, b1, b5;
b1 = a1 + a4;
a1 -= a4;
a4 = a3 + a2;
a3 -= a2;
a2 = b1 + a4;
b0 = a0 - (float2)0.25f * a2;
b1 = fft5_2 * (b1 - a4);
a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x);
b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x);
a4.x += fft5_4 * a3.y;
a4.y -= fft5_4 * a3.x;
a1 = b0 + b1;
b0 -= b1;
dst[0] = a0 + a2;
dst[block_size] = a1 + a4;
dst[2 * block_size] = b0 + b5;
dst[3 * block_size] = b0 - b5;
dst[4 * block_size] = a1 - a4;
}
__attribute__((always_inline))
void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
{
float2 a0, a1;
if (x < t)
{
a0 = smem[x];
a1 = smem[x+t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x < t)
butterfly2(a0, a1, smem, twiddles, x, block_size);
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/2;
float2 a0, a1, a2, a3;
if (x1 < t/2)
{
a0 = smem[x1]; a1 = smem[x1+t];
a2 = smem[x2]; a3 = smem[x2+t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/2)
{
butterfly2(a0, a1, smem, twiddles, x1, block_size);
butterfly2(a2, a3, smem, twiddles, x2, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/3;
const int x3 = x1 + 2*t/3;
float2 a0, a1, a2, a3, a4, a5;
if (x1 < t/3)
{
a0 = smem[x1]; a1 = smem[x1+t];
a2 = smem[x2]; a3 = smem[x2+t];
a4 = smem[x3]; a5 = smem[x3+t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/3)
{
butterfly2(a0, a1, smem, twiddles, x1, block_size);
butterfly2(a2, a3, smem, twiddles, x2, block_size);
butterfly2(a4, a5, smem, twiddles, x3, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int thread_block = t/4;
const int x2 = x1 + thread_block;
const int x3 = x1 + 2*thread_block;
const int x4 = x1 + 3*thread_block;
float2 a0, a1, a2, a3, a4, a5, a6, a7;
if (x1 < t/4)
{
a0 = smem[x1]; a1 = smem[x1+t];
a2 = smem[x2]; a3 = smem[x2+t];
a4 = smem[x3]; a5 = smem[x3+t];
a6 = smem[x4]; a7 = smem[x4+t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/4)
{
butterfly2(a0, a1, smem, twiddles, x1, block_size);
butterfly2(a2, a3, smem, twiddles, x2, block_size);
butterfly2(a4, a5, smem, twiddles, x3, block_size);
butterfly2(a6, a7, smem, twiddles, x4, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int thread_block = t/5;
const int x2 = x1 + thread_block;
const int x3 = x1 + 2*thread_block;
const int x4 = x1 + 3*thread_block;
const int x5 = x1 + 4*thread_block;
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
if (x1 < t/5)
{
a0 = smem[x1]; a1 = smem[x1+t];
a2 = smem[x2]; a3 = smem[x2+t];
a4 = smem[x3]; a5 = smem[x3+t];
a6 = smem[x4]; a7 = smem[x4+t];
a8 = smem[x5]; a9 = smem[x5+t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/5)
{
butterfly2(a0, a1, smem, twiddles, x1, block_size);
butterfly2(a2, a3, smem, twiddles, x2, block_size);
butterfly2(a4, a5, smem, twiddles, x3, block_size);
butterfly2(a6, a7, smem, twiddles, x4, block_size);
butterfly2(a8, a9, smem, twiddles, x5, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix4(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
{
float2 a0, a1, a2, a3;
if (x < t)
{
a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x < t)
butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size);
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix4_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/2;
float2 a0, a1, a2, a3, a4, a5, a6, a7;
if (x1 < t/2)
{
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/2)
{
butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix4_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/3;
const int x3 = x2 + t/3;
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
if (x1 < t/3)
{
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/3)
{
butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix8(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
{
const int k = x % block_size;
float2 a0, a1, a2, a3, a4, a5, a6, a7;
if (x < t)
{
int tw_ind = block_size / 8;
a0 = smem[x];
a1 = mul_float2(twiddles[k], smem[x + t]);
a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]);
a3 = mul_float2(twiddles[k+2*block_size],smem[x+3*t]);
a4 = mul_float2(twiddles[k+3*block_size],smem[x+4*t]);
a5 = mul_float2(twiddles[k+4*block_size],smem[x+5*t]);
a6 = mul_float2(twiddles[k+5*block_size],smem[x+6*t]);
a7 = mul_float2(twiddles[k+6*block_size],smem[x+7*t]);
float2 b0, b1, b6, b7;
b0 = a0 + a4;
a4 = a0 - a4;
b1 = a1 + a5;
a5 = a1 - a5;
a5 = (float2)(SQRT_2) * (float2)(a5.x + a5.y, -a5.x + a5.y);
b6 = twiddle(a2 - a6);
a2 = a2 + a6;
b7 = a3 - a7;
b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y);
a3 = a3 + a7;
a0 = b0 + a2;
a2 = b0 - a2;
a1 = b1 + a3;
a3 = twiddle(b1 - a3);
a6 = a4 - b6;
a4 = a4 + b6;
a7 = twiddle(a5 - b7);
a5 = a5 + b7;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x < t)
{
const int dst_ind = ((x - k) << 3) + k;
__local float2* dst = smem + dst_ind;
dst[0] = a0 + a1;
dst[block_size] = a4 + a5;
dst[2 * block_size] = a2 + a3;
dst[3 * block_size] = a6 + a7;
dst[4 * block_size] = a0 - a1;
dst[5 * block_size] = a4 - a5;
dst[6 * block_size] = a2 - a3;
dst[7 * block_size] = a6 - a7;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix3(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
{
float2 a0, a1, a2;
if (x < t)
{
a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x < t)
butterfly3(a0, a1, a2, smem, twiddles, x, block_size);
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix3_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/2;
float2 a0, a1, a2, a3, a4, a5;
if (x1 < t/2)
{
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/2)
{
butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix3_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/3;
const int x3 = x2 + t/3;
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8;
if (x1 < t/3)
{
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/3)
{
butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix3_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int thread_block = t/4;
const int x2 = x1 + thread_block;
const int x3 = x1 + 2*thread_block;
const int x4 = x1 + 3*thread_block;
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
if (x1 < t/4)
{
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/4)
{
butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
butterfly3(a9, a10, a11, smem, twiddles, x4, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix5(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
{
const int k = x % block_size;
float2 a0, a1, a2, a3, a4;
if (x < t)
{
a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x < t)
butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size);
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix5_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1+t/2;
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
if (x1 < t/2)
{
a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t];
a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/2)
{
butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size);
butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
#ifdef DFT_SCALE
#define SCALE_VAL(x, scale) x*scale
#else
#define SCALE_VAL(x, scale) x
#endif
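Editor's note: the kernels below rely on compile-time definitions injected by the host (LOCAL_SIZE, kercn, RADIX_PROCESS, COMPLEX_INPUT/COMPLEX_OUTPUT, IS_1D, NO_CONJUGATE, DFT_SCALE, EVEN). The host code is not part of this hunk, so the sketch below is only a plausible illustration of the build options for a 256-point row factored as 4*4*4*4, with block_size growing by the radix after every stage; it is not the library's actual option builder.
#include <string>

// Hypothetical helper, assumptions only: names, sizes and the RADIX_PROCESS expansion
// are illustrative.
static std::string fftBuildOptionsExample()
{
    // 64 work-items (t = LOCAL_SIZE/kercn), four radix-4 stages, block_size = 1, 4, 16, 64.
    const std::string radixProcess =
        "fft_radix4(smem,twiddles,ind,1,64);"
        "fft_radix4(smem,twiddles,ind,4,64);"
        "fft_radix4(smem,twiddles,ind,16,64);"
        "fft_radix4(smem,twiddles,ind,64,64);";
    return "-D LOCAL_SIZE=256 -D kercn=4 -D COMPLEX_OUTPUT -D RADIX_PROCESS=" + radixProcess;
}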
__kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global float2* twiddles_ptr, const int t, const int nz)
{
const int x = get_global_id(0);
const int y = get_group_id(1);
const int block_size = LOCAL_SIZE/kercn;
if (y < nz)
{
__local float2 smem[LOCAL_SIZE];
__global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = x;
#ifdef IS_1D
float scale = 1.f/dst_cols;
#else
float scale = 1.f/(dst_cols*dst_rows);
#endif
#ifdef COMPLEX_INPUT
__global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset)));
#pragma unroll
for (int i=0; i<kercn; i++)
smem[x+i*block_size] = src[i*block_size];
#else
__global const float* src = (__global const float*)(src_ptr + mad24(y, src_step, mad24(x, (int)sizeof(float), src_offset)));
#pragma unroll
for (int i=0; i<kercn; i++)
smem[x+i*block_size] = (float2)(src[i*block_size], 0.f);
#endif
barrier(CLK_LOCAL_MEM_FENCE);
RADIX_PROCESS;
#ifdef COMPLEX_OUTPUT
#ifdef NO_CONJUGATE
// copy result without complex conjugate
const int cols = dst_cols/2 + 1;
#else
const int cols = dst_cols;
#endif
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
#pragma unroll
for (int i=x; i<cols; i+=block_size)
dst[i] = SCALE_VAL(smem[i], scale);
#else
// pack row to CCS
__local float* smem_1cn = (__local float*) smem;
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
for (int i=x; i<dst_cols-1; i+=block_size)
dst[i+1] = SCALE_VAL(smem_1cn[i+2], scale);
if (x == 0)
dst[0] = SCALE_VAL(smem_1cn[0], scale);
#endif
}
else
{
// zero-fill the remaining rows
#ifdef COMPLEX_OUTPUT
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
#else
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
#endif
#pragma unroll
for (int i=x; i<dst_cols; i+=block_size)
dst[i] = 0.f;
}
}
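Editor's note: when COMPLEX_OUTPUT is not defined, the row kernel above packs each transformed row into OpenCV's CCS layout, dropping the always-zero imaginary parts of Y0 and Y(N/2). A small C++ sketch of that packing for an even row length N, mirroring dst[0] = smem_1cn[0] and dst[i+1] = smem_1cn[i+2]:
#include <complex>
#include <vector>

// CCS row for even N: [Re(Y0), Re(Y1), Im(Y1), Re(Y2), Im(Y2), ..., Re(Y_{N/2})]
static std::vector<float> packRowToCCS(const std::vector<std::complex<float> >& Y)
{
    const int N = (int)Y.size();          // assumed even
    std::vector<float> dst((size_t)N);
    dst[0] = Y[0].real();                 // smem_1cn[0]
    for (int i = 0; i + 1 < N; ++i)       // dst[i+1] = smem_1cn[i+2]
        dst[i + 1] = (i % 2 == 0) ? Y[i / 2 + 1].real() : Y[i / 2 + 1].imag();
    return dst;
}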
__kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global float2* twiddles_ptr, const int t, const int nz)
{
const int x = get_group_id(0);
const int y = get_global_id(1);
if (x < nz)
{
__local float2 smem[LOCAL_SIZE];
__global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset));
__global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = y;
const int block_size = LOCAL_SIZE/kercn;
float scale = 1.f/(dst_rows*dst_cols);
#pragma unroll
for (int i=0; i<kercn; i++)
smem[y+i*block_size] = *((__global const float2*)(src + i*block_size*src_step));
barrier(CLK_LOCAL_MEM_FENCE);
RADIX_PROCESS;
#ifdef COMPLEX_OUTPUT
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset));
#pragma unroll
for (int i=0; i<kercn; i++)
*((__global float2*)(dst + i*block_size*dst_step)) = SCALE_VAL(smem[y + i*block_size], scale);
#else
if (x == 0)
{
// pack first column to CCS
__local float* smem_1cn = (__local float*) smem;
__global uchar* dst = dst_ptr + mad24(y+1, dst_step, dst_offset);
for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
*((__global float*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
if (y == 0)
*((__global float*) (dst_ptr + dst_offset)) = SCALE_VAL(smem_1cn[0], scale);
}
else if (x == (dst_cols+1)/2)
{
// pack last column to CCS (if needed)
__local float* smem_1cn = (__local float*) smem;
__global uchar* dst = dst_ptr + mad24(dst_cols-1, (int)sizeof(float), mad24(y+1, dst_step, dst_offset));
for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
*((__global float*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
if (y == 0)
*((__global float*) (dst_ptr + mad24(dst_cols-1, (int)sizeof(float), dst_offset))) = SCALE_VAL(smem_1cn[0], scale);
}
else
{
__global uchar* dst = dst_ptr + mad24(x, (int)sizeof(float)*2, mad24(y, dst_step, dst_offset - (int)sizeof(float)));
#pragma unroll
for (int i=y; i<dst_rows; i+=block_size, dst+=block_size*dst_step)
vstore2(SCALE_VAL(smem[i], scale), 0, (__global float*) dst);
}
#endif
}
}
__kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global float2* twiddles_ptr, const int t, const int nz)
{
const int x = get_global_id(0);
const int y = get_group_id(1);
const int block_size = LOCAL_SIZE/kercn;
#ifdef IS_1D
const float scale = 1.f/dst_cols;
#else
const float scale = 1.f/(dst_cols*dst_rows);
#endif
if (y < nz)
{
__local float2 smem[LOCAL_SIZE];
__global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = x;
#if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE)
__global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset)));
#pragma unroll
for (int i=0; i<kercn; i++)
{
smem[x+i*block_size].x = src[i*block_size].x;
smem[x+i*block_size].y = -src[i*block_size].y;
}
#else
#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
__global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(float), src_offset)));
#pragma unroll
for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
{
smem[i+1].x = src[i].x;
smem[i+1].y = -src[i].y;
smem[LOCAL_SIZE-i-1] = src[i];
}
#else
#pragma unroll
for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
{
float2 src = vload2(0, (__global const float*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(float), src_offset))));
smem[i+1].x = src.x;
smem[i+1].y = -src.y;
smem[LOCAL_SIZE-i-1] = src;
}
#endif
if (x==0)
{
smem[0].x = *(__global const float*)(src_ptr + mad24(y, src_step, src_offset));
smem[0].y = 0.f;
if(LOCAL_SIZE % 2 ==0)
{
#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x;
#else
smem[LOCAL_SIZE/2].x = *(__global const float*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(float), src_offset)));
#endif
smem[LOCAL_SIZE/2].y = 0.f;
}
}
#endif
barrier(CLK_LOCAL_MEM_FENCE);
RADIX_PROCESS;
// copy data to dst
#ifdef COMPLEX_OUTPUT
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset)));
#pragma unroll
for (int i=0; i<kercn; i++)
{
dst[i*block_size].x = SCALE_VAL(smem[x + i*block_size].x, scale);
dst[i*block_size].y = SCALE_VAL(-smem[x + i*block_size].y, scale);
}
#else
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)), dst_offset)));
#pragma unroll
for (int i=0; i<kercn; i++)
{
dst[i*block_size] = SCALE_VAL(smem[x + i*block_size].x, scale);
}
#endif
}
else
{
// zero-fill the remaining rows
#ifdef COMPLEX_OUTPUT
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
#else
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
#endif
#pragma unroll
for (int i=x; i<dst_cols; i+=block_size)
dst[i] = 0.f;
}
}
__kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global float2* twiddles_ptr, const int t, const int nz)
{
const int x = get_group_id(0);
const int y = get_global_id(1);
#ifdef COMPLEX_INPUT
if (x < nz)
{
__local float2 smem[LOCAL_SIZE];
__global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset));
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset));
__global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = y;
const int block_size = LOCAL_SIZE/kercn;
#pragma unroll
for (int i=0; i<kercn; i++)
{
float2 temp = *((__global const float2*)(src + i*block_size*src_step));
smem[y+i*block_size].x = temp.x;
smem[y+i*block_size].y = -temp.y;
}
barrier(CLK_LOCAL_MEM_FENCE);
RADIX_PROCESS;
// copy data to dst
#pragma unroll
for (int i=0; i<kercn; i++)
{
__global float2* res = (__global float2*)(dst + i*block_size*dst_step);
res[0].x = smem[y + i*block_size].x;
res[0].y = -smem[y + i*block_size].y;
}
}
#else
if (x < nz)
{
__global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = y;
const int block_size = LOCAL_SIZE/kercn;
__local float2 smem[LOCAL_SIZE];
#ifdef EVEN
if (x!=0 && (x!=(nz-1)))
#else
if (x!=0)
#endif
{
__global const uchar* src = src_ptr + mad24(y, src_step, mad24(2*x-1, (int)sizeof(float), src_offset));
#pragma unroll
for (int i=0; i<kercn; i++)
{
float2 temp = vload2(0, (__global const float*)(src + i*block_size*src_step));
smem[y+i*block_size].x = temp.x;
smem[y+i*block_size].y = -temp.y;
}
}
else
{
int ind = x==0 ? 0: 2*x-1;
__global const float* src = (__global const float*)(src_ptr + mad24(1, src_step, mad24(ind, (int)sizeof(float), src_offset)));
int step = src_step/(int)sizeof(float);
#pragma unroll
for (int i=y; i<(LOCAL_SIZE-1)/2; i+=block_size)
{
smem[i+1].x = src[2*i*step];
smem[i+1].y = -src[(2*i+1)*step];
smem[LOCAL_SIZE-i-1].x = src[2*i*step];
smem[LOCAL_SIZE-i-1].y = src[(2*i+1)*step];
}
if (y==0)
{
smem[0].x = *(__global const float*)(src_ptr + mad24(ind, (int)sizeof(float), src_offset));
smem[0].y = 0.f;
if(LOCAL_SIZE % 2 ==0)
{
smem[LOCAL_SIZE/2].x = src[(LOCAL_SIZE-2)*step];
smem[LOCAL_SIZE/2].y = 0.f;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
RADIX_PROCESS;
// copy data to dst
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float2)), dst_offset));
#pragma unroll
for (int i=0; i<kercn; i++)
{
__global float2* res = (__global float2*)(dst + i*block_size*dst_step);
res[0].x = smem[y + i*block_size].x;
res[0].y = -smem[y + i*block_size].y;
}
}
#endif
}
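Editor's note: host-side usage that exercises these kernels; with UMat arguments OpenCV dispatches cv::dft to the OpenCL implementation when the size and type constraints are satisfied.
#include <opencv2/core.hpp>

static void dftRoundTrip(const cv::UMat& realInput /* CV_32FC1 */)
{
    cv::UMat spectrum, restored;
    // forward R2C with full complex output
    cv::dft(realInput, spectrum, cv::DFT_COMPLEX_OUTPUT);
    // inverse C2R, scaled so that restored ~= realInput
    cv::dft(spectrum, restored, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
}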

@ -52,7 +52,7 @@
__kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
#ifdef HAVE_SCALAR
__global const T * src2, __global const T * src3,
__global const srcT1 * src2, __global const srcT1 * src3,
#else
__global const uchar * src2ptr, int src2_step, int src2_offset,
__global const uchar * src3ptr, int src3_step, int src3_offset,
@ -64,31 +64,56 @@ __kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_of
if (x < dst_cols)
{
int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset));
int dst_index = mad24(y0, dst_step, x + dst_offset);
int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(srcT1) * kercn, src1_offset));
int dst_index = mad24(y0, dst_step, mad24(x, colsPerWI, dst_offset));
#ifndef HAVE_SCALAR
int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset));
int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset));
int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(srcT1) * kercn, src2_offset));
int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(srcT1) * kercn, src3_offset));
#endif
for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src1_index += src1_step, dst_index += dst_step)
{
__global const T * src1 = (__global const T *)(src1ptr + src1_index);
#if kercn >= cn && kercn == 4 && depth <= 4 && !defined HAVE_SCALAR
srcT src1 = *(__global const srcT *)(src1ptr + src1_index);
srcT src2 = *(__global const srcT *)(src2ptr + src2_index);
srcT src3 = *(__global const srcT *)(src3ptr + src3_index);
__global dstT * dst = (__global dstT *)(dstptr + dst_index);
#if cn == 1
dst[0] = src2 > src1 || src3 < src1 ? (dstT)(0) : (dstT)(255);
#elif cn == 2
dst[0] = (dstT)(src2.xy > src1.xy || src3.xy < src1.xy ||
src2.zw > src1.zw || src3.zw < src1.zw ? (dstT)(0) : (dstT)(255));
#elif cn == 4
dst[0] = (dstT)(src2.x > src1.x || src3.x < src1.x ||
src2.y > src1.y || src3.y < src1.y ||
src2.z > src1.z || src3.z < src1.z ||
src2.w > src1.w || src3.w < src1.w ? 0 : 255);
#endif
#else
__global const srcT1 * src1 = (__global const srcT1 *)(src1ptr + src1_index);
__global uchar * dst = dstptr + dst_index;
#ifndef HAVE_SCALAR
__global const T * src2 = (__global const T *)(src2ptr + src2_index);
__global const T * src3 = (__global const T *)(src3ptr + src3_index);
__global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);
__global const srcT1 * src3 = (__global const srcT1 *)(src3ptr + src3_index);
#endif
dst[0] = 255;
for (int c = 0; c < cn; ++c)
if (src2[c] > src1[c] || src3[c] < src1[c])
{
dst[0] = 0;
break;
}
#pragma unroll
for (int px = 0; px < colsPerWI; ++px, src1 += cn
#ifndef HAVE_SCALAR
, src2 += cn, src3 += cn
#endif
)
{
dst[px] = 255;
for (int c = 0; c < cn; ++c)
if (src2[c] > src1[c] || src3[c] < src1[c])
{
dst[px] = 0;
break;
}
}
#endif // kercn >= cn
#ifndef HAVE_SCALAR
src2_index += src2_step;
src3_index += src3_step;

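Editor's note: the hunk above only changes how the OpenCL kernel behind cv::inRange vectorizes its loads (srcT1, kercn, colsPerWI); the public call it implements is unchanged:
#include <opencv2/core.hpp>

static void maskGreenish(const cv::UMat& bgr, cv::UMat& mask)
{
    // scalar bounds -> HAVE_SCALAR path; mask is 8U, 255 where all channels are in range
    cv::inRange(bgr, cv::Scalar(0, 100, 0), cv::Scalar(80, 255, 80), mask);
}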
@ -36,114 +36,118 @@
#if lcn == 1
#if dcn == 4
#define LUT_OP(num)\
int idx = *(__global const int *)(srcptr + mad24(num, src_step, src_index));\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx & 0xff];\
dst[1] = lut_l[(idx >> 8) & 0xff];\
dst[2] = lut_l[(idx >> 16) & 0xff];\
#define LUT_OP \
int idx = *(__global const int *)(srcptr + src_index); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx & 0xff]; \
dst[1] = lut_l[(idx >> 8) & 0xff]; \
dst[2] = lut_l[(idx >> 16) & 0xff]; \
dst[3] = lut_l[(idx >> 24) & 0xff];
#elif dcn == 3
#define LUT_OP(num)\
uchar3 idx = vload3(0, srcptr + mad24(num, src_step, src_index));\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx.x];\
dst[1] = lut_l[idx.y];\
#define LUT_OP \
uchar3 idx = vload3(0, srcptr + src_index); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx.x]; \
dst[1] = lut_l[idx.y]; \
dst[2] = lut_l[idx.z];
#elif dcn == 2
#define LUT_OP(num)\
short idx = *(__global const short *)(srcptr + mad24(num, src_step, src_index));\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx & 0xff];\
#define LUT_OP \
short idx = *(__global const short *)(srcptr + src_index); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx & 0xff]; \
dst[1] = lut_l[(idx >> 8) & 0xff];
#elif dcn == 1
#define LUT_OP(num)\
uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
#define LUT_OP \
uchar idx = (srcptr + src_index)[0]; \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx];
#else
#define LUT_OP(num)\
__global const srcT * src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
for (int cn = 0; cn < dcn; ++cn)\
#define LUT_OP \
__global const srcT * src = (__global const srcT *)(srcptr + src_index); \
dst = (__global dstT *)(dstptr + dst_index); \
for (int cn = 0; cn < dcn; ++cn) \
dst[cn] = lut_l[src[cn]];
#endif
#else
#if dcn == 4
#define LUT_OP(num)\
__global const uchar4 *src_pixel = (__global const uchar4 *)(srcptr + mad24(num, src_step, src_index));\
int4 idx = convert_int4(src_pixel[0]) * lcn + (int4)(0, 1, 2, 3);\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx.x];\
dst[1] = lut_l[idx.y];\
dst[2] = lut_l[idx.z];\
#define LUT_OP \
__global const uchar4 * src_pixel = (__global const uchar4 *)(srcptr + src_index); \
int4 idx = mad24(convert_int4(src_pixel[0]), (int4)(lcn), (int4)(0, 1, 2, 3)); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx.x]; \
dst[1] = lut_l[idx.y]; \
dst[2] = lut_l[idx.z]; \
dst[3] = lut_l[idx.w];
#elif dcn == 3
#define LUT_OP(num)\
uchar3 src_pixel = vload3(0, srcptr + mad24(num, src_step, src_index));\
int3 idx = convert_int3(src_pixel) * lcn + (int3)(0, 1, 2);\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx.x];\
dst[1] = lut_l[idx.y];\
#define LUT_OP \
uchar3 src_pixel = vload3(0, srcptr + src_index); \
int3 idx = mad24(convert_int3(src_pixel), (int3)(lcn), (int3)(0, 1, 2)); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx.x]; \
dst[1] = lut_l[idx.y]; \
dst[2] = lut_l[idx.z];
#elif dcn == 2
#define LUT_OP(num)\
__global const uchar2 *src_pixel = (__global const uchar2 *)(srcptr + mad24(num, src_step, src_index));\
int2 idx = convert_int2(src_pixel[0]) * lcn + (int2)(0, 1);\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx.x];\
#define LUT_OP \
__global const uchar2 * src_pixel = (__global const uchar2 *)(srcptr + src_index); \
int2 idx = mad24(convert_int2(src_pixel[0]), lcn, (int2)(0, 1)); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx.x]; \
dst[1] = lut_l[idx.y];
#elif dcn == 1 //error case (1 < lcn) ==> lcn == scn == dcn
#define LUT_OP(num)\
uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
#define LUT_OP \
uchar idx = (srcptr + src_index)[0]; \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx];
#else
#define LUT_OP(num)\
__global const srcT *src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
for (int cn = 0; cn < dcn; ++cn)\
#define LUT_OP \
__global const srcT * src = (__global const srcT *)(srcptr + src_index); \
dst = (__global dstT *)(dstptr + dst_index); \
for (int cn = 0; cn < dcn; ++cn) \
dst[cn] = lut_l[mad24(src[cn], lcn, cn)];
#endif
#endif
#define LOCAL_LUT_INIT\
{\
__global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);\
int init = mad24((int)get_local_id(1), (int)get_local_size(0), (int)get_local_id(0));\
int step = get_local_size(0) * get_local_size(1);\
for (int i = init; i < 256 * lcn; i += step)\
{\
lut_l[i] = lut[i];\
}\
barrier(CLK_LOCAL_MEM_FENCE);\
}
__kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
__global const uchar * lutptr, int lut_step, int lut_offset,
__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)
{
int x = get_global_id(0);
int y = get_global_id(1) << 2;
__local dstT lut_l[256 * lcn];
LOCAL_LUT_INIT;
__global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
int x = get_global_id(0);
int y = 4 * get_global_id(1);
for (int i = mad24((int)get_local_id(1), (int)get_local_size(0), (int)get_local_id(0)),
step = get_local_size(0) * get_local_size(1); i < 256 * lcn; i += step)
lut_l[i] = lut[i];
barrier(CLK_LOCAL_MEM_FENCE);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
__global dstT * dst;
LUT_OP(0);
LUT_OP;
if (y < rows - 1)
{
LUT_OP(1);
src_index += src_step;
dst_index += dst_step;
LUT_OP;
if (y < rows - 2)
{
LUT_OP(2);
src_index += src_step;
dst_index += dst_step;
LUT_OP;
if (y < rows - 3)
{
LUT_OP(3);
src_index += src_step;
dst_index += dst_step;
LUT_OP;
}
}
}
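Editor's note: the LUT hunk above replaces the unrolled LUT_OP(num) macro with explicit src_index/dst_index stepping; from the API side the kernel still implements cv::LUT:
#include <opencv2/core.hpp>

static void invertWithLut(const cv::UMat& src8u, cv::UMat& dst)
{
    cv::Mat table(1, 256, CV_8U);
    for (int i = 0; i < 256; ++i)
        table.at<uchar>(0, i) = (uchar)(255 - i);   // single-channel table (lcn == 1)
    cv::LUT(src8u, table, dst);
}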

@ -42,9 +42,13 @@
#if wdepth <= 4
#define MIN_ABS(a) convertFromU(abs(a))
#define MIN_ABS2(a, b) convertFromU(abs_diff(a, b))
#define MIN(a, b) min(a, b)
#define MAX(a, b) max(a, b)
#else
#define MIN_ABS(a) fabs(a)
#define MIN_ABS2(a, b) fabs(a - b)
#define MIN(a, b) fmin(a, b)
#define MAX(a, b) fmax(a, b)
#endif
#if kercn != 3
@ -60,44 +64,41 @@
#define srcTSIZE (int)sizeof(srcT1)
#endif
#ifdef NEED_MINLOC
#define CALC_MINLOC(inc) minloc = id + inc
#else
#define CALC_MINLOC(inc)
#endif
#ifdef NEED_MAXLOC
#define CALC_MAXLOC(inc) maxloc = id + inc
#else
#define CALC_MAXLOC(inc)
#endif
#ifdef NEED_MINVAL
#ifdef NEED_MINLOC
#define CALC_MIN(p, inc) \
if (minval > temp.p) \
{ \
minval = temp.p; \
CALC_MINLOC(inc); \
minloc = id + inc; \
}
#else
#define CALC_MIN(p, inc) \
minval = MIN(minval, temp.p);
#endif
#else
#define CALC_MIN(p, inc)
#endif
#ifdef NEED_MAXVAL
#ifdef NEED_MAXLOC
#define CALC_MAX(p, inc) \
if (maxval < temp.p) \
{ \
maxval = temp.p; \
CALC_MAXLOC(inc); \
maxloc = id + inc; \
}
#else
#define CALC_MAX(p, inc) \
maxval = MAX(maxval, temp.p);
#endif
#else
#define CALC_MAX(p, inc)
#endif
#ifdef OP_CALC2
#define CALC_MAX2(p) \
if (maxval2 < temp.p) \
maxval2 = temp.p;
maxval2 = MAX(maxval2, temp.p);
#else
#define CALC_MAX2(p)
#endif
@ -208,25 +209,28 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#if kercn == 1
#ifdef NEED_MINVAL
#ifdef NEED_MINLOC
if (minval > temp)
{
minval = temp;
#ifdef NEED_MINLOC
minloc = id;
#endif
}
#else
minval = MIN(minval, temp);
#endif
#endif
#ifdef NEED_MAXVAL
#ifdef NEED_MAXLOC
if (maxval < temp)
{
maxval = temp;
#ifdef NEED_MAXLOC
maxloc = id;
#endif
}
#else
maxval = MAX(maxval, temp);
#endif
#ifdef OP_CALC2
if (maxval2 < temp2)
maxval2 = temp2;
maxval2 = MAX(maxval2, temp2);
#endif
#endif
#elif kercn >= 2
@ -282,32 +286,35 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
{
int lid3 = lid - WGS2_ALIGNED;
#ifdef NEED_MINVAL
#ifdef NEED_MINLOC
if (localmem_min[lid3] >= minval)
{
#ifdef NEED_MINLOC
if (localmem_min[lid3] == minval)
localmem_minloc[lid3] = min(localmem_minloc[lid3], minloc);
else
localmem_minloc[lid3] = minloc,
#endif
localmem_min[lid3] = minval;
localmem_min[lid3] = minval;
}
#else
localmem_min[lid3] = MIN(localmem_min[lid3], minval);
#endif
#endif
#ifdef NEED_MAXVAL
#ifdef NEED_MAXLOC
if (localmem_max[lid3] <= maxval)
{
#ifdef NEED_MAXLOC
if (localmem_max[lid3] == maxval)
localmem_maxloc[lid3] = min(localmem_maxloc[lid3], maxloc);
else
localmem_maxloc[lid3] = maxloc,
#endif
localmem_max[lid3] = maxval;
localmem_max[lid3] = maxval;
}
#else
localmem_max[lid3] = MAX(localmem_max[lid3], maxval);
#endif
#endif
#ifdef OP_CALC2
if (localmem_max2[lid3] < maxval2)
localmem_max2[lid3] = maxval2;
localmem_max2[lid3] = MAX(localmem_max2[lid3], maxval2);
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -319,32 +326,35 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
int lid2 = lsize + lid;
#ifdef NEED_MINVAL
#ifdef NEED_MINLOC
if (localmem_min[lid] >= localmem_min[lid2])
{
#ifdef NEED_MINLOC
if (localmem_min[lid] == localmem_min[lid2])
localmem_minloc[lid] = min(localmem_minloc[lid2], localmem_minloc[lid]);
else
localmem_minloc[lid] = localmem_minloc[lid2],
#endif
localmem_min[lid] = localmem_min[lid2];
localmem_min[lid] = localmem_min[lid2];
}
#else
localmem_min[lid] = MIN(localmem_min[lid], localmem_min[lid2]);
#endif
#endif
#ifdef NEED_MAXVAL
#ifdef NEED_MAXLOC
if (localmem_max[lid] <= localmem_max[lid2])
{
#ifdef NEED_MAXLOC
if (localmem_max[lid] == localmem_max[lid2])
localmem_maxloc[lid] = min(localmem_maxloc[lid2], localmem_maxloc[lid]);
else
localmem_maxloc[lid] = localmem_maxloc[lid2],
#endif
localmem_max[lid] = localmem_max[lid2];
localmem_max[lid] = localmem_max[lid2];
}
#else
localmem_max[lid] = MAX(localmem_max[lid], localmem_max[lid2]);
#endif
#endif
#ifdef OP_CALC2
if (localmem_max2[lid] < localmem_max2[lid2])
localmem_max2[lid] = localmem_max2[lid2];
localmem_max2[lid] = MAX(localmem_max2[lid], localmem_max2[lid2]);
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
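Editor's note: the NEED_MINVAL/NEED_MAXVAL/NEED_MINLOC/NEED_MAXLOC definitions appear to track which output pointers the caller requested; the corresponding public call is:
#include <opencv2/core.hpp>

static void findExtrema(const cv::UMat& img)
{
    double minVal = 0, maxVal = 0;
    cv::Point minLoc, maxLoc;
    // pass 0 for outputs that are not needed (presumably what disables the NEED_* defines)
    cv::minMaxLoc(img, &minVal, &maxVal, &minLoc, &maxLoc);
}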

@ -379,7 +379,7 @@
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = SUM_ABS2(temp, temp2)); \
temp = SUM_ABS2(temp, temp2); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \

@ -81,29 +81,34 @@
#define PROCESS_ELEM(acc, value) acc += value
#elif defined OCL_CV_REDUCE_MAX
#define INIT_VALUE MIN_VAL
#define PROCESS_ELEM(acc, value) acc = value > acc ? value : acc
#define PROCESS_ELEM(acc, value) acc = max(value, acc)
#elif defined OCL_CV_REDUCE_MIN
#define INIT_VALUE MAX_VAL
#define PROCESS_ELEM(acc, value) acc = value < acc ? value : acc
#define PROCESS_ELEM(acc, value) acc = min(value, acc)
#else
#error "No operation is specified"
#endif
#ifdef OP_REDUCE_PRE
__kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
__global uchar * bufptr, int buf_step, int buf_offset)
__kernel void reduce_horz_opt(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
__global uchar * dstptr, int dst_step, int dst_offset
#ifdef OCL_CV_REDUCE_AVG
, float fscale
#endif
)
{
__local bufT lsmem[TILE_HEIGHT][BUF_COLS][cn];
int x = get_global_id(0);
int y = get_global_id(1);
if (x < BUF_COLS)
int liy = get_local_id(1);
if ((x < BUF_COLS) && (y < rows))
{
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * cn, src_offset));
int buf_index = mad24(y, buf_step, mad24(x, (int)sizeof(dstT) * cn, buf_offset));
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
__global dstT * buf = (__global dstT *)(bufptr + buf_index);
dstT tmp[cn] = { INIT_VALUE };
bufT tmp[cn] = { INIT_VALUE };
int src_step_mul = BUF_COLS * cn;
for (int idx = x; idx < cols; idx += BUF_COLS, src += src_step_mul)
@ -111,14 +116,49 @@ __kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int s
#pragma unroll
for (int c = 0; c < cn; ++c)
{
dstT value = convertToDT(src[c]);
bufT value = convertToBufT(src[c]);
PROCESS_ELEM(tmp[c], value);
}
}
#pragma unroll
for (int c = 0; c < cn; ++c)
buf[c] = tmp[c];
lsmem[liy][x][c] = tmp[c];
}
barrier(CLK_LOCAL_MEM_FENCE);
if ((x < BUF_COLS / 2) && (y < rows))
{
#pragma unroll
for (int c = 0; c < cn; ++c)
{
PROCESS_ELEM(lsmem[liy][x][c], lsmem[liy][x + BUF_COLS / 2][c]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if ((x == 0) && (y < rows))
{
int dst_index = mad24(y, dst_step, dst_offset);
__global dstT * dst = (__global dstT *)(dstptr + dst_index);
bufT tmp[cn] = { INIT_VALUE };
#pragma unroll
for (int xin = 0; xin < BUF_COLS / 2; xin ++)
{
#pragma unroll
for (int c = 0; c < cn; ++c)
{
PROCESS_ELEM(tmp[c], lsmem[liy][xin][c]);
}
}
#pragma unroll
for (int c = 0; c < cn; ++c)
#ifdef OCL_CV_REDUCE_AVG
dst[c] = convertToDT(convertToWT(tmp[c]) * fscale);
#else
dst[c] = convertToDT(tmp[c]);
#endif
}
}
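Editor's note: reduce_horz_opt collapses every row into a single value using local memory; the matching public call reduces along dim = 1, e.g. a per-row average:
#include <opencv2/core.hpp>

static void rowMeans(const cv::UMat& src, cv::UMat& rowAvg)
{
    // dim = 1: each row is reduced to one column; REDUCE_AVG presumably corresponds
    // to the OCL_CV_REDUCE_AVG define above
    cv::reduce(src, rowAvg, 1, cv::REDUCE_AVG, CV_32F);
}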

@ -43,20 +43,18 @@
//
//M*/
#if cn != 3
#define loadpix(addr) *(__global const T *)(addr)
#if kercn != 3
#define storepix(val, addr) *(__global T *)(addr) = val
#define TSIZE (int)sizeof(T)
#define scalar scalar_
#else
#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
#define TSIZE ((int)sizeof(T1)*3)
#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
#endif
__kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
ST scalar_, int rowsPerWI)
ST scalar_)
{
int x = get_global_id(0);
int y0 = get_global_id(1) * rowsPerWI;
@ -65,7 +63,35 @@ __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset,
{
int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
#if kercn == cn
#pragma unroll
for (int y = y0, i = 0, y1 = min(rows, y0 + rowsPerWI); i < rowsPerWI; ++y, ++i, src_index += src_step)
if (y < y1)
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
#elif kercn == 4 && cn == 1
if (y0 < rows)
{
storepix(x == y0 >> 2 ? (T)(scalar, 0, 0, 0) : (T)(0), srcptr + src_index);
if (++y0 < rows)
{
src_index += src_step;
storepix(x == y0 >> 2 ? (T)(0, scalar, 0, 0) : (T)(0), srcptr + src_index);
if (++y0 < rows)
{
src_index += src_step;
storepix(x == y0 >> 2 ? (T)(0, 0, scalar, 0) : (T)(0), srcptr + src_index);
if (++y0 < rows)
{
src_index += src_step;
storepix(x == y0 >> 2 ? (T)(0, 0, 0, scalar) : (T)(0), srcptr + src_index);
}
}
}
}
#else
#error "Incorrect combination of cn && kercn"
#endif
}
}
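Editor's note: host-side counterpart of the kernel above; kercn and rowsPerWI are compile-time tuning knobs, the observable behaviour is simply cv::setIdentity:
#include <opencv2/core.hpp>

static void makeScaledIdentity(cv::UMat& m)
{
    m.create(4, 4, CV_32F);
    cv::setIdentity(m, cv::Scalar(2.5));   // diagonal = 2.5, off-diagonal = 0
}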

@ -53,7 +53,7 @@
#define TSIZE ((int)sizeof(T1)*3)
#endif
#define LDS_STEP TILE_DIM
#define LDS_STEP (TILE_DIM + 1)
__kernel void transpose(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset)
@ -90,6 +90,7 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
{
int index_src = mad24(y, src_step, mad24(x, TSIZE, src_offset));
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if (y + i < src_rows)
{
@ -103,6 +104,7 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
{
int index_dst = mad24(y_index, dst_step, mad24(x_index, TSIZE, dst_offset));
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if ((y_index + i) < src_cols)
{
@ -115,18 +117,24 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
__kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_offset, int src_rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1) * rowsPerWI;
if (y < src_rows && x < y)
if (x < y + rowsPerWI)
{
int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));
int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset));
T tmp;
__global const uchar * src = srcptr + src_index;
__global uchar * dst = srcptr + dst_index;
#pragma unroll
for (int i = 0; i < rowsPerWI; ++i, ++y, src_index += src_step, dst_index += TSIZE)
if (y < src_rows && x < y)
{
__global uchar * src = srcptr + src_index;
__global uchar * dst = srcptr + dst_index;
T tmp = loadpix(dst);
storepix(loadpix(src), dst);
storepix(tmp, src);
tmp = loadpix(dst);
storepix(loadpix(src), dst);
storepix(tmp, src);
}
}
}
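Editor's note: the in-place variant above now handles rowsPerWI rows per work-item while still swapping only elements below the diagonal; both variants back the same public call:
#include <opencv2/core.hpp>

static void transposeExamples(const cv::UMat& any, cv::UMat& anyT, cv::UMat& square)
{
    cv::transpose(any, anyT);        // out-of-place kernel
    cv::transpose(square, square);   // square src == dst may take the in-place kernel
}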

@ -479,9 +479,10 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
haveMask = _mask.kind() != _InputArray::NONE,
haveSrc2 = _src2.kind() != _InputArray::NONE;
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1,
kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1,
mcn = std::max(cn, kercn);
CV_Assert(!haveSrc2 || _src2.type() == type);
int convert_cn = haveSrc2 ? mcn : cn;
if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
return false;
@ -513,7 +514,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, mcn, cvt[1]) : "noconvert");
depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1]) : "noconvert");
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
if (k.empty())
@ -918,8 +919,14 @@ static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
isContinuous = _src.isContinuous();
int groups = ocl::Device::getDefault().maxComputeUnits();
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
const ocl::Device &defDev = ocl::Device::getDefault();
int groups = defDev.maxComputeUnits();
if (defDev.isIntel())
{
static const int subSliceEUCount = 10;
groups = (groups / subSliceEUCount) * 2;
}
size_t wgs = defDev.maxWorkGroupSize();
int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
dtype = CV_MAKE_TYPE(ddepth, cn),
@ -1445,6 +1452,9 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
CV_Assert(!haveSrc2 || _src2.type() == type);
if (depth == CV_32S || depth == CV_32F)
return false;
if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
return false;
@ -2178,6 +2188,9 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
(!doubleSupport && depth == CV_64F))
return false;
if( depth == CV_32F && (!_mask.empty() || normType == NORM_INF) )
return false;
UMat src = _src.getUMat();
if (normType == NORM_INF)
@ -2533,7 +2546,7 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
normType &= ~NORM_RELATIVE;
bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
if ( !(normType == NORM_INF || normsum) )
if ( !normsum || !_mask.empty() )
return false;
if (normsum)

@ -48,17 +48,26 @@
#ifdef HAVE_OPENCL
enum OCL_FFT_TYPE
{
R2R = 0,
C2R = 1,
R2C = 2,
C2C = 3
};
namespace cvtest {
namespace ocl {
////////////////////////////////////////////////////////////////////////////
// Dft
PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool)
{
cv::Size dft_size;
int dft_flags, depth;
bool inplace;
int dft_flags, depth, cn, dft_type;
bool hint;
bool is1d;
TEST_DECLARE_INPUT_PARAMETER(src);
TEST_DECLARE_OUTPUT_PARAMETER(dst);
@ -66,34 +75,50 @@ PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
virtual void SetUp()
{
dft_size = GET_PARAM(0);
depth = GET_PARAM(1);
inplace = GET_PARAM(2);
dft_type = GET_PARAM(1);
depth = CV_32F;
dft_flags = 0;
switch (dft_type)
{
case R2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 1; break;
case C2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 2; break;
case R2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 1; break;
case C2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 2; break;
}
if (GET_PARAM(2))
dft_flags |= cv::DFT_INVERSE;
if (GET_PARAM(3))
dft_flags |= cv::DFT_ROWS;
if (GET_PARAM(4))
dft_flags |= cv::DFT_SCALE;
if (GET_PARAM(5))
dft_flags |= cv::DFT_INVERSE;
hint = GET_PARAM(5);
is1d = (dft_flags & DFT_ROWS) != 0 || dft_size.height == 1;
}
void generateTestData(int cn = 2)
void generateTestData()
{
src = randomMat(dft_size, CV_MAKE_TYPE(depth, cn), 0.0, 100.0);
usrc = src.getUMat(ACCESS_READ);
if (inplace)
dst = src, udst = usrc;
}
};
OCL_TEST_P(Dft, C2C)
OCL_TEST_P(Dft, Mat)
{
generateTestData();
OCL_OFF(cv::dft(src, dst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
OCL_ON(cv::dft(usrc, udst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
int nonzero_rows = hint ? src.cols - randomInt(1, src.rows-1) : 0;
OCL_OFF(cv::dft(src, dst, dft_flags, nonzero_rows));
OCL_ON(cv::dft(usrc, udst, dft_flags, nonzero_rows));
// In case of a forward R2C 1D transform, dst contains only half of the output
// without the complex conjugate part
if (dft_type == R2C && is1d && (dft_flags & cv::DFT_INVERSE) == 0)
{
dst = dst(cv::Range(0, dst.rows), cv::Range(0, dst.cols/2 + 1));
udst = udst(cv::Range(0, udst.rows), cv::Range(0, udst.cols/2 + 1));
}
double eps = src.size().area() * 1e-4;
EXPECT_MAT_NEAR(dst, udst, eps);
@ -150,15 +175,15 @@ OCL_TEST_P(MulSpectrums, Mat)
OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool()));
OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(25, 20),
cv::Size(512, 1), cv::Size(1024, 768)),
Values(CV_32F, CV_64F),
Bool(), // inplace
OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(10, 10), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)),
Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R),
Bool(), // DFT_INVERSE
Bool(), // DFT_ROWS
Bool(), // DFT_SCALE
Bool()) // DFT_INVERSE
Bool() // hint
)
);
} } // namespace cvtest::ocl
#endif // HAVE_OPENCL
#endif // HAVE_OPENCL

@ -42,8 +42,8 @@
#include "perf_precomp.hpp"
#ifdef HAVE_OPENCV_LEGACY
# include "opencv2/legacy.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
# include "opencv2/cudalegacy.hpp"
#endif
#ifdef HAVE_OPENCV_CUDAIMGPROC
@ -72,7 +72,7 @@ using namespace perf;
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
#ifdef HAVE_OPENCV_LEGACY
#ifdef HAVE_OPENCV_CUDALEGACY
namespace cv
{
@ -150,7 +150,7 @@ PERF_TEST_P(Video, FGDStatModel,
}
else
{
#ifdef HAVE_OPENCV_LEGACY
#ifdef HAVE_OPENCV_CUDALEGACY
IplImage ipl_frame = frame;
cv::Ptr<CvBGStatModel> model(cvCreateFGDStatModel(&ipl_frame));

@ -42,8 +42,8 @@
#include "test_precomp.hpp"
#ifdef HAVE_OPENCV_LEGACY
# include "opencv2/legacy.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
# include "opencv2/cudalegacy.hpp"
#endif
#ifdef HAVE_CUDA
@ -66,7 +66,7 @@ using namespace cvtest;
//////////////////////////////////////////////////////
// FGDStatModel
#if BUILD_WITH_VIDEO_INPUT_SUPPORT && defined(HAVE_OPENCV_LEGACY)
#if BUILD_WITH_VIDEO_INPUT_SUPPORT && defined(HAVE_OPENCV_CUDALEGACY)
namespace cv
{

@ -6,7 +6,7 @@ set(the_description "CUDA-accelerated Video Encoding/Decoding")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef)
ocv_add_module(cudacodec opencv_highgui OPTIONAL opencv_cudev)
ocv_add_module(cudacodec opencv_core opencv_videoio OPTIONAL opencv_cudev)
ocv_module_include_directories()
ocv_glob_module_sources()

@ -45,34 +45,12 @@
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "cuda/disparity_bilateral_filter.hpp"
namespace cv { namespace cuda { namespace device
{
namespace disp_bilateral_filter
{
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
__constant__ int cndisp;
__constant__ int cradius;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
}
template <int channels>
struct DistRgbMax
{
@ -95,7 +73,11 @@ namespace cv { namespace cuda { namespace device
};
template <int channels, typename T>
__global__ void disp_bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
__global__ void disp_bilateral_filter(int t, T* disp, size_t disp_step,
const uchar* img, size_t img_step, int h, int w,
const float* ctable_color, const float * ctable_space, size_t ctable_space_step,
int cradius,
short cedge_disc, short cmax_disc)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
@ -178,7 +160,7 @@ namespace cv { namespace cuda { namespace device
}
template <typename T>
void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream)
void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, const float *table_color, const float* table_space, size_t table_step, int radius, short edge_disc, short max_disc, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@ -190,20 +172,20 @@ namespace cv { namespace cuda { namespace device
case 1:
for (int i = 0; i < iters; ++i)
{
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() );
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() );
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() );
}
break;
@ -215,8 +197,8 @@ namespace cv { namespace cuda { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
template void disp_bilateral_filter<uchar>(PtrStepSz<uchar> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
template void disp_bilateral_filter<short>(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
template void disp_bilateral_filter<uchar>(PtrStepSz<uchar> disp, PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, short, short, cudaStream_t stream);
template void disp_bilateral_filter<short>(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, short, short, cudaStream_t stream);
} // namespace bilateral_filter
}}} // namespace cv { namespace cuda { namespace cudev

@ -0,0 +1,8 @@
namespace cv { namespace cuda { namespace device
{
namespace disp_bilateral_filter
{
template<typename T>
void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, const float *, const float *, size_t, int radius, short edge_disc, short max_disc, cudaStream_t stream);
}
}}}

@ -48,109 +48,61 @@
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "cuda/stereocsbp.hpp"
namespace cv { namespace cuda { namespace device
{
namespace stereocsbp
{
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
///////////////////////////////////////////////////////////////
__constant__ int cndisp;
__constant__ float cmax_data_term;
__constant__ float cdata_weight;
__constant__ float cmax_disc_term;
__constant__ float cdisc_single_jump;
__constant__ int cth;
__constant__ size_t cimg_step;
__constant__ size_t cmsg_step;
__constant__ size_t cdisp_step1;
__constant__ size_t cdisp_step2;
__constant__ uchar* cleft;
__constant__ uchar* cright;
__constant__ uchar* ctemp;
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp)
{
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cleft, &left.data, sizeof(left.data)) );
cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );
cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );
}
///////////////////////////////////////////////////////////////
/////////////////////// init data cost ////////////////////////
///////////////////////////////////////////////////////////////
template <int channels> struct DataCostPerPixel;
template <> struct DataCostPerPixel<1>
template <int channels> static float __device__ pixeldiff(const uchar* left, const uchar* right, float max_data_term);
template<> __device__ __forceinline__ static float pixeldiff<1>(const uchar* left, const uchar* right, float max_data_term)
{
static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
{
return fmin(cdata_weight * ::abs((int)*left - *right), cdata_weight * cmax_data_term);
}
};
template <> struct DataCostPerPixel<3>
return fmin( ::abs((int)*left - *right), max_data_term);
}
template<> __device__ __forceinline__ static float pixeldiff<3>(const uchar* left, const uchar* right, float max_data_term)
{
static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
{
float tb = 0.114f * ::abs((int)left[0] - right[0]);
float tg = 0.587f * ::abs((int)left[1] - right[1]);
float tr = 0.299f * ::abs((int)left[2] - right[2]);
float tb = 0.114f * ::abs((int)left[0] - right[0]);
float tg = 0.587f * ::abs((int)left[1] - right[1]);
float tr = 0.299f * ::abs((int)left[2] - right[2]);
return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
}
};
template <> struct DataCostPerPixel<4>
return fmin(tr + tg + tb, max_data_term);
}
template<> __device__ __forceinline__ static float pixeldiff<4>(const uchar* left, const uchar* right, float max_data_term)
{
static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
{
uchar4 l = *((const uchar4*)left);
uchar4 r = *((const uchar4*)right);
uchar4 l = *((const uchar4*)left);
uchar4 r = *((const uchar4*)right);
float tb = 0.114f * ::abs((int)l.x - r.x);
float tg = 0.587f * ::abs((int)l.y - r.y);
float tr = 0.299f * ::abs((int)l.z - r.z);
float tb = 0.114f * ::abs((int)l.x - r.x);
float tg = 0.587f * ::abs((int)l.y - r.y);
float tr = 0.299f * ::abs((int)l.z - r.z);
return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
}
};
return fmin(tr + tg + tb, max_data_term);
}
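Editor's note: written out, the data term the refactored pixeldiff helpers compute (data_weight is applied by the caller, as in init_data_cost below) is, for 3- and 4-channel images,
D(p,d) \;=\; \mathrm{data\_weight}\cdot\min\bigl(0.114\,\lvert B_L - B_R\rvert \;+\; 0.587\,\lvert G_L - G_R\rvert \;+\; 0.299\,\lvert R_L - R_R\rvert,\ \mathrm{max\_data\_term}\bigr)
with the single-channel specialization using \lvert I_L - I_R\rvert in place of the weighted sum.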
template <typename T>
__global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)
__global__ void get_first_k_initial_global(uchar *ctemp, T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane, int ndisp,
size_t msg_step, size_t disp_step)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < h && x < w)
{
T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
T* data_cost = (T*)ctemp + y * cmsg_step + x;
T* selected_disparity = selected_disp_pyr + y * msg_step + x;
T* data_cost_selected = data_cost_selected_ + y * msg_step + x;
T* data_cost = (T*)ctemp + y * msg_step + x;
for(int i = 0; i < nr_plane; i++)
{
T minimum = device::numeric_limits<T>::max();
int id = 0;
for(int d = 0; d < cndisp; d++)
for(int d = 0; d < ndisp; d++)
{
T cur = data_cost[d * cdisp_step1];
T cur = data_cost[d * disp_step];
if(cur < minimum)
{
minimum = cur;
@ -158,46 +110,47 @@ namespace cv { namespace cuda { namespace device
}
}
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost [id * cdisp_step1] = numeric_limits<T>::max();
data_cost_selected[i * disp_step] = minimum;
selected_disparity[i * disp_step] = id;
data_cost [id * disp_step] = numeric_limits<T>::max();
}
}
}
template <typename T>
__global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
__global__ void get_first_k_initial_local(uchar *ctemp, T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane, int ndisp,
size_t msg_step, size_t disp_step)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < h && x < w)
{
T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
T* data_cost = (T*)ctemp + y * cmsg_step + x;
T* selected_disparity = selected_disp_pyr + y * msg_step + x;
T* data_cost_selected = data_cost_selected_ + y * msg_step + x;
T* data_cost = (T*)ctemp + y * msg_step + x;
int nr_local_minimum = 0;
T prev = data_cost[0 * cdisp_step1];
T cur = data_cost[1 * cdisp_step1];
T next = data_cost[2 * cdisp_step1];
T prev = data_cost[0 * disp_step];
T cur = data_cost[1 * disp_step];
T next = data_cost[2 * disp_step];
for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
for (int d = 1; d < ndisp - 1 && nr_local_minimum < nr_plane; d++)
{
if (cur < prev && cur < next)
{
data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
selected_disparity[nr_local_minimum * cdisp_step1] = d;
data_cost_selected[nr_local_minimum * disp_step] = cur;
selected_disparity[nr_local_minimum * disp_step] = d;
data_cost[d * cdisp_step1] = numeric_limits<T>::max();
data_cost[d * disp_step] = numeric_limits<T>::max();
nr_local_minimum++;
}
prev = cur;
cur = next;
next = data_cost[(d + 1) * cdisp_step1];
next = data_cost[(d + 1) * disp_step];
}
for (int i = nr_local_minimum; i < nr_plane; i++)
@ -205,25 +158,27 @@ namespace cv { namespace cuda { namespace device
T minimum = numeric_limits<T>::max();
int id = 0;
for (int d = 0; d < cndisp; d++)
for (int d = 0; d < ndisp; d++)
{
cur = data_cost[d * cdisp_step1];
cur = data_cost[d * disp_step];
if (cur < minimum)
{
minimum = cur;
id = d;
}
}
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost_selected[i * disp_step] = minimum;
selected_disparity[i * disp_step] = id;
data_cost[id * cdisp_step1] = numeric_limits<T>::max();
data_cost[id * disp_step] = numeric_limits<T>::max();
}
}
}
template <typename T, int channels>
__global__ void init_data_cost(int h, int w, int level)
__global__ void init_data_cost(const uchar *cleft, const uchar *cright, uchar *ctemp, size_t cimg_step,
int h, int w, int level, int ndisp, float data_weight, float max_data_term,
int min_disp, size_t msg_step, size_t disp_step)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -236,9 +191,9 @@ namespace cv { namespace cuda { namespace device
int x0 = x << level;
int xt = (x + 1) << level;
T* data_cost = (T*)ctemp + y * cmsg_step + x;
T* data_cost = (T*)ctemp + y * msg_step + x;
for(int d = 0; d < cndisp; ++d)
for(int d = 0; d < ndisp; ++d)
{
float val = 0.0f;
for(int yi = y0; yi < yt; yi++)
@ -246,24 +201,26 @@ namespace cv { namespace cuda { namespace device
for(int xi = x0; xi < xt; xi++)
{
int xr = xi - d;
if(d < cth || xr < 0)
val += cdata_weight * cmax_data_term;
if(d < min_disp || xr < 0)
val += data_weight * max_data_term;
else
{
const uchar* lle = cleft + yi * cimg_step + xi * channels;
const uchar* lri = cright + yi * cimg_step + xr * channels;
val += DataCostPerPixel<channels>::compute(lle, lri);
val += data_weight * pixeldiff<channels>(lle, lri, max_data_term);
}
}
}
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
data_cost[disp_step * d] = saturate_cast<T>(val);
}
}
}
template <typename T, int winsz, int channels>
__global__ void init_data_cost_reduce(int level, int rows, int cols, int h)
__global__ void init_data_cost_reduce(const uchar *cleft, const uchar *cright, uchar *ctemp, size_t cimg_step,
int level, int rows, int cols, int h, int ndisp, float data_weight, float max_data_term,
int min_disp, size_t msg_step, size_t disp_step)
{
int x_out = blockIdx.x;
int y_out = blockIdx.y % h;
@ -271,7 +228,7 @@ namespace cv { namespace cuda { namespace device
int tid = threadIdx.x;
if (d < cndisp)
if (d < ndisp)
{
int x0 = x_out << level;
int y0 = y_out << level;
@ -281,8 +238,8 @@ namespace cv { namespace cuda { namespace device
float val = 0.0f;
if (x0 + tid < cols)
{
if (x0 + tid - d < 0 || d < cth)
val = cdata_weight * cmax_data_term * len;
if (x0 + tid - d < 0 || d < min_disp)
val = data_weight * max_data_term * len;
else
{
const uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );
@ -290,7 +247,7 @@ namespace cv { namespace cuda { namespace device
for(int y = 0; y < len; ++y)
{
val += DataCostPerPixel<channels>::compute(lle, lri);
val += data_weight * pixeldiff<channels>(lle, lri, max_data_term);
lle += cimg_step;
lri += cimg_step;
@ -302,16 +259,16 @@ namespace cv { namespace cuda { namespace device
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;
T* data_cost = (T*)ctemp + y_out * msg_step + x_out;
if (tid == 0)
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
data_cost[disp_step * d] = saturate_cast<T>(val);
}
}
template <typename T>
void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)
void init_data_cost_caller_(const uchar *cleft, const uchar *cright, uchar *ctemp, size_t cimg_step, int /*rows*/, int /*cols*/, int h, int w, int level, int ndisp, int channels, float data_weight, float max_data_term, int min_disp, size_t msg_step, size_t disp_step, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@ -321,15 +278,15 @@ namespace cv { namespace cuda { namespace device
switch (channels)
{
case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;
case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;
case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;
case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(cleft, cright, ctemp, cimg_step, h, w, level, ndisp, data_weight, max_data_term, min_disp, msg_step, disp_step); break;
case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(cleft, cright, ctemp, cimg_step, h, w, level, ndisp, data_weight, max_data_term, min_disp, msg_step, disp_step); break;
case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(cleft, cright, ctemp, cimg_step, h, w, level, ndisp, data_weight, max_data_term, min_disp, msg_step, disp_step); break;
default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
}
template <typename T, int winsz>
void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)
void init_data_cost_reduce_caller_(const uchar *cleft, const uchar *cright, uchar *ctemp, size_t cimg_step, int rows, int cols, int h, int w, int level, int ndisp, int channels, float data_weight, float max_data_term, int min_disp, size_t msg_step, size_t disp_step, cudaStream_t stream)
{
const int threadsNum = 256;
const size_t smem_size = threadsNum * sizeof(float);
@ -340,19 +297,19 @@ namespace cv { namespace cuda { namespace device
switch (channels)
{
case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(cleft, cright, ctemp, cimg_step, level, rows, cols, h, ndisp, data_weight, max_data_term, min_disp, msg_step, disp_step); break;
case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(cleft, cright, ctemp, cimg_step, level, rows, cols, h, ndisp, data_weight, max_data_term, min_disp, msg_step, disp_step); break;
case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(cleft, cright, ctemp, cimg_step, level, rows, cols, h, ndisp, data_weight, max_data_term, min_disp, msg_step, disp_step); break;
default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
}
template<class T>
void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
void init_data_cost(const uchar *cleft, const uchar *cright, uchar *ctemp, size_t cimg_step, int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, float data_weight, float max_data_term, int min_disp, bool use_local_init_data_cost, cudaStream_t stream)
{
typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream);
typedef void (*InitDataCostCaller)(const uchar *cleft, const uchar *cright, uchar *ctemp, size_t cimg_step, int cols, int rows, int w, int h, int level, int ndisp, int channels, float data_weight, float max_data_term, int min_disp, size_t msg_step, size_t disp_step, cudaStream_t stream);
static const InitDataCostCaller init_data_cost_callers[] =
{
@ -362,10 +319,8 @@ namespace cv { namespace cuda { namespace device
};
size_t disp_step = msg_step * h;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
init_data_cost_callers[level](cleft, cright, ctemp, cimg_step, rows, cols, h, w, level, ndisp, channels, data_weight, max_data_term, min_disp, msg_step, disp_step, stream);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
@ -378,9 +333,9 @@ namespace cv { namespace cuda { namespace device
grid.y = divUp(h, threads.y);
if (use_local_init_data_cost == true)
get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
get_first_k_initial_local<<<grid, threads, 0, stream>>> (ctemp, data_cost_selected, disp_selected_pyr, h, w, nr_plane, ndisp, msg_step, disp_step);
else
get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
get_first_k_initial_global<<<grid, threads, 0, stream>>>(ctemp, data_cost_selected, disp_selected_pyr, h, w, nr_plane, ndisp, msg_step, disp_step);
cudaSafeCall( cudaGetLastError() );
@ -388,18 +343,18 @@ namespace cv { namespace cuda { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
template void init_data_cost<short>(const uchar *cleft, const uchar *cright, uchar *ctemp, size_t cimg_step, int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, float data_weight, float max_data_term, int min_disp, bool use_local_init_data_cost, cudaStream_t stream);
template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
template void init_data_cost<float>(const uchar *cleft, const uchar *cright, uchar *ctemp, size_t cimg_step, int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, float data_weight, float max_data_term, int min_disp, bool use_local_init_data_cost, cudaStream_t stream);
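Editorial sketch (not part of the patch): the theme of these hunks is replacing the __constant__ device symbols (cdisp_step1, cmsg_step, cdata_weight, cmax_data_term, cth, ...) — which every call had to set up front with cudaMemcpyToSymbol — with ordinary kernel and function arguments. A minimal CUDA illustration of that before/after pattern follows; the kernel and variable names are hypothetical and only stand in for the real stereocsbp ones.

#include <cuda_runtime.h>

// Old style: the row step lives in __constant__ memory, so a host-side
// cudaMemcpyToSymbol must precede every launch and all launches share it.
__constant__ size_t c_step;

__global__ void scale_rows_old(float* data, int rows, int cols, float s)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < cols && y < rows)
        data[y * c_step + x] *= s;
}

// New style (what this patch does): the step is a plain argument, so no
// global state has to be written before the launch.
__global__ void scale_rows_new(float* data, size_t step, int rows, int cols, float s)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < cols && y < rows)
        data[y * step + x] *= s;
}

int main()
{
    const int rows = 4, cols = 8;
    const size_t step = cols;                        // elements per row
    float* d_data = 0;
    cudaMalloc(&d_data, rows * cols * sizeof(float));
    cudaMemset(d_data, 0, rows * cols * sizeof(float));

    dim3 threads(32, 8), grid(1, 1);

    cudaMemcpyToSymbol(c_step, &step, sizeof(step));            // old: upload first
    scale_rows_old<<<grid, threads>>>(d_data, rows, cols, 2.f);

    scale_rows_new<<<grid, threads>>>(d_data, step, rows, cols, 2.f);  // new: just pass it

    cudaDeviceSynchronize();
    cudaFree(d_data);
    return 0;
}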
///////////////////////////////////////////////////////////////
////////////////////// compute data cost //////////////////////
///////////////////////////////////////////////////////////////
template <typename T, int channels>
__global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)
__global__ void compute_data_cost(const uchar *cleft, const uchar *cright, size_t cimg_step, const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane, float data_weight, float max_data_term, int min_disp, size_t msg_step, size_t disp_step1, size_t disp_step2)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -412,8 +367,8 @@ namespace cv { namespace cuda { namespace device
int x0 = x << level;
int xt = (x + 1) << level;
const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step + x/2;
T* data_cost = data_cost_ + y * cmsg_step + x;
const T* selected_disparity = selected_disp_pyr + y/2 * msg_step + x/2;
T* data_cost = data_cost_ + y * msg_step + x;
for(int d = 0; d < nr_plane; d++)
{
@ -422,27 +377,27 @@ namespace cv { namespace cuda { namespace device
{
for(int xi = x0; xi < xt; xi++)
{
int sel_disp = selected_disparity[d * cdisp_step2];
int sel_disp = selected_disparity[d * disp_step2];
int xr = xi - sel_disp;
if (xr < 0 || sel_disp < cth)
val += cdata_weight * cmax_data_term;
if (xr < 0 || sel_disp < min_disp)
val += data_weight * max_data_term;
else
{
const uchar* left_x = cleft + yi * cimg_step + xi * channels;
const uchar* right_x = cright + yi * cimg_step + xr * channels;
val += DataCostPerPixel<channels>::compute(left_x, right_x);
val += data_weight * pixeldiff<channels>(left_x, right_x, max_data_term);
}
}
}
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
data_cost[disp_step1 * d] = saturate_cast<T>(val);
}
}
}
template <typename T, int winsz, int channels>
__global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)
__global__ void compute_data_cost_reduce(const uchar *cleft, const uchar *cright, size_t cimg_step, const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane, float data_weight, float max_data_term, int min_disp, size_t msg_step, size_t disp_step1, size_t disp_step2)
{
int x_out = blockIdx.x;
int y_out = blockIdx.y % h;
@ -450,12 +405,12 @@ namespace cv { namespace cuda { namespace device
int tid = threadIdx.x;
const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step + x_out/2;
T* data_cost = data_cost_ + y_out * cmsg_step + x_out;
const T* selected_disparity = selected_disp_pyr + y_out/2 * msg_step + x_out/2;
T* data_cost = data_cost_ + y_out * msg_step + x_out;
if (d < nr_plane)
{
int sel_disp = selected_disparity[d * cdisp_step2];
int sel_disp = selected_disparity[d * disp_step2];
int x0 = x_out << level;
int y0 = y_out << level;
@ -465,8 +420,8 @@ namespace cv { namespace cuda { namespace device
float val = 0.0f;
if (x0 + tid < cols)
{
if (x0 + tid - sel_disp < 0 || sel_disp < cth)
val = cdata_weight * cmax_data_term * len;
if (x0 + tid - sel_disp < 0 || sel_disp < min_disp)
val = data_weight * max_data_term * len;
else
{
const uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );
@ -474,7 +429,7 @@ namespace cv { namespace cuda { namespace device
for(int y = 0; y < len; ++y)
{
val += DataCostPerPixel<channels>::compute(lle, lri);
val += data_weight * pixeldiff<channels>(lle, lri, max_data_term);
lle += cimg_step;
lri += cimg_step;
@ -487,13 +442,13 @@ namespace cv { namespace cuda { namespace device
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
if (tid == 0)
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
data_cost[disp_step1 * d] = saturate_cast<T>(val);
}
}
template <typename T>
void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
void compute_data_cost_caller_(const uchar *cleft, const uchar *cright, size_t cimg_step, const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
int h, int w, int level, int nr_plane, int channels, float data_weight, float max_data_term, int min_disp, size_t msg_step, size_t disp_step1, size_t disp_step2, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@ -503,16 +458,16 @@ namespace cv { namespace cuda { namespace device
switch(channels)
{
case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(cleft, cright, cimg_step, disp_selected_pyr, data_cost, h, w, level, nr_plane, data_weight, max_data_term, min_disp, msg_step, disp_step1, disp_step2); break;
case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(cleft, cright, cimg_step, disp_selected_pyr, data_cost, h, w, level, nr_plane, data_weight, max_data_term, min_disp, msg_step, disp_step1, disp_step2); break;
case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(cleft, cright, cimg_step, disp_selected_pyr, data_cost, h, w, level, nr_plane, data_weight, max_data_term, min_disp, msg_step, disp_step1, disp_step2); break;
default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
}
template <typename T, int winsz>
void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
void compute_data_cost_reduce_caller_(const uchar *cleft, const uchar *cright, size_t cimg_step, const T* disp_selected_pyr, T* data_cost, int rows, int cols,
int h, int w, int level, int nr_plane, int channels, float data_weight, float max_data_term, int min_disp, size_t msg_step, size_t disp_step1, size_t disp_step2, cudaStream_t stream)
{
const int threadsNum = 256;
const size_t smem_size = threadsNum * sizeof(float);
@ -523,19 +478,20 @@ namespace cv { namespace cuda { namespace device
switch (channels)
{
case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(cleft, cright, cimg_step, disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane, data_weight, max_data_term, min_disp, msg_step, disp_step1, disp_step2); break;
case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(cleft, cright, cimg_step, disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane, data_weight, max_data_term, min_disp, msg_step, disp_step1, disp_step2); break;
case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(cleft, cright, cimg_step, disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane, data_weight, max_data_term, min_disp, msg_step, disp_step1, disp_step2); break;
default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
}
template<class T>
void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
void compute_data_cost(const uchar *cleft, const uchar *cright, size_t cimg_step, const T* disp_selected_pyr, T* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, float data_weight, float max_data_term,
int min_disp, cudaStream_t stream)
{
typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);
typedef void (*ComputeDataCostCaller)(const uchar *cleft, const uchar *cright, size_t cimg_step, const T* disp_selected_pyr, T* data_cost, int rows, int cols,
int h, int w, int level, int nr_plane, int channels, float data_weight, float max_data_term, int min_disp, size_t msg_step, size_t disp_step1, size_t disp_step2, cudaStream_t stream);
static const ComputeDataCostCaller callers[] =
{
@ -546,22 +502,19 @@ namespace cv { namespace cuda { namespace device
size_t disp_step1 = msg_step * h;
size_t disp_step2 = msg_step * h2;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
callers[level](cleft, cright, cimg_step, disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, data_weight, max_data_term, min_disp, msg_step, disp_step1, disp_step2, stream);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
template void compute_data_cost(const uchar *cleft, const uchar *cright, size_t cimg_step, const short* disp_selected_pyr, short* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, float data_weight, float max_data_term, int min_disp, cudaStream_t stream);
template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
template void compute_data_cost(const uchar *cleft, const uchar *cright, size_t cimg_step, const float* disp_selected_pyr, float* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, float data_weight, float max_data_term, int min_disp, cudaStream_t stream);
///////////////////////////////////////////////////////////////
@ -574,7 +527,7 @@ namespace cv { namespace cuda { namespace device
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,
const T* data_cost_cur, const T* disparity_selected_cur,
int nr_plane, int nr_plane2)
int nr_plane, int nr_plane2, size_t disp_step1, size_t disp_step2)
{
for(int i = 0; i < nr_plane; i++)
{
@ -582,7 +535,7 @@ namespace cv { namespace cuda { namespace device
int id = 0;
for(int j = 0; j < nr_plane2; j++)
{
T cur = data_cost_new[j * cdisp_step1];
T cur = data_cost_new[j * disp_step1];
if(cur < minimum)
{
minimum = cur;
@ -590,70 +543,72 @@ namespace cv { namespace cuda { namespace device
}
}
data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];
disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];
data_cost_selected[i * disp_step1] = data_cost_cur[id * disp_step1];
disparity_selected_new[i * disp_step1] = disparity_selected_cur[id * disp_step2];
u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
u_new[i * disp_step1] = u_cur[id * disp_step2];
d_new[i * disp_step1] = d_cur[id * disp_step2];
l_new[i * disp_step1] = l_cur[id * disp_step2];
r_new[i * disp_step1] = r_cur[id * disp_step2];
data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();
data_cost_new[id * disp_step1] = numeric_limits<T>::max();
}
}
template <typename T>
__global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,
__global__ void init_message(uchar *ctemp, T* u_new_, T* d_new_, T* l_new_, T* r_new_,
const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
T* data_cost_selected_, const T* data_cost_,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2)
int h, int w, int nr_plane, int h2, int w2, int nr_plane2,
size_t msg_step, size_t disp_step1, size_t disp_step2)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < h && x < w)
{
const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * cmsg_step + x/2;
const T* d_cur = d_cur_ + ::max(0, y/2 - 1) * cmsg_step + x/2;
const T* l_cur = l_cur_ + (y/2) * cmsg_step + ::min(w2-1, x/2 + 1);
const T* r_cur = r_cur_ + (y/2) * cmsg_step + ::max(0, x/2 - 1);
const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * msg_step + x/2;
const T* d_cur = d_cur_ + ::max(0, y/2 - 1) * msg_step + x/2;
const T* l_cur = l_cur_ + (y/2) * msg_step + ::min(w2-1, x/2 + 1);
const T* r_cur = r_cur_ + (y/2) * msg_step + ::max(0, x/2 - 1);
T* data_cost_new = (T*)ctemp + y * cmsg_step + x;
T* data_cost_new = (T*)ctemp + y * msg_step + x;
const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step + x/2;
const T* data_cost = data_cost_ + y * cmsg_step + x;
const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * msg_step + x/2;
const T* data_cost = data_cost_ + y * msg_step + x;
for(int d = 0; d < nr_plane2; d++)
{
int idx2 = d * cdisp_step2;
int idx2 = d * disp_step2;
T val = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
data_cost_new[d * cdisp_step1] = val;
T val = data_cost[d * disp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
data_cost_new[d * disp_step1] = val;
}
T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
T* disparity_selected_new = selected_disp_pyr_new + y * cmsg_step + x;
T* data_cost_selected = data_cost_selected_ + y * msg_step + x;
T* disparity_selected_new = selected_disp_pyr_new + y * msg_step + x;
T* u_new = u_new_ + y * cmsg_step + x;
T* d_new = d_new_ + y * cmsg_step + x;
T* l_new = l_new_ + y * cmsg_step + x;
T* r_new = r_new_ + y * cmsg_step + x;
T* u_new = u_new_ + y * msg_step + x;
T* d_new = d_new_ + y * msg_step + x;
T* l_new = l_new_ + y * msg_step + x;
T* r_new = r_new_ + y * msg_step + x;
u_cur = u_cur_ + y/2 * cmsg_step + x/2;
d_cur = d_cur_ + y/2 * cmsg_step + x/2;
l_cur = l_cur_ + y/2 * cmsg_step + x/2;
r_cur = r_cur_ + y/2 * cmsg_step + x/2;
u_cur = u_cur_ + y/2 * msg_step + x/2;
d_cur = d_cur_ + y/2 * msg_step + x/2;
l_cur = l_cur_ + y/2 * msg_step + x/2;
r_cur = r_cur_ + y/2 * msg_step + x/2;
get_first_k_element_increase(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
data_cost_selected, disparity_selected_new, data_cost_new,
data_cost, disparity_selected_cur, nr_plane, nr_plane2);
data_cost, disparity_selected_cur, nr_plane, nr_plane2,
disp_step1, disp_step2);
}
}
template<class T>
void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
void init_message(uchar *ctemp, T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
T* data_cost_selected, const T* data_cost, size_t msg_step,
@ -662,9 +617,6 @@ namespace cv { namespace cuda { namespace device
size_t disp_step1 = msg_step * h;
size_t disp_step2 = msg_step * h2;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@ -672,11 +624,12 @@ namespace cv { namespace cuda { namespace device
grid.x = divUp(w, threads.x);
grid.y = divUp(h, threads.y);
init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
init_message<<<grid, threads, 0, stream>>>(ctemp, u_new, d_new, l_new, r_new,
u_cur, d_cur, l_cur, r_cur,
selected_disp_pyr_new, selected_disp_pyr_cur,
data_cost_selected, data_cost,
h, w, nr_plane, h2, w2, nr_plane2);
h, w, nr_plane, h2, w2, nr_plane2,
msg_step, disp_step1, disp_step2);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
@ -684,13 +637,13 @@ namespace cv { namespace cuda { namespace device
}
template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
template void init_message(uchar *ctemp, short* u_new, short* d_new, short* l_new, short* r_new,
const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,
short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,
short* data_cost_selected, const short* data_cost, size_t msg_step,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
template void init_message(uchar *ctemp, float* u_new, float* d_new, float* l_new, float* r_new,
const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
float* data_cost_selected, const float* data_cost, size_t msg_step,
@ -702,13 +655,14 @@ namespace cv { namespace cuda { namespace device
template <typename T>
__device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
const T* dst_disp, const T* src_disp, int nr_plane, volatile T* temp)
const T* dst_disp, const T* src_disp, int nr_plane, int max_disc_term, float disc_single_jump, volatile T* temp,
size_t disp_step)
{
T minimum = numeric_limits<T>::max();
for(int d = 0; d < nr_plane; d++)
{
int idx = d * cdisp_step1;
int idx = d * disp_step;
T val = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];
if(val < minimum)
@ -720,55 +674,53 @@ namespace cv { namespace cuda { namespace device
float sum = 0;
for(int d = 0; d < nr_plane; d++)
{
float cost_min = minimum + cmax_disc_term;
T src_disp_reg = src_disp[d * cdisp_step1];
float cost_min = minimum + max_disc_term;
T src_disp_reg = src_disp[d * disp_step];
for(int d2 = 0; d2 < nr_plane; d2++)
cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));
cost_min = fmin(cost_min, msg_dst[d2 * disp_step] + disc_single_jump * ::abs(dst_disp[d2 * disp_step] - src_disp_reg));
temp[d * cdisp_step1] = saturate_cast<T>(cost_min);
temp[d * disp_step] = saturate_cast<T>(cost_min);
sum += cost_min;
}
sum /= nr_plane;
for(int d = 0; d < nr_plane; d++)
msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);
msg_dst[d * disp_step] = saturate_cast<T>(temp[d * disp_step] - sum);
}
template <typename T>
__global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)
__global__ void compute_message(uchar *ctemp, T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i, int max_disc_term, float disc_single_jump, size_t msg_step, size_t disp_step)
{
int y = blockIdx.y * blockDim.y + threadIdx.y;
int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
const T* data = data_cost_selected + y * cmsg_step + x;
const T* data = data_cost_selected + y * msg_step + x;
T* u = u_ + y * cmsg_step + x;
T* d = d_ + y * cmsg_step + x;
T* l = l_ + y * cmsg_step + x;
T* r = r_ + y * cmsg_step + x;
T* u = u_ + y * msg_step + x;
T* d = d_ + y * msg_step + x;
T* l = l_ + y * msg_step + x;
T* r = r_ + y * msg_step + x;
const T* disp = selected_disp_pyr_cur + y * cmsg_step + x;
const T* disp = selected_disp_pyr_cur + y * msg_step + x;
T* temp = (T*)ctemp + y * cmsg_step + x;
T* temp = (T*)ctemp + y * msg_step + x;
message_per_pixel(data, u, r - 1, u + cmsg_step, l + 1, disp, disp - cmsg_step, nr_plane, temp);
message_per_pixel(data, d, d - cmsg_step, r - 1, l + 1, disp, disp + cmsg_step, nr_plane, temp);
message_per_pixel(data, l, u + cmsg_step, d - cmsg_step, l + 1, disp, disp - 1, nr_plane, temp);
message_per_pixel(data, r, u + cmsg_step, d - cmsg_step, r - 1, disp, disp + 1, nr_plane, temp);
message_per_pixel(data, u, r - 1, u + msg_step, l + 1, disp, disp - msg_step, nr_plane, max_disc_term, disc_single_jump, temp, disp_step);
message_per_pixel(data, d, d - msg_step, r - 1, l + 1, disp, disp + msg_step, nr_plane, max_disc_term, disc_single_jump, temp, disp_step);
message_per_pixel(data, l, u + msg_step, d - msg_step, l + 1, disp, disp - 1, nr_plane, max_disc_term, disc_single_jump, temp, disp_step);
message_per_pixel(data, r, u + msg_step, d - msg_step, r - 1, disp, disp + 1, nr_plane, max_disc_term, disc_single_jump, temp, disp_step);
}
}
template<class T>
void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
void calc_all_iterations(uchar *ctemp, T* u, T* d, T* l, T* r, const T* data_cost_selected,
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, int max_disc_term, float disc_single_jump, cudaStream_t stream)
{
size_t disp_step = msg_step * h;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@ -778,18 +730,18 @@ namespace cv { namespace cuda { namespace device
for(int t = 0; t < iters; ++t)
{
compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
compute_message<<<grid, threads, 0, stream>>>(ctemp, u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1, max_disc_term, disc_single_jump, msg_step, disp_step);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
};
template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, cudaStream_t stream);
template void calc_all_iterations(uchar *ctemp, short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, int max_disc_term, float disc_single_jump, cudaStream_t stream);
template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, cudaStream_t stream);
template void calc_all_iterations(uchar *ctemp, float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, int max_disc_term, float disc_single_jump, cudaStream_t stream);
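Editorial sketch: the practical gain from passing msg_step/disp_step, max_disc_term and disc_single_jump as arguments is that calls on different CUDA streams no longer have to agree on a shared __constant__ value. A small hypothetical example of two concurrent launches with different steps (names are illustrative, not from stereocsbp.cu):

#include <cuda_runtime.h>

__global__ void relax(float* msg, size_t disp_step, int nr_plane)
{
    int d = blockIdx.x * blockDim.x + threadIdx.x;
    if (d < nr_plane)
        msg[d * disp_step] *= 0.5f;   // stand-in for one message update
}

int main()
{
    float *bufA = 0, *bufB = 0;
    cudaMalloc(&bufA, 1024 * sizeof(float));
    cudaMalloc(&bufB, 2048 * sizeof(float));

    cudaStream_t sA, sB;
    cudaStreamCreate(&sA);
    cudaStreamCreate(&sB);

    // Each stream carries its own disp_step; no cudaMemcpyToSymbol and no
    // implicit serialization between the two calls.
    relax<<<1, 64, 0, sA>>>(bufA, /*disp_step=*/128, /*nr_plane=*/4);
    relax<<<1, 64, 0, sB>>>(bufB, /*disp_step=*/256, /*nr_plane=*/8);

    cudaStreamSynchronize(sA);
    cudaStreamSynchronize(sB);
    cudaStreamDestroy(sA);
    cudaStreamDestroy(sB);
    cudaFree(bufA);
    cudaFree(bufB);
    return 0;
}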
///////////////////////////////////////////////////////////////
@ -800,26 +752,26 @@ namespace cv { namespace cuda { namespace device
template <typename T>
__global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,
const T* data_cost_selected, const T* disp_selected_pyr,
PtrStepSz<short> disp, int nr_plane)
PtrStepSz<short> disp, int nr_plane, size_t msg_step, size_t disp_step)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
{
const T* data = data_cost_selected + y * cmsg_step + x;
const T* disp_selected = disp_selected_pyr + y * cmsg_step + x;
const T* data = data_cost_selected + y * msg_step + x;
const T* disp_selected = disp_selected_pyr + y * msg_step + x;
const T* u = u_ + (y+1) * cmsg_step + (x+0);
const T* d = d_ + (y-1) * cmsg_step + (x+0);
const T* l = l_ + (y+0) * cmsg_step + (x+1);
const T* r = r_ + (y+0) * cmsg_step + (x-1);
const T* u = u_ + (y+1) * msg_step + (x+0);
const T* d = d_ + (y-1) * msg_step + (x+0);
const T* l = l_ + (y+0) * msg_step + (x+1);
const T* r = r_ + (y+0) * msg_step + (x-1);
int best = 0;
T best_val = numeric_limits<T>::max();
for (int i = 0; i < nr_plane; ++i)
{
int idx = i * cdisp_step1;
int idx = i * disp_step;
T val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];
if (val < best_val)
@ -837,8 +789,6 @@ namespace cv { namespace cuda { namespace device
const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream)
{
size_t disp_step = disp.rows * msg_step;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@ -846,7 +796,7 @@ namespace cv { namespace cuda { namespace device
grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y);
compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected, disp, nr_plane);
compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected, disp, nr_plane, msg_step, disp_step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)

@ -0,0 +1,29 @@
namespace cv { namespace cuda { namespace device
{
namespace stereocsbp
{
template<class T>
void init_data_cost(const uchar *left, const uchar *right, uchar *ctemp, size_t cimg_step, int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, float data_weight, float max_data_term, int min_disp, bool use_local_init_data_cost, cudaStream_t stream);
template<class T>
void compute_data_cost(const uchar *left, const uchar *right, size_t cimg_step, const T* disp_selected_pyr, T* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, float data_weight, float max_data_term,
int min_disp, cudaStream_t stream);
template<class T>
void init_message(uchar *ctemp, T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
T* data_cost_selected, const T* data_cost, size_t msg_step,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
template<class T>
void calc_all_iterations(uchar *ctemp, T* u, T* d, T* l, T* r, const T* data_cost_selected,
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, int max_disc_term, float disc_single_jump, cudaStream_t stream);
template<class T>
void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
}
}}}

@ -51,16 +51,7 @@ Ptr<cuda::DisparityBilateralFilter> cv::cuda::createDisparityBilateralFilter(int
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace disp_bilateral_filter
{
void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc);
template<typename T>
void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
}
}}}
#include "cuda/disparity_bilateral_filter.hpp"
namespace
{
@ -165,7 +156,7 @@ namespace
const short edge_disc = std::max<short>(short(1), short(ndisp * edge_threshold + 0.5));
const short max_disc = short(ndisp * max_disc_threshold + 0.5);
disp_load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
size_t table_space_step = table_space.step / sizeof(float);
_dst.create(disp.size(), disp.type());
GpuMat dst = _dst.getGpuMat();
@ -173,7 +164,7 @@ namespace
if (dst.data != disp.data)
disp.copyTo(dst, stream);
disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
disp_bilateral_filter<T>(dst, img, img.channels(), iters, table_color.ptr<float>(), (float *)table_space.data, table_space_step, radius, edge_disc, max_disc, StreamAccessor::getStream(stream));
}
void DispBilateralFilterImpl::apply(InputArray _disp, InputArray _image, OutputArray dst, Stream& stream)

@ -53,37 +53,7 @@ Ptr<cuda::StereoConstantSpaceBP> cv::cuda::createStereoConstantSpaceBP(int, int,
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace stereocsbp
{
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp);
template<class T>
void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
template<class T>
void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
template<class T>
void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
T* data_cost_selected, const T* data_cost, size_t msg_step,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
template<class T>
void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream);
template<class T>
void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
}
}}}
#include "cuda/stereocsbp.hpp"
namespace
{
@ -252,8 +222,6 @@ namespace
////////////////////////////////////////////////////////////////////////////
// Compute
load_constants(ndisp_, max_data_term_, data_weight_, max_disc_term_, disc_single_jump_, min_disp_th_, left, right, temp_);
l[0].setTo(0, _stream);
d[0].setTo(0, _stream);
r[0].setTo(0, _stream);
@ -275,17 +243,18 @@ namespace
{
if (i == levels_ - 1)
{
init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<float>(), data_cost_selected.ptr<float>(),
elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), use_local_init_data_cost_, stream);
init_data_cost(left.ptr<uchar>(), right.ptr<uchar>(), temp_.ptr<uchar>(), left.step, left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<float>(), data_cost_selected.ptr<float>(),
elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), data_weight_, max_data_term_, min_disp_th_, use_local_init_data_cost_, stream);
}
else
{
compute_data_cost(disp_selected_pyr[cur_idx].ptr<float>(), data_cost.ptr<float>(), elem_step,
left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);
compute_data_cost(left.ptr<uchar>(), right.ptr<uchar>(), left.step, disp_selected_pyr[cur_idx].ptr<float>(), data_cost.ptr<float>(), elem_step,
left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), data_weight_, max_data_term_, min_disp_th_, stream);
int new_idx = (cur_idx + 1) & 1;
init_message(u[new_idx].ptr<float>(), d[new_idx].ptr<float>(), l[new_idx].ptr<float>(), r[new_idx].ptr<float>(),
init_message(temp_.ptr<uchar>(),
u[new_idx].ptr<float>(), d[new_idx].ptr<float>(), l[new_idx].ptr<float>(), r[new_idx].ptr<float>(),
u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
disp_selected_pyr[new_idx].ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(),
data_cost_selected.ptr<float>(), data_cost.ptr<float>(), elem_step, rows_pyr[i],
@ -294,9 +263,9 @@ namespace
cur_idx = new_idx;
}
calc_all_iterations(u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
calc_all_iterations(temp_.ptr<uchar>(), u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
data_cost_selected.ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(), elem_step,
rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, stream);
rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, max_disc_term_, disc_single_jump_, stream);
}
}
else
@ -305,17 +274,18 @@ namespace
{
if (i == levels_ - 1)
{
init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<short>(), data_cost_selected.ptr<short>(),
elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), use_local_init_data_cost_, stream);
init_data_cost(left.ptr<uchar>(), right.ptr<uchar>(), temp_.ptr<uchar>(), left.step, left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<short>(), data_cost_selected.ptr<short>(),
elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), data_weight_, max_data_term_, min_disp_th_, use_local_init_data_cost_, stream);
}
else
{
compute_data_cost(disp_selected_pyr[cur_idx].ptr<short>(), data_cost.ptr<short>(), elem_step,
left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);
compute_data_cost(left.ptr<uchar>(), right.ptr<uchar>(), left.step, disp_selected_pyr[cur_idx].ptr<short>(), data_cost.ptr<short>(), elem_step,
left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), data_weight_, max_data_term_, min_disp_th_, stream);
int new_idx = (cur_idx + 1) & 1;
init_message(u[new_idx].ptr<short>(), d[new_idx].ptr<short>(), l[new_idx].ptr<short>(), r[new_idx].ptr<short>(),
init_message(temp_.ptr<uchar>(),
u[new_idx].ptr<short>(), d[new_idx].ptr<short>(), l[new_idx].ptr<short>(), r[new_idx].ptr<short>(),
u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
disp_selected_pyr[new_idx].ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(),
data_cost_selected.ptr<short>(), data_cost.ptr<short>(), elem_step, rows_pyr[i],
@ -324,9 +294,9 @@ namespace
cur_idx = new_idx;
}
calc_all_iterations(u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
calc_all_iterations(temp_.ptr<uchar>(), u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
data_cost_selected.ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(), elem_step,
rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, stream);
rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, max_disc_term_, disc_single_jump_, stream);
}
}

@ -1,4 +1,4 @@
set(test_deps opencv_cudev opencv_core opencv_imgproc opencv_highgui opencv_ts ${OPENCV_MODULE_opencv_ts_DEPS})
set(test_deps opencv_cudev opencv_core opencv_imgproc opencv_imgcodecs opencv_videoio opencv_highgui opencv_ts ${OPENCV_MODULE_opencv_ts_DEPS})
ocv_check_dependencies(${test_deps})

@ -31,7 +31,7 @@ Detects corners using the FAST algorithm
Detects corners using the FAST algorithm by [Rosten06]_.
..note:: In Python API, types are given as ``cv2.FAST_FEATURE_DETECTOR_TYPE_5_8``, ``cv2.FAST_FEATURE_DETECTOR_TYPE_7_12`` and ``cv2.FAST_FEATURE_DETECTOR_TYPE_9_16``. For corner detection, use ``cv2.FAST.detect()`` method.
.. note:: In Python API, types are given as ``cv2.FAST_FEATURE_DETECTOR_TYPE_5_8``, ``cv2.FAST_FEATURE_DETECTOR_TYPE_7_12`` and ``cv2.FAST_FEATURE_DETECTOR_TYPE_9_16``. For corner detection, use ``cv2.FAST.detect()`` method.
.. [Rosten06] E. Rosten. Machine Learning for High-speed Corner Detection, 2006.
@ -254,7 +254,17 @@ KAZE
----
.. ocv:class:: KAZE : public Feature2D
Class implementing the KAZE keypoint detector and descriptor extractor, described in [ABD12]_.
Class implementing the KAZE keypoint detector and descriptor extractor, described in [ABD12]_. ::
class CV_EXPORTS_W KAZE : public Feature2D
{
public:
CV_WRAP KAZE();
CV_WRAP explicit KAZE(bool extended, bool upright, float threshold = 0.001f,
int octaves = 4, int sublevels = 4, int diffusivity = DIFF_PM_G2);
};
.. note:: AKAZE descriptor can only be used with KAZE or AKAZE keypoints
.. [ABD12] KAZE Features. Pablo F. Alcantarilla, Adrien Bartoli and Andrew J. Davison. In European Conference on Computer Vision (ECCV), Fiorenze, Italy, October 2012.
@ -262,12 +272,14 @@ KAZE::KAZE
----------
The KAZE constructor
.. ocv:function:: KAZE::KAZE(bool extended, bool upright)
.. ocv:function:: KAZE::KAZE(bool extended, bool upright, float threshold, int octaves, int sublevels, int diffusivity)
:param extended: Set to enable extraction of extended (128-byte) descriptor.
:param upright: Set to enable use of upright descriptors (non rotation-invariant).
:param threshold: Detector response threshold to accept point
:param octaves: Maximum octave evolution of the image
:param sublevels: Default number of sublevels per scale level
:param diffusivity: Diffusivity type. DIFF_PM_G1, DIFF_PM_G2, DIFF_WEICKERT or DIFF_CHARBONNIER
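Editorial sketch: how the extended KAZE constructor documented above would be used. This assumes the detect/compute interface KAZE inherits from Feature2D on this branch; the image file name is a placeholder.

#include <opencv2/features2d.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    cv::Mat img = cv::imread("graf1.png", cv::IMREAD_GRAYSCALE);

    // extended=false, upright=false, plus the new detector parameters.
    cv::KAZE kaze(false, false, 0.001f, 4, 4, cv::DIFF_PM_G2);

    std::vector<cv::KeyPoint> kpts;
    cv::Mat desc;
    kaze.detect(img, kpts);          // keypoints from the nonlinear scale space
    kaze.compute(img, kpts, desc);   // 64-float (128 if extended) descriptors
    return 0;
}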
AKAZE
-----
@ -278,25 +290,25 @@ Class implementing the AKAZE keypoint detector and descriptor extractor, describ
class CV_EXPORTS_W AKAZE : public Feature2D
{
public:
/// AKAZE Descriptor Type
enum DESCRIPTOR_TYPE {
DESCRIPTOR_KAZE_UPRIGHT = 2, ///< Upright descriptors, not invariant to rotation
DESCRIPTOR_KAZE = 3,
DESCRIPTOR_MLDB_UPRIGHT = 4, ///< Upright descriptors, not invariant to rotation
DESCRIPTOR_MLDB = 5
};
CV_WRAP AKAZE();
explicit AKAZE(DESCRIPTOR_TYPE descriptor_type, int descriptor_size = 0, int descriptor_channels = 3);
CV_WRAP explicit AKAZE(int descriptor_type, int descriptor_size = 0, int descriptor_channels = 3,
float threshold = 0.001f, int octaves = 4, int sublevels = 4, int diffusivity = DIFF_PM_G2);
};
.. note:: AKAZE descriptor can only be used with KAZE or AKAZE keypoints
.. [ANB13] Fast Explicit Diffusion for Accelerated Features in Nonlinear Scale Spaces. Pablo F. Alcantarilla, Jesús Nuevo and Adrien Bartoli. In British Machine Vision Conference (BMVC), Bristol, UK, September 2013.
AKAZE::AKAZE
------------
The AKAZE constructor
.. ocv:function:: AKAZE::AKAZE(DESCRIPTOR_TYPE descriptor_type, int descriptor_size = 0, int descriptor_channels = 3)
.. ocv:function:: AKAZE::AKAZE(int descriptor_type, int descriptor_size, int descriptor_channels, float threshold, int octaves, int sublevels, int diffusivity)
:param descriptor_type: Type of the extracted descriptor.
:param descriptor_type: Type of the extracted descriptor: DESCRIPTOR_KAZE, DESCRIPTOR_KAZE_UPRIGHT, DESCRIPTOR_MLDB or DESCRIPTOR_MLDB_UPRIGHT.
:param descriptor_size: Size of the descriptor in bits. 0 -> Full size
:param descriptor_channels: Number of channels in the descriptor (1, 2, 3).
:param descriptor_channels: Number of channels in the descriptor (1, 2, 3)
:param threshold: Detector response threshold to accept point
:param octaves: Maximum octave evolution of the image
:param sublevels: Default number of sublevels per scale level
:param diffusivity: Diffusivity type. DIFF_PM_G1, DIFF_PM_G2, DIFF_WEICKERT or DIFF_CHARBONNIER
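Editorial sketch: the AKAZE constructor above with the descriptor and diffusivity enums now at cv:: namespace scope, followed by brute-force matching. NORM_HAMMING is consistent with the defaultNorm() this patch returns for MLDB descriptors; the file names and the detect/compute calls are assumptions, not taken from the patch.

#include <opencv2/features2d.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    cv::Mat img1 = cv::imread("graf1.png", cv::IMREAD_GRAYSCALE);
    cv::Mat img2 = cv::imread("graf3.png", cv::IMREAD_GRAYSCALE);

    cv::AKAZE akaze(cv::DESCRIPTOR_MLDB, 0, 3, 0.001f, 4, 4, cv::DIFF_PM_G2);

    std::vector<cv::KeyPoint> k1, k2;
    cv::Mat d1, d2;
    akaze.detect(img1, k1);  akaze.compute(img1, k1, d1);
    akaze.detect(img2, k2);  akaze.compute(img2, k2, d2);

    // MLDB descriptors are binary, so Hamming distance is the appropriate metric.
    cv::BFMatcher matcher(cv::NORM_HAMMING);
    std::vector<cv::DMatch> matches;
    matcher.match(d1, d2, matches);
    return 0;
}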

@ -895,6 +895,22 @@ protected:
PixelTestFn test_fn_;
};
// KAZE/AKAZE diffusivity
enum {
DIFF_PM_G1 = 0,
DIFF_PM_G2 = 1,
DIFF_WEICKERT = 2,
DIFF_CHARBONNIER = 3
};
// AKAZE descriptor type
enum {
DESCRIPTOR_KAZE_UPRIGHT = 2, ///< Upright descriptors, not invariant to rotation
DESCRIPTOR_KAZE = 3,
DESCRIPTOR_MLDB_UPRIGHT = 4, ///< Upright descriptors, not invariant to rotation
DESCRIPTOR_MLDB = 5
};
/*!
KAZE implementation
*/
@ -902,7 +918,8 @@ class CV_EXPORTS_W KAZE : public Feature2D
{
public:
CV_WRAP KAZE();
CV_WRAP explicit KAZE(bool extended, bool upright);
CV_WRAP explicit KAZE(bool extended, bool upright, float threshold = 0.001f,
int octaves = 4, int sublevels = 4, int diffusivity = DIFF_PM_G2);
virtual ~KAZE();
@ -928,6 +945,10 @@ protected:
CV_PROP bool extended;
CV_PROP bool upright;
CV_PROP float threshold;
CV_PROP int octaves;
CV_PROP int sublevels;
CV_PROP int diffusivity;
};
/*!
@ -936,16 +957,9 @@ AKAZE implementation
class CV_EXPORTS_W AKAZE : public Feature2D
{
public:
/// AKAZE Descriptor Type
enum DESCRIPTOR_TYPE {
DESCRIPTOR_KAZE_UPRIGHT = 2, ///< Upright descriptors, not invariant to rotation
DESCRIPTOR_KAZE = 3,
DESCRIPTOR_MLDB_UPRIGHT = 4, ///< Upright descriptors, not invariant to rotation
DESCRIPTOR_MLDB = 5
};
CV_WRAP AKAZE();
explicit AKAZE(DESCRIPTOR_TYPE descriptor_type, int descriptor_size = 0, int descriptor_channels = 3);
CV_WRAP explicit AKAZE(int descriptor_type, int descriptor_size = 0, int descriptor_channels = 3,
float threshold = 0.001f, int octaves = 4, int sublevels = 4, int diffusivity = DIFF_PM_G2);
virtual ~AKAZE();
@ -973,7 +987,10 @@ protected:
CV_PROP int descriptor;
CV_PROP int descriptor_channels;
CV_PROP int descriptor_size;
CV_PROP float threshold;
CV_PROP int octaves;
CV_PROP int sublevels;
CV_PROP int diffusivity;
};
/****************************************************************************************\
* Distance *

@ -10,7 +10,7 @@
#define __OPENCV_PERF_PRECOMP_HPP__
#include "opencv2/ts.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/features2d.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY

@ -49,7 +49,10 @@ http://www.robesafe.com/personal/pablo.alcantarilla/papers/Alcantarilla13bmvc.pd
*/
#include "precomp.hpp"
#include "akaze/AKAZEFeatures.h"
#include "kaze/AKAZEFeatures.h"
#include <iostream>
using namespace std;
namespace cv
{
@ -57,13 +60,22 @@ namespace cv
: descriptor(DESCRIPTOR_MLDB)
, descriptor_channels(3)
, descriptor_size(0)
, threshold(0.001f)
, octaves(4)
, sublevels(4)
, diffusivity(DIFF_PM_G2)
{
}
AKAZE::AKAZE(DESCRIPTOR_TYPE _descriptor_type, int _descriptor_size, int _descriptor_channels)
AKAZE::AKAZE(int _descriptor_type, int _descriptor_size, int _descriptor_channels,
float _threshold, int _octaves, int _sublevels, int _diffusivity)
: descriptor(_descriptor_type)
, descriptor_channels(_descriptor_channels)
, descriptor_size(_descriptor_size)
, threshold(_threshold)
, octaves(_octaves)
, sublevels(_sublevels)
, diffusivity(_diffusivity)
{
}
@ -78,12 +90,12 @@ namespace cv
{
switch (descriptor)
{
case cv::AKAZE::DESCRIPTOR_KAZE:
case cv::AKAZE::DESCRIPTOR_KAZE_UPRIGHT:
case cv::DESCRIPTOR_KAZE:
case cv::DESCRIPTOR_KAZE_UPRIGHT:
return 64;
case cv::AKAZE::DESCRIPTOR_MLDB:
case cv::AKAZE::DESCRIPTOR_MLDB_UPRIGHT:
case cv::DESCRIPTOR_MLDB:
case cv::DESCRIPTOR_MLDB_UPRIGHT:
// We use the full length binary descriptor -> 486 bits
if (descriptor_size == 0)
{
@ -106,12 +118,12 @@ namespace cv
{
switch (descriptor)
{
case cv::AKAZE::DESCRIPTOR_KAZE:
case cv::AKAZE::DESCRIPTOR_KAZE_UPRIGHT:
case cv::DESCRIPTOR_KAZE:
case cv::DESCRIPTOR_KAZE_UPRIGHT:
return CV_32F;
case cv::AKAZE::DESCRIPTOR_MLDB:
case cv::AKAZE::DESCRIPTOR_MLDB_UPRIGHT:
case cv::DESCRIPTOR_MLDB:
case cv::DESCRIPTOR_MLDB_UPRIGHT:
return CV_8U;
default:
@ -124,12 +136,12 @@ namespace cv
{
switch (descriptor)
{
case cv::AKAZE::DESCRIPTOR_KAZE:
case cv::AKAZE::DESCRIPTOR_KAZE_UPRIGHT:
case cv::DESCRIPTOR_KAZE:
case cv::DESCRIPTOR_KAZE_UPRIGHT:
return cv::NORM_L2;
case cv::AKAZE::DESCRIPTOR_MLDB:
case cv::AKAZE::DESCRIPTOR_MLDB_UPRIGHT:
case cv::DESCRIPTOR_MLDB:
case cv::DESCRIPTOR_MLDB_UPRIGHT:
return cv::NORM_HAMMING;
default:
@ -153,11 +165,15 @@ namespace cv
cv::Mat& desc = descriptors.getMatRef();
AKAZEOptions options;
options.descriptor = static_cast<DESCRIPTOR_TYPE>(descriptor);
options.descriptor = descriptor;
options.descriptor_channels = descriptor_channels;
options.descriptor_size = descriptor_size;
options.img_width = img.cols;
options.img_height = img.rows;
options.dthreshold = threshold;
options.omax = octaves;
options.nsublevels = sublevels;
options.diffusivity = diffusivity;
AKAZEFeatures impl(options);
impl.Create_Nonlinear_Scale_Space(img1_32);
@ -188,7 +204,7 @@ namespace cv
img.convertTo(img1_32, CV_32F, 1.0 / 255.0, 0);
AKAZEOptions options;
options.descriptor = static_cast<DESCRIPTOR_TYPE>(descriptor);
options.descriptor = descriptor;
options.descriptor_channels = descriptor_channels;
options.descriptor_size = descriptor_size;
options.img_width = img.cols;
@ -216,7 +232,7 @@ namespace cv
cv::Mat& desc = descriptors.getMatRef();
AKAZEOptions options;
options.descriptor = static_cast<DESCRIPTOR_TYPE>(descriptor);
options.descriptor = descriptor;
options.descriptor_channels = descriptor_channels;
options.descriptor_size = descriptor_size;
options.img_width = img.cols;
@ -229,4 +245,4 @@ namespace cv
CV_Assert((!desc.rows || desc.cols == descriptorSize()));
CV_Assert((!desc.rows || (desc.type() == descriptorType())));
}
}
}

File diff suppressed because it is too large

@ -1,65 +0,0 @@
/**
* @file AKAZE.h
* @brief Main class for detecting and computing binary descriptors in an
* accelerated nonlinear scale space
* @date Mar 27, 2013
* @author Pablo F. Alcantarilla, Jesus Nuevo
*/
#pragma once
/* ************************************************************************* */
// Includes
#include "precomp.hpp"
#include "AKAZEConfig.h"
/* ************************************************************************* */
// AKAZE Class Declaration
class AKAZEFeatures {
private:
AKAZEOptions options_; ///< Configuration options for AKAZE
std::vector<TEvolution> evolution_; ///< Vector of nonlinear diffusion evolution
/// FED parameters
int ncycles_; ///< Number of cycles
bool reordering_; ///< Flag for reordering time steps
std::vector<std::vector<float > > tsteps_; ///< Vector of FED dynamic time steps
std::vector<int> nsteps_; ///< Vector of number of steps per cycle
/// Matrices for the M-LDB descriptor computation
cv::Mat descriptorSamples_; // List of positions in the grids to sample LDB bits from.
cv::Mat descriptorBits_;
cv::Mat bitMask_;
public:
/// Constructor with input arguments
AKAZEFeatures(const AKAZEOptions& options);
/// Scale Space methods
void Allocate_Memory_Evolution();
int Create_Nonlinear_Scale_Space(const cv::Mat& img);
void Feature_Detection(std::vector<cv::KeyPoint>& kpts);
void Compute_Determinant_Hessian_Response(void);
void Compute_Multiscale_Derivatives(void);
void Find_Scale_Space_Extrema(std::vector<cv::KeyPoint>& kpts);
void Do_Subpixel_Refinement(std::vector<cv::KeyPoint>& kpts);
// Feature description methods
void Compute_Descriptors(std::vector<cv::KeyPoint>& kpts, cv::Mat& desc);
static void Compute_Main_Orientation(cv::KeyPoint& kpt, const std::vector<TEvolution>& evolution_);
};
/* ************************************************************************* */
// Inline functions
// Inline functions
void generateDescriptorSubsample(cv::Mat& sampleList, cv::Mat& comparisons,
int nbits, int pattern_size, int nchannels);
float get_angle(float x, float y);
float gaussian(float x, float y, float sigma);
void check_descriptor_limits(int& x, int& y, int width, int height);
int fRound(float flt);

@ -55,12 +55,21 @@ namespace cv
KAZE::KAZE()
: extended(false)
, upright(false)
, threshold(0.001f)
, octaves(4)
, sublevels(4)
, diffusivity(DIFF_PM_G2)
{
}
KAZE::KAZE(bool _extended, bool _upright)
KAZE::KAZE(bool _extended, bool _upright, float _threshold, int _octaves,
int _sublevels, int _diffusivity)
: extended(_extended)
, upright(_upright)
, threshold(_threshold)
, octaves(_octaves)
, sublevels(_sublevels)
, diffusivity(_diffusivity)
{
}
@ -111,6 +120,10 @@ namespace cv
options.img_height = img.rows;
options.extended = extended;
options.upright = upright;
options.dthreshold = threshold;
options.omax = octaves;
options.nsublevels = sublevels;
options.diffusivity = diffusivity;
KAZEFeatures impl(options);
impl.Create_Nonlinear_Scale_Space(img1_32);
@ -180,4 +193,4 @@ namespace cv
CV_Assert((!desc.rows || desc.cols == descriptorSize()));
CV_Assert((!desc.rows || (desc.type() == descriptorType())));
}
}
}

@ -5,7 +5,8 @@
* @author Pablo F. Alcantarilla, Jesus Nuevo
*/
#pragma once
#ifndef __OPENCV_FEATURES_2D_AKAZE_CONFIG_H__
#define __OPENCV_FEATURES_2D_AKAZE_CONFIG_H__
/* ************************************************************************* */
// OpenCV
@ -28,14 +29,6 @@ const float gauss25[7][7] = {
/// AKAZE configuration options structure
struct AKAZEOptions {
/// AKAZE Diffusivities
enum DIFFUSIVITY_TYPE {
PM_G1 = 0,
PM_G2 = 1,
WEICKERT = 2,
CHARBONNIER = 3
};
AKAZEOptions()
: omax(4)
, nsublevels(4)
@ -44,12 +37,12 @@ struct AKAZEOptions {
, soffset(1.6f)
, derivative_factor(1.5f)
, sderivatives(1.0)
, diffusivity(PM_G2)
, diffusivity(cv::DIFF_PM_G2)
, dthreshold(0.001f)
, min_dthreshold(0.00001f)
, descriptor(cv::AKAZE::DESCRIPTOR_MLDB)
, descriptor(cv::DESCRIPTOR_MLDB)
, descriptor_size(0)
, descriptor_channels(3)
, descriptor_pattern_size(10)
@ -67,12 +60,12 @@ struct AKAZEOptions {
float soffset; ///< Base scale offset (sigma units)
float derivative_factor; ///< Factor for the multiscale derivatives
float sderivatives; ///< Smoothing factor for the derivatives
DIFFUSIVITY_TYPE diffusivity; ///< Diffusivity type
int diffusivity; ///< Diffusivity type
float dthreshold; ///< Detector response threshold to accept point
float min_dthreshold; ///< Minimum detector threshold to accept a point
cv::AKAZE::DESCRIPTOR_TYPE descriptor; ///< Type of descriptor
int descriptor; ///< Type of descriptor
int descriptor_size; ///< Size of the descriptor in bits. 0->Full size
int descriptor_channels; ///< Number of channels in the descriptor (1, 2, 3)
int descriptor_pattern_size; ///< Actual patch size is 2*pattern_size*point.scale
@ -82,28 +75,4 @@ struct AKAZEOptions {
int kcontrast_nbins; ///< Number of bins for the contrast factor histogram
};
/* ************************************************************************* */
/// AKAZE nonlinear diffusion filtering evolution
struct TEvolution {
TEvolution() {
etime = 0.0f;
esigma = 0.0f;
octave = 0;
sublevel = 0;
sigma_size = 0;
}
cv::Mat Lx, Ly; // First order spatial derivatives
cv::Mat Lxx, Lxy, Lyy; // Second order spatial derivatives
cv::Mat Lflow; // Diffusivity image
cv::Mat Lt; // Evolution image
cv::Mat Lsmooth; // Smoothed image
cv::Mat Lstep; // Evolution step update
cv::Mat Ldet; // Detector response
float etime; // Evolution time
float esigma; // Evolution sigma. For linear diffusion t = sigma^2 / 2
size_t octave; // Image octave
size_t sublevel; // Image sublevel in each octave
size_t sigma_size; // Integer sigma. For computing the feature detector responses
};
#endif
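Note: AKAZEOptions above is the module-internal configuration struct the cv::AKAZE wrapper fills in (see the akaze.cpp hunk earlier); with this change its diffusivity and descriptor fields become plain ints holding the cv:: enum values shown in the diff. A hedged sketch of populating it directly, for code living inside the features2d module (the image size is a placeholder):

    static AKAZEOptions makeAkazeOptions(int width, int height)
    {
        AKAZEOptions options;                                // defaults as listed in the constructor above
        options.img_width           = width;
        options.img_height          = height;
        options.descriptor          = cv::DESCRIPTOR_MLDB;   // value shown in this diff
        options.descriptor_size     = 0;                     // 0 -> full-size descriptor
        options.descriptor_channels = 3;
        options.dthreshold          = 0.001f;
        options.diffusivity         = cv::DIFF_PM_G2;
        return options;
    }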

File diff suppressed because it is too large.

@ -0,0 +1,62 @@
/**
* @file AKAZE.h
* @brief Main class for detecting and computing binary descriptors in an
* accelerated nonlinear scale space
* @date Mar 27, 2013
* @author Pablo F. Alcantarilla, Jesus Nuevo
*/
#ifndef __OPENCV_FEATURES_2D_AKAZE_FEATURES_H__
#define __OPENCV_FEATURES_2D_AKAZE_FEATURES_H__
/* ************************************************************************* */
// Includes
#include "precomp.hpp"
#include "AKAZEConfig.h"
#include "TEvolution.h"
/* ************************************************************************* */
// AKAZE Class Declaration
class AKAZEFeatures {
private:
AKAZEOptions options_; ///< Configuration options for AKAZE
std::vector<TEvolution> evolution_; ///< Vector of nonlinear diffusion evolution
/// FED parameters
int ncycles_; ///< Number of cycles
bool reordering_; ///< Flag for reordering time steps
std::vector<std::vector<float > > tsteps_; ///< Vector of FED dynamic time steps
std::vector<int> nsteps_; ///< Vector of number of steps per cycle
/// Matrices for the M-LDB descriptor computation
cv::Mat descriptorSamples_; // List of positions in the grids to sample LDB bits from.
cv::Mat descriptorBits_;
cv::Mat bitMask_;
public:
/// Constructor with input arguments
AKAZEFeatures(const AKAZEOptions& options);
/// Scale Space methods
void Allocate_Memory_Evolution();
int Create_Nonlinear_Scale_Space(const cv::Mat& img);
void Feature_Detection(std::vector<cv::KeyPoint>& kpts);
void Compute_Determinant_Hessian_Response(void);
void Compute_Multiscale_Derivatives(void);
void Find_Scale_Space_Extrema(std::vector<cv::KeyPoint>& kpts);
void Do_Subpixel_Refinement(std::vector<cv::KeyPoint>& kpts);
/// Feature description methods
void Compute_Descriptors(std::vector<cv::KeyPoint>& kpts, cv::Mat& desc);
static void Compute_Main_Orientation(cv::KeyPoint& kpt, const std::vector<TEvolution>& evolution_);
};
/* ************************************************************************* */
/// Inline functions
void generateDescriptorSubsample(cv::Mat& sampleList, cv::Mat& comparisons,
int nbits, int pattern_size, int nchannels);
#endif
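Note: the class declaration above spells out the internal pipeline: build the nonlinear scale space, detect, then describe. A sketch of the call sequence the wrapper is expected to follow, modelled on the KAZEFeatures usage visible earlier in this diff; the CV_32F conversion step and the helper name are assumptions for illustration.

    static void runAkazeInternal(const cv::Mat& img)   // CV_8UC1 input, hypothetical
    {
        cv::Mat img_32;
        img.convertTo(img_32, CV_32F, 1.0 / 255.0);

        AKAZEOptions options;                           // see AKAZEConfig.h above
        options.img_width  = img.cols;
        options.img_height = img.rows;

        AKAZEFeatures impl(options);
        impl.Create_Nonlinear_Scale_Space(img_32);

        std::vector<cv::KeyPoint> kpts;
        impl.Feature_Detection(kpts);

        cv::Mat desc;
        impl.Compute_Descriptors(kpts, desc);
    }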

@ -5,7 +5,8 @@
* @author Pablo F. Alcantarilla
*/
#pragma once
#ifndef __OPENCV_FEATURES_2D_AKAZE_CONFIG_H__
#define __OPENCV_FEATURES_2D_AKAZE_CONFIG_H__
// OpenCV Includes
#include "precomp.hpp"
@ -15,14 +16,8 @@
struct KAZEOptions {
enum DIFFUSIVITY_TYPE {
PM_G1 = 0,
PM_G2 = 1,
WEICKERT = 2
};
KAZEOptions()
: diffusivity(PM_G2)
: diffusivity(cv::DIFF_PM_G2)
, soffset(1.60f)
, omax(4)
@ -33,20 +28,13 @@ struct KAZEOptions {
, dthreshold(0.001f)
, kcontrast(0.01f)
, kcontrast_percentille(0.7f)
, kcontrast_bins(300)
, use_fed(true)
, kcontrast_bins(300)
, upright(false)
, extended(false)
, use_clipping_normalilzation(false)
, clipping_normalization_ratio(1.6f)
, clipping_normalization_niter(5)
{
}
DIFFUSIVITY_TYPE diffusivity;
int diffusivity;
float soffset;
int omax;
int nsublevels;
@ -57,27 +45,8 @@ struct KAZEOptions {
float kcontrast;
float kcontrast_percentille;
int kcontrast_bins;
bool use_fed;
bool upright;
bool extended;
bool use_clipping_normalilzation;
float clipping_normalization_ratio;
int clipping_normalization_niter;
};
struct TEvolution {
cv::Mat Lx, Ly; // First order spatial derivatives
cv::Mat Lxx, Lxy, Lyy; // Second order spatial derivatives
cv::Mat Lflow; // Diffusivity image
cv::Mat Lt; // Evolution image
cv::Mat Lsmooth; // Smoothed image
cv::Mat Lstep; // Evolution step update
cv::Mat Ldet; // Detector response
float etime; // Evolution time
float esigma; // Evolution sigma. For linear diffusion t = sigma^2 / 2
float octave; // Image octave
float sublevel; // Image sublevel in each octave
int sigma_size; // Integer esigma. For computing the feature detector responses
};
#endif
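Note: with the FED flags and clipping-normalization fields dropped, KAZEOptions reduces to the scale-space and detector settings the wrapper actually copies in (see the kaze.cpp hunk earlier). A short module-internal sketch, with the helper name and CV_32F input being assumptions:

    static void runKazeInternal(const cv::Mat& img_32)   // CV_32F input, hypothetical
    {
        KAZEOptions options;                  // defaults as in the constructor above
        options.img_width   = img_32.cols;
        options.img_height  = img_32.rows;
        options.dthreshold  = 0.001f;
        options.omax        = 4;
        options.nsublevels  = 4;
        options.diffusivity = cv::DIFF_PM_G2;

        KAZEFeatures impl(options);           // takes a non-const reference (see KAZE_features.h below)
        impl.Create_Nonlinear_Scale_Space(img_32);
    }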

File diff suppressed because it is too large.

@ -7,84 +7,53 @@
* @author Pablo F. Alcantarilla
*/
#ifndef KAZE_H_
#define KAZE_H_
//*************************************************************************************
//*************************************************************************************
#ifndef __OPENCV_FEATURES_2D_KAZE_FEATURES_H__
#define __OPENCV_FEATURES_2D_KAZE_FEATURES_H__
/* ************************************************************************* */
// Includes
#include "KAZEConfig.h"
#include "nldiffusion_functions.h"
#include "fed.h"
#include "TEvolution.h"
//*************************************************************************************
//*************************************************************************************
/* ************************************************************************* */
// KAZE Class Declaration
class KAZEFeatures {
private:
KAZEOptions options;
// Parameters of the Nonlinear diffusion class
std::vector<TEvolution> evolution_; // Vector of nonlinear diffusion evolution
/// Parameters of the Nonlinear diffusion class
KAZEOptions options_; ///< Configuration options for KAZE
std::vector<TEvolution> evolution_; ///< Vector of nonlinear diffusion evolution
// Vector of keypoint vectors for finding extrema in multiple threads
/// Vector of keypoint vectors for finding extrema in multiple threads
std::vector<std::vector<cv::KeyPoint> > kpts_par_;
// FED parameters
int ncycles_; // Number of cycles
bool reordering_; // Flag for reordering time steps
std::vector<std::vector<float > > tsteps_; // Vector of FED dynamic time steps
std::vector<int> nsteps_; // Vector of number of steps per cycle
// Some auxiliary variables used in the AOS step
cv::Mat Ltx_, Lty_, px_, py_, ax_, ay_, bx_, by_, qr_, qc_;
/// FED parameters
int ncycles_; ///< Number of cycles
bool reordering_; ///< Flag for reordering time steps
std::vector<std::vector<float > > tsteps_; ///< Vector of FED dynamic time steps
std::vector<int> nsteps_; ///< Vector of number of steps per cycle
public:
// Constructor
/// Constructor
KAZEFeatures(KAZEOptions& options);
// Public methods for KAZE interface
/// Public methods for KAZE interface
void Allocate_Memory_Evolution(void);
int Create_Nonlinear_Scale_Space(const cv::Mat& img);
void Feature_Detection(std::vector<cv::KeyPoint>& kpts);
void Feature_Description(std::vector<cv::KeyPoint>& kpts, cv::Mat& desc);
static void Compute_Main_Orientation(cv::KeyPoint& kpt, const std::vector<TEvolution>& evolution_, const KAZEOptions& options);
private:
// Feature Detection Methods
/// Feature Detection Methods
void Compute_KContrast(const cv::Mat& img, const float& kper);
void Compute_Multiscale_Derivatives(void);
void Compute_Detector_Response(void);
void Determinant_Hessian_Parallel(std::vector<cv::KeyPoint>& kpts);
void Find_Extremum_Threading(const int& level);
void Determinant_Hessian(std::vector<cv::KeyPoint>& kpts);
void Do_Subpixel_Refinement(std::vector<cv::KeyPoint>& kpts);
// AOS Methods
void AOS_Step_Scalar(cv::Mat &Ld, const cv::Mat &Ldprev, const cv::Mat &c, const float& stepsize);
void AOS_Rows(const cv::Mat &Ldprev, const cv::Mat &c, const float& stepsize);
void AOS_Columns(const cv::Mat &Ldprev, const cv::Mat &c, const float& stepsize);
void Thomas(const cv::Mat &a, const cv::Mat &b, const cv::Mat &Ld, cv::Mat &x);
};
//*************************************************************************************
//*************************************************************************************
// Inline functions
float getAngle(const float& x, const float& y);
float gaussian(const float& x, const float& y, const float& sig);
void checkDescriptorLimits(int &x, int &y, const int& width, const int& height);
void clippingDescriptor(float *desc, const int& dsize, const int& niter, const float& ratio);
int fRound(const float& flt);
//*************************************************************************************
//*************************************************************************************
#endif // KAZE_H_
#endif
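Note: the class keeps both the FED bookkeeping and the semi-implicit solver entry points (AOS_Step_Scalar, AOS_Rows, AOS_Columns, Thomas). For reference, the standard additive-operator-splitting update from the nonlinear-diffusion literature (Weickert et al.), which these row/column methods and the Thomas tridiagonal solver appear to realize, is

    L^{k+1} \;=\; \frac{1}{m} \sum_{l=1}^{m} \bigl( I - m\,\tau\, A_l(L^{k}) \bigr)^{-1} L^{k}, \qquad m = 2 \text{ image axes},

where each factor (I - m \tau A_l) is tridiagonal along one axis, so the resulting linear systems can be solved with the Thomas algorithm declared above.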

@ -0,0 +1,35 @@
/**
* @file TEvolution.h
* @brief Header file with the declaration of the TEvolution struct
* @date Jun 02, 2014
* @author Pablo F. Alcantarilla
*/
#ifndef __OPENCV_FEATURES_2D_TEVOLUTION_H__
#define __OPENCV_FEATURES_2D_TEVOLUTION_H__
/* ************************************************************************* */
/// KAZE/A-KAZE nonlinear diffusion filtering evolution
struct TEvolution {
TEvolution() {
etime = 0.0f;
esigma = 0.0f;
octave = 0;
sublevel = 0;
sigma_size = 0;
}
cv::Mat Lx, Ly; ///< First order spatial derivatives
cv::Mat Lxx, Lxy, Lyy; ///< Second order spatial derivatives
cv::Mat Lt; ///< Evolution image
cv::Mat Lsmooth; ///< Smoothed image
cv::Mat Ldet; ///< Detector response
float etime; ///< Evolution time
float esigma; ///< Evolution sigma. For linear diffusion t = sigma^2 / 2
int octave; ///< Image octave
int sublevel; ///< Image sublevel in each octave
int sigma_size; ///< Integer esigma. For computing the feature detector responses
};
#endif
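Note: the comments above fix the meaning of the fields; in particular, for linear diffusion the evolution time and scale are tied by t = sigma^2 / 2. A tiny hedged sketch of filling one level (the helper name and the scale value are placeholders chosen by the caller):

    static TEvolution makeLevel(int octave, int sublevel, float sigma)
    {
        TEvolution step;
        step.octave     = octave;
        step.sublevel   = sublevel;
        step.esigma     = sigma;                     // scale in sigma units
        step.etime      = 0.5f * sigma * sigma;      // t = sigma^2 / 2, as noted above
        step.sigma_size = fRound(sigma);             // helper from utils.h further down
        return step;
    }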

@ -1,5 +1,5 @@
#ifndef FED_H
#define FED_H
#ifndef __OPENCV_FEATURES_2D_FED_H__
#define __OPENCV_FEATURES_2D_FED_H__
//******************************************************************************
//******************************************************************************
@ -22,4 +22,4 @@ bool fed_is_prime_internal(const int& number);
//*************************************************************************************
//*************************************************************************************
#endif // FED_H
#endif // __OPENCV_FEATURES_2D_FED_H__

@ -8,8 +8,8 @@
* @author Pablo F. Alcantarilla
*/
#ifndef KAZE_NLDIFFUSION_FUNCTIONS_H
#define KAZE_NLDIFFUSION_FUNCTIONS_H
#ifndef __OPENCV_FEATURES_2D_NLDIFFUSION_FUNCTIONS_H__
#define __OPENCV_FEATURES_2D_NLDIFFUSION_FUNCTIONS_H__
/* ************************************************************************* */
// Includes

@ -0,0 +1,77 @@
#ifndef __OPENCV_FEATURES_2D_KAZE_UTILS_H__
#define __OPENCV_FEATURES_2D_KAZE_UTILS_H__
/* ************************************************************************* */
/**
* @brief This function computes the angle from the vector given by (X Y). From 0 to 2*Pi
*/
inline float getAngle(float x, float y) {
if (x >= 0 && y >= 0) {
return atanf(y / x);
}
if (x < 0 && y >= 0) {
return static_cast<float>(CV_PI)-atanf(-y / x);
}
if (x < 0 && y < 0) {
return static_cast<float>(CV_PI)+atanf(y / x);
}
if (x >= 0 && y < 0) {
return static_cast<float>(2.0 * CV_PI) - atanf(-y / x);
}
return 0;
}
/* ************************************************************************* */
/**
* @brief This function computes the value of a 2D Gaussian function
* @param x X Position
* @param y Y Position
* @param sigma Standard deviation
*/
inline float gaussian(float x, float y, float sigma) {
return expf(-(x*x + y*y) / (2.0f*sigma*sigma));
}
/* ************************************************************************* */
/**
* @brief This function checks descriptor limits
* @param x X Position
* @param y Y Position
* @param width Image width
* @param height Image height
*/
inline void checkDescriptorLimits(int &x, int &y, int width, int height) {
if (x < 0) {
x = 0;
}
if (y < 0) {
y = 0;
}
if (x > width - 1) {
x = width - 1;
}
if (y > height - 1) {
y = height - 1;
}
}
/* ************************************************************************* */
/**
* @brief This function rounds a float to the nearest integer
* @param flt Input float
* @return dst Nearest integer
*/
inline int fRound(float flt) {
return (int)(flt + 0.5f);
}
#endif
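Note: a hypothetical standalone check of the helpers above (not part of this commit; it assumes the inline functions are visible, e.g. by including this header from module code). It illustrates that getAngle is the atan2 angle wrapped into [0, 2*pi), that gaussian is the unnormalized 2D Gaussian exp(-(x^2+y^2)/(2*sigma^2)), and that fRound (add 0.5 and truncate) only rounds correctly for non-negative inputs, which is how the KAZE/AKAZE code uses it.

    #include <cmath>
    #include <cstdio>

    static float wrappedAtan2(float x, float y) {
        float a = std::atan2(y, x);
        if (a < 0.0f) a += 6.2831853f;   // + 2*pi
        return a;
    }

    int main() {
        std::printf("getAngle(-0.5,-0.5) = %f vs %f\n",
                    getAngle(-0.5f, -0.5f), wrappedAtan2(-0.5f, -0.5f));       // both ~3.927 (5*pi/4)
        std::printf("gaussian(1,1,1)     = %f\n", gaussian(1.0f, 1.0f, 1.0f)); // exp(-1) ~ 0.368
        std::printf("fRound(2.6)         = %d\n", fRound(2.6f));               // 3
        return 0;
    }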

@ -40,7 +40,6 @@
//M*/
#include "test_precomp.hpp"
#include "opencv2/highgui.hpp"
using namespace std;
using namespace cv;
@ -102,8 +101,14 @@ public:
typedef typename Distance::ResultType DistanceType;
CV_DescriptorExtractorTest( const string _name, DistanceType _maxDist, const Ptr<DescriptorExtractor>& _dextractor,
Distance d = Distance() ):
name(_name), maxDist(_maxDist), dextractor(_dextractor), distance(d) {}
Distance d = Distance(), Ptr<FeatureDetector> _detector = Ptr<FeatureDetector>()):
name(_name), maxDist(_maxDist), dextractor(_dextractor), distance(d) , detector(_detector) {}
~CV_DescriptorExtractorTest()
{
if(!detector.empty())
detector.release();
}
protected:
virtual void createDescriptorExtractor() {}
@ -190,7 +195,6 @@ protected:
// Read the test image.
string imgFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + IMAGE_FILENAME;
Mat img = imread( imgFilename );
if( img.empty() )
{
@ -198,13 +202,15 @@ protected:
ts->set_failed_test_info( cvtest::TS::FAIL_INVALID_TEST_DATA );
return;
}
vector<KeyPoint> keypoints;
FileStorage fs( string(ts->get_data_path()) + FEATURES2D_DIR + "/keypoints.xml.gz", FileStorage::READ );
if( fs.isOpened() )
{
if(!detector.empty()) {
detector->detect(img, keypoints);
} else {
read( fs.getFirstTopLevelNode(), keypoints );
}
if(!keypoints.empty())
{
Mat calcDescriptors;
double t = (double)getTickCount();
dextractor->compute( img, keypoints, calcDescriptors );
@ -245,7 +251,7 @@ protected:
}
}
}
else
if(!fs.isOpened())
{
ts->printf( cvtest::TS::LOG, "Compute and write keypoints.\n" );
fs.open( string(ts->get_data_path()) + FEATURES2D_DIR + "/keypoints.xml.gz", FileStorage::WRITE );
@ -296,6 +302,7 @@ protected:
const DistanceType maxDist;
Ptr<DescriptorExtractor> dextractor;
Distance distance;
Ptr<FeatureDetector> detector;
private:
CV_DescriptorExtractorTest& operator=(const CV_DescriptorExtractorTest&) { return *this; }
@ -341,3 +348,19 @@ TEST( Features2d_DescriptorExtractor_OpponentBRIEF, regression )
DescriptorExtractor::create("OpponentBRIEF") );
test.safe_run();
}
TEST( Features2d_DescriptorExtractor_KAZE, regression )
{
CV_DescriptorExtractorTest< L2<float> > test( "descriptor-kaze", 0.03f,
DescriptorExtractor::create("KAZE"),
L2<float>(), FeatureDetector::create("KAZE"));
test.safe_run();
}
TEST( Features2d_DescriptorExtractor_AKAZE, regression )
{
CV_DescriptorExtractorTest<Hamming> test( "descriptor-akaze", (CV_DescriptorExtractorTest<Hamming>::DistanceType)12.f,
DescriptorExtractor::create("AKAZE"),
Hamming(), FeatureDetector::create("AKAZE"));
test.safe_run();
}

Some files were not shown because too many files have changed in this diff.
