diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index babefd3182..4f1453a2ff 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -1,37 +1,30 @@
-This is a template helping you to create an issue which can be processes as quickly as possible. Feel free to add additional information or remove not relevant points if you do not need them.
-
+
-### Expected behaviour
+##### System information (version)
+
-### Actual behaviour
+- OpenCV => :grey_question:
+- Operating System / Platform => :grey_question:
+- Compiler => :grey_question:
-### Additional description
+##### Detailed description
-### Code example to reproduce the issue / Steps to reproduce the issue
-Please try to give a full example which will compile as is.
-```
-#include "opencv2/core.hpp"
-#include <iostream>
-using namespace std;
-using namespace cv;
+
-int main()
-{
-    double d[] = { 546,2435,7,4534,23423,3 };
-    cout << "d = 0x" << reinterpret_cast<void*>(d) << endl;
+##### Steps to reproduce
-    return 0;
-}
-```
+
\ No newline at end of file
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 496d748731..210a253113 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,4 +1,9 @@
-resolves #XXXX
+
-### What does this PR change?
-Please add your changes here.
+### This pull request changes
+
+
diff --git a/3rdparty/carotene/.gitignore b/3rdparty/carotene/.gitignore
new file mode 100644
index 0000000000..062445879b
--- /dev/null
+++ b/3rdparty/carotene/.gitignore
@@ -0,0 +1,8 @@
+# Gedit temp files
+*~
+
+# Qt Creator file
+*.user
+
+# MacOS-specific (Desktop Services Store)
+.DS_Store
diff --git a/3rdparty/carotene/CMakeLists.txt b/3rdparty/carotene/CMakeLists.txt
new file mode 100644
index 0000000000..4dd7807c61
--- /dev/null
+++ b/3rdparty/carotene/CMakeLists.txt
@@ -0,0 +1,42 @@
+cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
+
+project(Carotene)
+
+set(CAROTENE_NS "carotene" CACHE STRING "Namespace for Carotene definitions")
+
+set(CAROTENE_INCLUDE_DIR include)
+set(CAROTENE_SOURCE_DIR src)
+
+file(GLOB_RECURSE carotene_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_INCLUDE_DIR}/*.hpp")
+file(GLOB_RECURSE carotene_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_SOURCE_DIR}/*.cpp"
+                                            "${CAROTENE_SOURCE_DIR}/*.hpp")
+
+include_directories(${CAROTENE_INCLUDE_DIR})
+
+if(CMAKE_COMPILER_IS_GNUCC)
+    set(CMAKE_CXX_FLAGS "-fvisibility=hidden ${CMAKE_CXX_FLAGS}")
+
+    # allow more inlines - these parameters improve performance for:
+    # - matchTemplate about 5-10%
+    # - goodFeaturesToTrack 10-20%
+    # - cornerHarris 30% for some cases
+
+    set_source_files_properties(${carotene_sources} COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
+endif()
+
+add_library(carotene_objs OBJECT
+    ${carotene_headers}
+    ${carotene_sources}
+)
+
+if(NOT CAROTENE_NS STREQUAL "carotene")
+    target_compile_definitions(carotene_objs PUBLIC "-DCAROTENE_NS=${CAROTENE_NS}")
+endif()
+
+if(WITH_NEON)
+    target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
+endif()
+
+set_target_properties(carotene_objs PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+
+add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>")
diff --git a/3rdparty/carotene/README.md b/3rdparty/carotene/README.md
new file mode 100644
index 0000000000..fbaae5e970
--- /dev/null
+++ b/3rdparty/carotene/README.md
@@ -0,0 +1,2 @@
+This is Carotene, a low-level library containing optimized CPU routines
+that are useful for computer vision algorithms.
diff --git a/3rdparty/carotene/hal/CMakeLists.txt b/3rdparty/carotene/hal/CMakeLists.txt
new file mode 100644
index 0000000000..2fb92b907b
--- /dev/null
+++ b/3rdparty/carotene/hal/CMakeLists.txt
@@ -0,0 +1,112 @@
+cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
+
+include(CheckCCompilerFlag)
+include(CheckCXXCompilerFlag)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(TEGRA_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
+set(CAROTENE_DIR "${TEGRA_HAL_DIR}/../")
+
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
+    set(ARM TRUE)
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64.*|AARCH64.*")
+    set(AARCH64 TRUE)
+endif()
+
+set(TEGRA_COMPILER_FLAGS "")
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+    # Generate unwind information even for functions that can't throw/propagate exceptions.
+    # This lets debuggers and such get non-broken backtraces for such functions, even without debugging symbols.
+    list(APPEND TEGRA_COMPILER_FLAGS -funwind-tables)
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+    if(X86 OR ARMEABI_V6 OR (MIPS AND ANDROID_COMPILER_VERSION VERSION_LESS "4.6"))
+        list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
+    else()
+        list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched2-use-superblocks -fsched2-use-traces
+                                         -fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
+    endif()
+    if((ANDROID_COMPILER_IS_CLANG OR NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.7") AND ANDROID_NDK_RELEASE STRGREATER "r8d" )
+        list(APPEND TEGRA_COMPILER_FLAGS -fgraphite -fgraphite-identity -floop-block -floop-flatten -floop-interchange
+                                         -floop-strip-mine -floop-parallelize-all -ftree-loop-linear)
+    endif()
+endif()
+
+string(REPLACE ";" " " TEGRA_COMPILER_FLAGS "${TEGRA_COMPILER_FLAGS}")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TEGRA_COMPILER_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TEGRA_COMPILER_FLAGS}")
+
+if(ARMEABI_V7A)
+    if (CMAKE_COMPILER_IS_GNUCXX)
+        set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize" )
+        set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-tree-vectorize" )
+    endif()
+endif()
+
+if(WITH_LOGS)
+    add_definitions(-DHAVE_LOGS)
+endif()
+
+set(CAROTENE_NS "carotene_o4t" CACHE STRING "" FORCE)
+
+function(compile_carotene)
+    if(ENABLE_NEON)
+        set(WITH_NEON ON)
+    endif()
+
+    add_subdirectory("${CAROTENE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/carotene")
+
+    if(ARM OR AARCH64)
+        if(CMAKE_BUILD_TYPE)
+            set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
+        endif()
+        check_cxx_compiler_flag("-mfpu=neon" CXX_HAS_MFPU_NEON)
+        check_c_compiler_flag("-mfpu=neon" C_HAS_MFPU_NEON)
+        if(${CXX_HAS_MFPU_NEON} AND ${C_HAS_MFPU_NEON})
+            get_target_property(old_flags "carotene_objs" COMPILE_FLAGS)
+            if(old_flags)
+                set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "${old_flags} -mfpu=neon")
+            else()
+                set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "-mfpu=neon")
+            endif()
+        endif()
+    endif()
+endfunction()
+
+compile_carotene()
+
+include_directories("${CAROTENE_DIR}/include")
+
+get_target_property(carotene_defs carotene_objs INTERFACE_COMPILE_DEFINITIONS)
+set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs})
+
+  if (CMAKE_COMPILER_IS_GNUCXX)
+    # allow more inlines - these parameters improve performance for:
+    # matchTemplate about 5-10%
+    # goodFeaturesToTrack 10-20%
+    # cornerHarris 30% for some cases
+    set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param
large-stack-frame-growth=5000") +# set_source_files_properties(impl.cpp $ COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000") + endif() + +add_library(tegra_hal STATIC $) +set_target_properties(tegra_hal PROPERTIES POSITION_INDEPENDENT_CODE TRUE) +set_target_properties(tegra_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}) +set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}") +if(NOT BUILD_SHARED_LIBS) + ocv_install_target(tegra_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) +endif() +target_include_directories(tegra_hal PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_SRC_DIR}/modules/core/include) + +set(CAROTENE_HAL_VERSION "0.0.1" PARENT_SCOPE) +set(CAROTENE_HAL_LIBRARIES "tegra_hal" PARENT_SCOPE) +set(CAROTENE_HAL_HEADERS "carotene/tegra_hal.hpp" PARENT_SCOPE) +set(CAROTENE_HAL_INCLUDE_DIRS "${CMAKE_BINARY_DIR}" PARENT_SCOPE) + +configure_file("tegra_hal.hpp" "${CMAKE_BINARY_DIR}/carotene/tegra_hal.hpp" COPYONLY) +configure_file("${CAROTENE_DIR}/include/carotene/definitions.hpp" "${CMAKE_BINARY_DIR}/carotene/definitions.hpp" COPYONLY) +configure_file("${CAROTENE_DIR}/include/carotene/functions.hpp" "${CMAKE_BINARY_DIR}/carotene/functions.hpp" COPYONLY) +configure_file("${CAROTENE_DIR}/include/carotene/types.hpp" "${CMAKE_BINARY_DIR}/carotene/types.hpp" COPYONLY) diff --git a/3rdparty/carotene/hal/tegra_hal.hpp b/3rdparty/carotene/hal/tegra_hal.hpp new file mode 100644 index 0000000000..401f521a64 --- /dev/null +++ b/3rdparty/carotene/hal/tegra_hal.hpp @@ -0,0 +1,1851 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2016, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#ifndef _tegra_hal_H_INCLUDED_ +#define _tegra_hal_H_INCLUDED_ + +#define CAROTENE_NS carotene_o4t + +#include "carotene/functions.hpp" +#include +#include +#include +#include + +#define RANGE_DATA(type, base, step) reinterpret_cast(const_cast(reinterpret_cast(base)) + static_cast(range.start) * step) + +#define PARALLEL_CORE 0 +#if PARALLEL_CORE + +#define SRC_ARG1 ST * src1_data_, size_t src1_step_, +#define SRC_STORE1 src1_data(src1_data_), src1_step(src1_step_), +#define SRC_VAR1 ST * src1_data; \ + size_t src1_step; +#define SRC_ARG2 ST * src1_data_, size_t src1_step_, \ + ST * src2_data_, size_t src2_step_, +#define SRC_STORE2 src1_data(src1_data_), src1_step(src1_step_), \ + src2_data(src2_data_), src2_step(src2_step_), +#define SRC_VAR2 ST * src1_data; \ + size_t src1_step; \ + ST * src2_data; \ + size_t src2_step; + +#define DST_ARG1 DT * dst1_data_, size_t dst1_step_, +#define DST_STORE1 dst1_data(dst1_data_), dst1_step(dst1_step_), +#define DST_VAR1 DT * dst1_data; \ + size_t dst1_step; + +#define SCALE_ARG0 +#define SCALE_STORE0 +#define SCALE_VAR0 +#define SCALE_ARG1 , double scale_ +#define SCALE_STORE1 , scale(scale_) +#define SCALE_VAR1 double scale; +#define SCALE_ARG3 , const double *scales_ +#define SCALE_STORE3 , scales(scales_, scales_ + 3) +#define SCALE_VAR3 std::vector scales; + +#define TegraGenOp_Invoker(name, func, src_cnt, dst_cnt, scale_cnt, ...) \ +template \ +class TegraGenOp_##name##_Invoker : public cv::ParallelLoopBody \ +{ \ +public: \ + TegraGenOp_##name##_Invoker(SRC_ARG##src_cnt \ + DST_ARG##dst_cnt \ + int width_, int height_ \ + SCALE_ARG##scale_cnt) : \ + cv::ParallelLoopBody(), SRC_STORE##src_cnt \ + DST_STORE##dst_cnt \ + width(width_), height(height_) \ + SCALE_STORE##scale_cnt {} \ + virtual void operator()(const cv::Range& range) const \ + { \ + CAROTENE_NS::func(CAROTENE_NS::Size2D(width, range.end-range.start), __VA_ARGS__); \ + } \ +private: \ + SRC_VAR##src_cnt \ + DST_VAR##dst_cnt \ + int width, height; \ + SCALE_VAR##scale_cnt \ + const TegraGenOp_##name##_Invoker& operator= (const TegraGenOp_##name##_Invoker&); \ +}; + +#define TegraBinaryOp_Invoker(name, func) TegraGenOp_Invoker(name, func, 2, 1, 0, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step ) + +#define TegraBinaryOp_InvokerVAArg(name, func, ...) TegraGenOp_Invoker(name, func, 2, 1, 0, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step, __VA_ARGS__) + +#define TEGRA_BINARYOP(type, op, src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? 
\ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraBinaryOp_InvokerVAArg(add, add, CAROTENE_NS::CONVERT_POLICY_SATURATE) /*Original addition use saturated operator, so use the same from CAROTENE*/ + +TegraBinaryOp_Invoker(addf, add) + +TegraBinaryOp_InvokerVAArg(sub, sub, CAROTENE_NS::CONVERT_POLICY_SATURATE) /*Original addition use saturated operator, so use the same from CAROTENE*/ + +TegraBinaryOp_Invoker(subf, sub) + +TegraBinaryOp_Invoker(max, max) + +TegraBinaryOp_Invoker(min, min) + +TegraBinaryOp_Invoker(absDiff, absDiff) + +TegraBinaryOp_Invoker(bitwiseAnd, bitwiseAnd) + +TegraBinaryOp_Invoker(bitwiseOr, bitwiseOr) + +TegraBinaryOp_Invoker(bitwiseXor, bitwiseXor) + +#define TegraUnaryOp_Invoker(name, func) TegraGenOp_Invoker(name, func, 1, 1, 0, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step ) + +TegraUnaryOp_Invoker(bitwiseNot, bitwiseNot) +#define TEGRA_UNARYOP(type, op, src1, sz1, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_add8u +#define cv_hal_add8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add8s +#define cv_hal_add8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add16u +#define cv_hal_add16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add16s +#define cv_hal_add16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add32s +#define cv_hal_add32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add32f +#define cv_hal_add32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, addf, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_add64f +//#define cv_hal_add64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, addf, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub8u +#define cv_hal_sub8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub8s +#define cv_hal_sub8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub16u +#define cv_hal_sub16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub16s +#define cv_hal_sub16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub32s +#define cv_hal_sub32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub32f +#define cv_hal_sub32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, subf, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_sub64f +//#define cv_hal_sub64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, subf, src1, sz1, src2, sz2, dst, 
sz, w, h) +#undef cv_hal_max8u +#define cv_hal_max8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max8s +#define cv_hal_max8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max16u +#define cv_hal_max16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max16s +#define cv_hal_max16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max32s +#define cv_hal_max32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max32f +#define cv_hal_max32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, max, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_max64f +//#define cv_hal_max64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min8u +#define cv_hal_min8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min8s +#define cv_hal_min8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min16u +#define cv_hal_min16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min16s +#define cv_hal_min16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min32s +#define cv_hal_min32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min32f +#define cv_hal_min32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, min, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_min64f +//#define cv_hal_min64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff8u +#define cv_hal_absdiff8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff8s +#define cv_hal_absdiff8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff16u +#define cv_hal_absdiff16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff16s +#define cv_hal_absdiff16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff32s +#define cv_hal_absdiff32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff32f +#define cv_hal_absdiff32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_absdiff64f +//#define cv_hal_absdiff64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_and8u +#define cv_hal_and8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, bitwiseAnd, src1, sz1, src2, sz2, dst, sz, w, h) +#undef 
cv_hal_or8u +#define cv_hal_or8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, bitwiseOr, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_xor8u +#define cv_hal_xor8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, bitwiseXor, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_not8u +#define cv_hal_not8u(src1, sz1, dst, sz, w, h) TEGRA_UNARYOP(CAROTENE_NS::u8, bitwiseNot, src1, sz1, dst, sz, w, h) + +TegraBinaryOp_Invoker(cmpEQ, cmpEQ) +TegraBinaryOp_Invoker(cmpNE, cmpNE) +TegraBinaryOp_Invoker(cmpGT, cmpGT) +TegraBinaryOp_Invoker(cmpGE, cmpGE) +TegraGenOp_Invoker(cmpLT, cmpGT, 2, 1, 0, RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step) +TegraGenOp_Invoker(cmpLE, cmpGE, 2, 1, 0, RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step) +#define TEGRA_CMP(type, src1, sz1, src2, sz2, dst, sz, w, h, op) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + ((op) == cv::CMP_EQ) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpEQ_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_NE) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpNE_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GT) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpGT_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GE) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpGE_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LT) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpLT_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LE) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpLE_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_cmp8u +#define cv_hal_cmp8u(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::u8, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp8s +#define cv_hal_cmp8s(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::s8, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp16u +#define cv_hal_cmp16u(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::u16, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp16s +#define cv_hal_cmp16s(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::s16, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp32s +#define cv_hal_cmp32s(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::s32, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp32f +#define cv_hal_cmp32f(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::f32, src1, sz1, src2, sz2, dst, sz, w, h, op) +//#undef cv_hal_cmp64f +//#define cv_hal_cmp64f(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::f64, src1, sz1, src2, sz2, dst, sz, w, h, op) + +#define TegraBinaryOpScale_Invoker(name, func, scale_cnt, ...) 
TegraGenOp_Invoker(name, func, 2, 1, scale_cnt, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step, __VA_ARGS__) + +#define TEGRA_BINARYOPSCALE(type, op, src1, sz1, src2, sz2, dst, sz, w, h, scales) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, src2, sz2, dst, sz, w, h, scales), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraBinaryOpScale_Invoker(mul, mul, 1, scale, CAROTENE_NS::CONVERT_POLICY_SATURATE) + +TegraBinaryOpScale_Invoker(mulf, mul, 1, scale) + +TegraBinaryOpScale_Invoker(div, div, 1, scale, CAROTENE_NS::CONVERT_POLICY_SATURATE) + +TegraBinaryOpScale_Invoker(divf, div, 1, scale) + +#define TegraUnaryOpScale_Invoker(name, func, scale_cnt, ...) TegraGenOp_Invoker(name, func, 1, 1, scale_cnt, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step, __VA_ARGS__) + +#define TEGRA_UNARYOPSCALE(type, op, src1, sz1, dst, sz, w, h, scales) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, dst, sz, w, h, scales), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraUnaryOpScale_Invoker(recip, reciprocal, 1, scale, CAROTENE_NS::CONVERT_POLICY_SATURATE) + +TegraUnaryOpScale_Invoker(recipf, reciprocal, 1, scale) + +#undef cv_hal_mul8u +#define cv_hal_mul8u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u8, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul8s +#define cv_hal_mul8s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s8, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul16u +#define cv_hal_mul16u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u16, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul16s +#define cv_hal_mul16s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s16, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul32s +#define cv_hal_mul32s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s32, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul32f +#define cv_hal_mul32f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f32, mulf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_mul64f +//#define cv_hal_mul64f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f64, mulf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div8u +#define cv_hal_div8u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u8, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div8s +#define cv_hal_div8s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s8, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div16u +#define cv_hal_div16u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u16, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div16s +#define cv_hal_div16s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s16, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div32s +#define cv_hal_div32s(src1, sz1, 
src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s32, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div32f +#define cv_hal_div32f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f32, divf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_div64f +//#define cv_hal_div64f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f64, divf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_recip8u +#define cv_hal_recip8u(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::u8, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip8s +#define cv_hal_recip8s(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::s8, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip16u +#define cv_hal_recip16u(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::u16, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip16s +#define cv_hal_recip16s(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::s16, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip32s +#define cv_hal_recip32s(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::s32, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip32f +#define cv_hal_recip32f(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::f32, recipf, src1, sz1, dst, sz, w, h, scales) +//#undef cv_hal_recip64f +//#define cv_hal_recip64f(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::f64, recipf, src1, sz1, dst, sz, w, h, scales) + +TegraBinaryOpScale_Invoker(addWeighted, addWeighted, 3, scales[0], scales[1], scales[2]) + +#undef cv_hal_addWeighted8u +#define cv_hal_addWeighted8u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u8, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted8s +#define cv_hal_addWeighted8s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s8, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted16u +#define cv_hal_addWeighted16u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u16, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted16s +#define cv_hal_addWeighted16s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s16, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted32s +#define cv_hal_addWeighted32s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s32, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_addWeighted32f +//#define cv_hal_addWeighted32f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f32, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_addWeighted64f +//#define cv_hal_addWeighted64f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f64, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) + +#else + +#define TEGRA_ADD(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? 
\ + CAROTENE_NS::add(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), /*Original addition use saturated operator*/ \ + /*so use the same from CAROTENE*/ \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_ADDF(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::add(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_SUB(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::sub(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), /*Original addition use saturated operator*/ \ + /*so use the same from CAROTENE*/ \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_SUBF(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::sub(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MAX(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::max(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MIN(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::min(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_ABSDIFF(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::absDiff(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_AND(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::bitwiseAnd(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) +#define TEGRA_OR(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::bitwiseOr(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_XOR(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::bitwiseXor(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_NOT(src1, sz1, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? 
\ + CAROTENE_NS::bitwiseNot(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_add8u +#define cv_hal_add8u TEGRA_ADD +#undef cv_hal_add8s +#define cv_hal_add8s TEGRA_ADD +#undef cv_hal_add16u +#define cv_hal_add16u TEGRA_ADD +#undef cv_hal_add16s +#define cv_hal_add16s TEGRA_ADD +#undef cv_hal_add32s +#define cv_hal_add32s TEGRA_ADD +#undef cv_hal_add32f +#define cv_hal_add32f TEGRA_ADDF +//#undef cv_hal_add64f +//#define cv_hal_add64f TEGRA_ADDF +#undef cv_hal_sub8u +#define cv_hal_sub8u TEGRA_SUB +#undef cv_hal_sub8s +#define cv_hal_sub8s TEGRA_SUB +#undef cv_hal_sub16u +#define cv_hal_sub16u TEGRA_SUB +#undef cv_hal_sub16s +#define cv_hal_sub16s TEGRA_SUB +#undef cv_hal_sub32s +#define cv_hal_sub32s TEGRA_SUB +#undef cv_hal_sub32f +#define cv_hal_sub32f TEGRA_SUBF +//#undef cv_hal_sub64f +//#define cv_hal_sub64f TEGRA_SUBF +#undef cv_hal_max8u +#define cv_hal_max8u TEGRA_MAX +#undef cv_hal_max8s +#define cv_hal_max8s TEGRA_MAX +#undef cv_hal_max16u +#define cv_hal_max16u TEGRA_MAX +#undef cv_hal_max16s +#define cv_hal_max16s TEGRA_MAX +#undef cv_hal_max32s +#define cv_hal_max32s TEGRA_MAX +#undef cv_hal_max32f +#define cv_hal_max32f TEGRA_MAX +//#undef cv_hal_max64f +//#define cv_hal_max64f TEGRA_MAX +#undef cv_hal_min8u +#define cv_hal_min8u TEGRA_MIN +#undef cv_hal_min8s +#define cv_hal_min8s TEGRA_MIN +#undef cv_hal_min16u +#define cv_hal_min16u TEGRA_MIN +#undef cv_hal_min16s +#define cv_hal_min16s TEGRA_MIN +#undef cv_hal_min32s +#define cv_hal_min32s TEGRA_MIN +#undef cv_hal_min32f +#define cv_hal_min32f TEGRA_MIN +//#undef cv_hal_min64f +//#define cv_hal_min64f TEGRA_MIN +#undef cv_hal_absdiff8u +#define cv_hal_absdiff8u TEGRA_ABSDIFF +#undef cv_hal_absdiff8s +#define cv_hal_absdiff8s TEGRA_ABSDIFF +#undef cv_hal_absdiff16u +#define cv_hal_absdiff16u TEGRA_ABSDIFF +#undef cv_hal_absdiff16s +#define cv_hal_absdiff16s TEGRA_ABSDIFF +#undef cv_hal_absdiff32s +#define cv_hal_absdiff32s TEGRA_ABSDIFF +#undef cv_hal_absdiff32f +#define cv_hal_absdiff32f TEGRA_ABSDIFF +//#undef cv_hal_absdiff64f +//#define cv_hal_absdiff64f TEGRA_ABSDIFF +#undef cv_hal_and8u +#define cv_hal_and8u TEGRA_AND +#undef cv_hal_or8u +#define cv_hal_or8u TEGRA_OR +#undef cv_hal_xor8u +#define cv_hal_xor8u TEGRA_XOR +#undef cv_hal_not8u +#define cv_hal_not8u TEGRA_NOT + +#define TEGRA_CMP(src1, sz1, src2, sz2, dst, sz, w, h, op) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + ((op) == cv::CMP_EQ) ? \ + CAROTENE_NS::cmpEQ(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_NE) ? \ + CAROTENE_NS::cmpNE(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GT) ? \ + CAROTENE_NS::cmpGT(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GE) ? \ + CAROTENE_NS::cmpGE(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LT) ? \ + CAROTENE_NS::cmpGT(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + src1, sz1, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LE) ? 
\ + CAROTENE_NS::cmpGE(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + src1, sz1, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_cmp8u +#define cv_hal_cmp8u TEGRA_CMP +#undef cv_hal_cmp8s +#define cv_hal_cmp8s TEGRA_CMP +#undef cv_hal_cmp16u +#define cv_hal_cmp16u TEGRA_CMP +#undef cv_hal_cmp16s +#define cv_hal_cmp16s TEGRA_CMP +#undef cv_hal_cmp32s +#define cv_hal_cmp32s TEGRA_CMP +#undef cv_hal_cmp32f +#define cv_hal_cmp32f TEGRA_CMP +//#undef cv_hal_cmp64f +//#define cv_hal_cmp64f TEGRA_CMP + +#define TEGRA_MUL(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::mul(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + scale, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MULF(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::mul(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + (float)scale), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_DIV(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::div(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + scale, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_DIVF(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::div(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + (float)scale), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_RECIP(src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::reciprocal(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + dst, sz, \ + scale, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_RECIPF(src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? 
\ + CAROTENE_NS::reciprocal(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + dst, sz, \ + (float)scale), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_mul8u +#define cv_hal_mul8u TEGRA_MUL +#undef cv_hal_mul8s +#define cv_hal_mul8s TEGRA_MUL +#undef cv_hal_mul16u +#define cv_hal_mul16u TEGRA_MUL +#undef cv_hal_mul16s +#define cv_hal_mul16s TEGRA_MUL +#undef cv_hal_mul32s +#define cv_hal_mul32s TEGRA_MUL +#undef cv_hal_mul32f +#define cv_hal_mul32f TEGRA_MULF +//#undef cv_hal_mul64f +//#define cv_hal_mul64f TEGRA_MULF +#undef cv_hal_div8u +#define cv_hal_div8u TEGRA_DIV +#undef cv_hal_div8s +#define cv_hal_div8s TEGRA_DIV +#undef cv_hal_div16u +#define cv_hal_div16u TEGRA_DIV +#undef cv_hal_div16s +#define cv_hal_div16s TEGRA_DIV +#undef cv_hal_div32s +#define cv_hal_div32s TEGRA_DIV +#undef cv_hal_div32f +#define cv_hal_div32f TEGRA_DIVF +//#undef cv_hal_div64f +//#define cv_hal_div64f TEGRA_DIVF +#undef cv_hal_recip8u +#define cv_hal_recip8u TEGRA_RECIP +#undef cv_hal_recip8s +#define cv_hal_recip8s TEGRA_RECIP +#undef cv_hal_recip16u +#define cv_hal_recip16u TEGRA_RECIP +#undef cv_hal_recip16s +#define cv_hal_recip16s TEGRA_RECIP +#undef cv_hal_recip32s +#define cv_hal_recip32s TEGRA_RECIP +#undef cv_hal_recip32f +#define cv_hal_recip32f TEGRA_RECIPF +//#undef cv_hal_recip64f +//#define cv_hal_recip64f TEGRA_RECIPF + +#define TEGRA_ADDWEIGHTED(src1, sz1, src2, sz2, dst, sz, w, h, scales) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::addWeighted(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + ((double *)scales)[0], ((double *)scales)[1], ((double *)scales)[2]), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_addWeighted8u +#define cv_hal_addWeighted8u TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted8s +#define cv_hal_addWeighted8s TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted16u +#define cv_hal_addWeighted16u TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted16s +#define cv_hal_addWeighted16s TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted32s +#define cv_hal_addWeighted32s TEGRA_ADDWEIGHTED +//#undef cv_hal_addWeighted32f +//#define cv_hal_addWeighted32f TEGRA_ADDWEIGHTED +//#undef cv_hal_addWeighted64f +//#define cv_hal_addWeighted64f TEGRA_ADDWEIGHTED + +#endif //PARALLEL_CORE + +#define ROW_SRC_ARG1 const ST * src1_data_ +#define ROW_SRC_STORE1 , src1_data(src1_data_) +#define ROW_SRC_VAR1 const ST * src1_data; +#define ROW_SRC_ARG2 ROW_SRC_ARG1 \ + , const ST * src2_data_ +#define ROW_SRC_STORE2 ROW_SRC_STORE1 \ + , src2_data(src2_data_) +#define ROW_SRC_VAR2 ROW_SRC_VAR1 \ + const ST * src2_data; +#define ROW_SRC_ARG3 ROW_SRC_ARG2 \ + , const ST * src3_data_ +#define ROW_SRC_STORE3 ROW_SRC_STORE2 \ + , src3_data(src3_data_) +#define ROW_SRC_VAR3 ROW_SRC_VAR2 \ + const ST * src3_data; +#define ROW_SRC_ARG4 ROW_SRC_ARG3 \ + , const ST * src4_data_ +#define ROW_SRC_STORE4 ROW_SRC_STORE3 \ + , src4_data(src4_data_) +#define ROW_SRC_VAR4 ROW_SRC_VAR3 \ + const ST * src4_data; + +#define ROW_DST_ARG1 , DT * dst1_data_ +#define ROW_DST_STORE1 , dst1_data(dst1_data_) +#define ROW_DST_VAR1 DT * dst1_data; +#define ROW_DST_ARG2 ROW_DST_ARG1 \ + , DT * dst2_data_ +#define ROW_DST_STORE2 ROW_DST_STORE1 \ + , dst2_data(dst2_data_) +#define ROW_DST_VAR2 ROW_DST_VAR1 \ + DT * dst2_data; +#define ROW_DST_ARG3 ROW_DST_ARG2 \ + , DT * dst3_data_ +#define ROW_DST_STORE3 ROW_DST_STORE2 \ + , dst3_data(dst3_data_) +#define ROW_DST_VAR3 ROW_DST_VAR2 \ + DT * dst3_data; +#define ROW_DST_ARG4 ROW_DST_ARG3 \ + , DT * 
dst4_data_ +#define ROW_DST_STORE4 ROW_DST_STORE3 \ + , dst4_data(dst4_data_) +#define ROW_DST_VAR4 ROW_DST_VAR3 \ + DT * dst4_data; + +#define ROW_VAL_ARG0 +#define ROW_VAL_STORE0 +#define ROW_VAL_VAR0 +#define ROW_VAL_ARG1 , double val_ +#define ROW_VAL_STORE1 , val(val_) +#define ROW_VAL_VAR1 double val; + +#define TegraRowOp_Invoker(name, func, src_cnt, dst_cnt, val_cnt, ...) \ +template \ +class TegraRowOp_##name##_Invoker : public cv::ParallelLoopBody \ +{ \ +public: \ + TegraRowOp_##name##_Invoker(ROW_SRC_ARG##src_cnt \ + ROW_DST_ARG##dst_cnt \ + ROW_VAL_ARG##val_cnt) : \ + cv::ParallelLoopBody() ROW_SRC_STORE##src_cnt \ + ROW_DST_STORE##dst_cnt \ + ROW_VAL_STORE##val_cnt {} \ + virtual void operator()(const cv::Range& range) const \ + { \ + CAROTENE_NS::func(CAROTENE_NS::Size2D(range.end-range.start, 1), __VA_ARGS__); \ + } \ +private: \ + ROW_SRC_VAR##src_cnt \ + ROW_DST_VAR##dst_cnt \ + ROW_VAL_VAR##val_cnt \ + const TegraRowOp_##name##_Invoker& operator= (const TegraRowOp_##name##_Invoker&); \ +}; + + +#define TEGRA_SPLIT(src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + CAROTENE_NS::split2(CAROTENE_NS::Size2D(len, 1), \ + src, len, \ + dst[0], len, \ + dst[1], len), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? \ + CAROTENE_NS::split3(CAROTENE_NS::Size2D(len, 1), \ + src, len, \ + dst[0], len, \ + dst[1], len, \ + dst[2], len), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + CAROTENE_NS::split4(CAROTENE_NS::Size2D(len, 1), \ + src, len, \ + dst[0], len, \ + dst[1], len, \ + dst[2], len, \ + dst[3], len), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraRowOp_Invoker(split2, split2, 1, 2, 0, RANGE_DATA(ST, src1_data, 2*sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst2_data, sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(split3, split3, 1, 3, 0, RANGE_DATA(ST, src1_data, 3*sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst2_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst3_data, sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(split4, split4, 1, 4, 0, RANGE_DATA(ST, src1_data, 4*sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst2_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst3_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst4_data, sizeof(DT)), range.end-range.start) +#define TEGRA_SPLIT64S(type, src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_split2_Invoker(src, dst[0], dst[1]), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_split3_Invoker(src, dst[0], dst[1], dst[2]), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_split4_Invoker(src, dst[0], dst[1], dst[2], dst[3]), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MERGE(src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + CAROTENE_NS::combine2(CAROTENE_NS::Size2D(len, 1), \ + src[0], len, \ + src[1], len, \ + dst, len), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? 
\ + CAROTENE_NS::combine3(CAROTENE_NS::Size2D(len, 1), \ + src[0], len, \ + src[1], len, \ + src[2], len, \ + dst, len), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + CAROTENE_NS::combine4(CAROTENE_NS::Size2D(len, 1), \ + src[0], len, \ + src[1], len, \ + src[2], len, \ + src[3], len, \ + dst, len), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraRowOp_Invoker(combine2, combine2, 2, 1, 0, RANGE_DATA(ST, src1_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, 2*sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(combine3, combine3, 3, 1, 0, RANGE_DATA(ST, src1_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src3_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, 3*sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(combine4, combine4, 4, 1, 0, RANGE_DATA(ST, src1_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src3_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src4_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, 4*sizeof(DT)), range.end-range.start) +#define TEGRA_MERGE64S(type, src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_combine2_Invoker(src[0], src[1], dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_combine3_Invoker(src[0], src[1], src[2], dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_combine4_Invoker(src[0], src[1], src[2], src[3], dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_split8u +#define cv_hal_split8u TEGRA_SPLIT +#undef cv_hal_split16u +#define cv_hal_split16u TEGRA_SPLIT +#undef cv_hal_split32s +#define cv_hal_split32s TEGRA_SPLIT +#undef cv_hal_split64s +#define cv_hal_split64s(src, dst, len, cn) TEGRA_SPLIT64S(CAROTENE_NS::s64, src, dst, len, cn) + +#undef cv_hal_merge8u +#define cv_hal_merge8u TEGRA_MERGE +#undef cv_hal_merge16u +#define cv_hal_merge16u TEGRA_MERGE +#undef cv_hal_merge32s +#define cv_hal_merge32s TEGRA_MERGE +#undef cv_hal_merge64s +#define cv_hal_merge64s(src, dst, len, cn) TEGRA_MERGE64S(CAROTENE_NS::s64, src, dst, len, cn) + + +TegraRowOp_Invoker(phase, phase, 2, 1, 1, RANGE_DATA(ST, src1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, val) +#define TEGRA_FASTATAN(y, x, dst, len, angleInDegrees) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_phase_Invoker(x, y, dst, angleInDegrees ? 
1.0f : M_PI/180), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_fastAtan32f +#define cv_hal_fastAtan32f TEGRA_FASTATAN + +TegraRowOp_Invoker(magnitude, magnitude, 2, 1, 0, RANGE_DATA(ST, src1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start) +#define TEGRA_MAGNITUDE(x, y, dst, len) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_magnitude_Invoker(x, y, dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_magnitude32f +#define cv_hal_magnitude32f TEGRA_MAGNITUDE + + +#if defined OPENCV_IMGPROC_HAL_INTERFACE_H + +struct cvhalFilter2D; + +struct FilterCtx +{ + CAROTENE_NS::Size2D ksize; + CAROTENE_NS::s16* kernel_data; + CAROTENE_NS::BORDER_MODE border; +}; +inline int TEGRA_FILTERINIT(cvhalFilter2D **context, uchar *kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, + int max_width, int max_height, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool allowSubmatrix, bool allowInplace) +{ + if(!context || !kernel_data || allowSubmatrix || allowInplace || + src_type != CV_8UC1 || dst_type != CV_8UC1 || + delta != 0 || anchor_x != kernel_width / 2 || anchor_y != kernel_height / 2 ) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + FilterCtx* ctx = new FilterCtx; + if(!ctx) + return CV_HAL_ERROR_UNKNOWN; + ctx->ksize.width = kernel_width; + ctx->ksize.height = kernel_height; + switch(borderType) + { + case CV_HAL_BORDER_CONSTANT: + ctx->border = CAROTENE_NS::BORDER_MODE_CONSTANT; + break; + case CV_HAL_BORDER_REPLICATE: + ctx->border = CAROTENE_NS::BORDER_MODE_REPLICATE; + break; + case CV_HAL_BORDER_REFLECT: + ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT; + break; + case CV_HAL_BORDER_WRAP: + ctx->border = CAROTENE_NS::BORDER_MODE_WRAP; + break; + case CV_HAL_BORDER_REFLECT_101: + ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT101; + break; + default: + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + if(!CAROTENE_NS::isConvolutionSupported(CAROTENE_NS::Size2D(max_width, max_height), ctx->ksize, ctx->border)) + { + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + ctx->kernel_data = new CAROTENE_NS::s16[kernel_width*kernel_height]; + if(!ctx->kernel_data) + return CV_HAL_ERROR_UNKNOWN; + switch(kernel_type) + { + case CV_8UC1: + convert(ctx->ksize, (CAROTENE_NS::u8*)kernel_data, kernel_step, ctx->kernel_data, kernel_width); + break; + case CV_8SC1: + convert(ctx->ksize, (CAROTENE_NS::s8*)kernel_data, kernel_step, ctx->kernel_data, kernel_width); + break; + case CV_16UC1: + for(int j = 0; j < kernel_height; ++j) + { + std::memcpy(ctx->kernel_data + kernel_width * j, kernel_data + kernel_step * j, kernel_width * sizeof(int16_t)); + } + default: + delete[] ctx->kernel_data; + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + *context = (cvhalFilter2D*)(ctx); + return CV_HAL_ERROR_OK; +} +inline int TEGRA_FILTERFREE(cvhalFilter2D *context) +{ + if(context) + { + if(((FilterCtx*)context)->kernel_data) + delete[] ((FilterCtx*)context)->kernel_data; + delete (FilterCtx*)context; + return CV_HAL_ERROR_OK; + } + else + { + return CV_HAL_ERROR_UNKNOWN; + } +} +#define TEGRA_FILTERIMPL(context, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, 
offset_x, offset_y) \ +( \ + (void)full_width, (void)full_height, (void)offset_x, (void)offset_y, \ + context && CAROTENE_NS::isConvolutionSupported(CAROTENE_NS::Size2D(width, height), ((FilterCtx*)context)->ksize, ((FilterCtx*)context)->border) ? \ + CAROTENE_NS::convolution(CAROTENE_NS::Size2D(width, height), \ + src_data, src_step, \ + dst_data, dst_step, \ + ((FilterCtx*)context)->border, 0, \ + ((FilterCtx*)context)->ksize, ((FilterCtx*)context)->kernel_data, 1), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_filterInit +#define cv_hal_filterInit TEGRA_FILTERINIT +#undef cv_hal_filter +#define cv_hal_filter TEGRA_FILTERIMPL +#undef cv_hal_filterFree +#define cv_hal_filterFree TEGRA_FILTERFREE + + +struct SepFilterCtx +{ + int16_t kernelx_data[3]; + int16_t kernely_data[3]; + CAROTENE_NS::BORDER_MODE border; +}; +inline int TEGRA_SEPFILTERINIT(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, + uchar *kernelx_data, size_t , int kernelx_width, int kernelx_height, + uchar *kernely_data, size_t kernely_step, int kernely_width, int kernely_height, + int anchor_x, int anchor_y, double delta, int borderType) +{ + if(!context || !kernelx_data || !kernely_data || src_type != CV_8UC1 || dst_type != CV_16SC1 || + !(kernelx_width == 3 && kernelx_height == 1) || !(kernely_width == 1 && kernely_height == 3) || + delta != 0 || anchor_x != 1 || anchor_y != 1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + SepFilterCtx* ctx = new SepFilterCtx; + if(!ctx) + return CV_HAL_ERROR_UNKNOWN; + switch(borderType) + { + case CV_HAL_BORDER_CONSTANT: + ctx->border = CAROTENE_NS::BORDER_MODE_CONSTANT; + break; + case CV_HAL_BORDER_REPLICATE: + ctx->border = CAROTENE_NS::BORDER_MODE_REPLICATE; + break; + case CV_HAL_BORDER_REFLECT: + ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT; + break; + case CV_HAL_BORDER_WRAP: + ctx->border = CAROTENE_NS::BORDER_MODE_WRAP; + break; + case CV_HAL_BORDER_REFLECT_101: + ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT101; + break; + default: + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + if(!CAROTENE_NS::isSeparableFilter3x3Supported(CAROTENE_NS::Size2D(16, 16), ctx->border, 3, 3)) + { + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + switch(kernel_type) + { + case CV_8UC1: + ctx->kernelx_data[0]=kernelx_data[0]; + ctx->kernelx_data[1]=kernelx_data[1]; + ctx->kernelx_data[2]=kernelx_data[2]; + ctx->kernely_data[0]=kernely_data[0]; + ctx->kernely_data[1]=kernely_data[kernely_step]; + ctx->kernely_data[2]=kernely_data[2*kernely_step]; + break; + case CV_8SC1: + ctx->kernelx_data[0]=((char*)kernelx_data)[0]; + ctx->kernelx_data[1]=((char*)kernelx_data)[1]; + ctx->kernelx_data[2]=((char*)kernelx_data)[2]; + ctx->kernely_data[0]=((char*)kernely_data)[0]; + ctx->kernely_data[1]=((char*)(kernely_data+kernely_step))[0]; + ctx->kernely_data[2]=((char*)(kernely_data+2*kernely_step))[0]; + break; + case CV_16UC1: + ctx->kernelx_data[0]=((int16_t*)kernelx_data)[0]; + ctx->kernelx_data[1]=((int16_t*)kernelx_data)[1]; + ctx->kernelx_data[2]=((int16_t*)kernelx_data)[2]; + ctx->kernely_data[0]=((int16_t*)kernely_data)[0]; + ctx->kernely_data[1]=((int16_t*)(kernely_data+kernely_step))[0]; + ctx->kernely_data[2]=((int16_t*)(kernely_data+2*kernely_step))[0]; + default: + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + *context = (cvhalFilter2D*)(ctx); + return CV_HAL_ERROR_OK; +} +inline int TEGRA_SEPFILTERFREE(cvhalFilter2D *context) +{ + if(context) + { + delete (SepFilterCtx*)context; + return CV_HAL_ERROR_OK; + } 
+ else + { + return CV_HAL_ERROR_UNKNOWN; + } +} +#define TEGRA_SEPFILTERIMPL(context, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y) \ +( \ + context && CAROTENE_NS::isSeparableFilter3x3Supported(CAROTENE_NS::Size2D(width, height), ((SepFilterCtx*)context)->border, 3, 3, \ + CAROTENE_NS::Margin(offset_x, full_width - width - offset_x, offset_y, full_height - height - offset_y)) ? \ + CAROTENE_NS::SeparableFilter3x3(CAROTENE_NS::Size2D(width, height), \ + src_data, src_step, \ + (CAROTENE_NS::s16*)dst_data, dst_step, \ + 3, 3, ((SepFilterCtx*)context)->kernelx_data, ((SepFilterCtx*)context)->kernely_data, \ + ((SepFilterCtx*)context)->border, 0, \ + CAROTENE_NS::Margin(offset_x, full_width - width - offset_x, offset_y, full_height - height - offset_y)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_sepFilterInit +#define cv_hal_sepFilterInit TEGRA_SEPFILTERINIT +#undef cv_hal_sepFilter +#define cv_hal_sepFilter TEGRA_SEPFILTERIMPL +#undef cv_hal_sepFilterFree +#define cv_hal_sepFilterFree TEGRA_SEPFILTERFREE + + +struct MorphCtx +{ + int operation; + int channels; + CAROTENE_NS::Size2D ksize; + int anchor_x, anchor_y; + CAROTENE_NS::BORDER_MODE border; + uchar borderValues[4]; +}; +inline int TEGRA_MORPHINIT(cvhalFilter2D **context, int operation, int src_type, int dst_type, int, int, + int kernel_type, uchar *kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, + int borderType, const double borderValue[4], int iterations, bool allowSubmatrix, bool allowInplace) +{ + if(!context || !kernel_data || src_type != dst_type || + CV_MAT_DEPTH(src_type) != CV_8U || src_type < 0 || (src_type >> CV_CN_SHIFT) > 3 || + + allowSubmatrix || allowInplace || iterations != 1 || + !CAROTENE_NS::isSupportedConfiguration()) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + switch(CV_MAT_DEPTH(kernel_type)) + { + case CV_8U: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_16U: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (uint16_t*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_32S: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (int32_t*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_32F: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (float*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_64F: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (double*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + default: + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + MorphCtx* ctx = new MorphCtx; + if(!ctx) + return CV_HAL_ERROR_UNKNOWN; + ctx->channels = (src_type >> CV_CN_SHIFT) + 1; + ctx->ksize.width = kernel_width; + ctx->ksize.height = kernel_height; + ctx->anchor_x = anchor_x; + ctx->anchor_y = anchor_y; + switch(operation) + { + case MORPH_ERODE: + case MORPH_DILATE: + ctx->operation = operation; + break; + default: + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + switch(borderType) + { + case CV_HAL_BORDER_CONSTANT: + ctx->border = 
CAROTENE_NS::BORDER_MODE_CONSTANT;
+        if( borderValue[0] == DBL_MAX && borderValue[1] == DBL_MAX && borderValue[2] == DBL_MAX && borderValue[3] == DBL_MAX )
+        {
+            if( operation == MORPH_ERODE )
+                for(int i = 0; i < ctx->channels; ++i)
+                    ctx->borderValues[i] = (CAROTENE_NS::u8)UCHAR_MAX;
+            else
+                for(int i = 0; i < ctx->channels; ++i)
+                    ctx->borderValues[i] = 0;
+        }
+        else
+        {
+            for(int i = 0; i < ctx->channels; ++i)
+                ctx->borderValues[i] = (CAROTENE_NS::u8)cv::saturate_cast<CAROTENE_NS::u8>(borderValue[i]);
+        }
+        break;
+    case CV_HAL_BORDER_REPLICATE:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REPLICATE;
+        break;
+    case CV_HAL_BORDER_REFLECT:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT;
+        break;
+    case CV_HAL_BORDER_WRAP:
+        ctx->border = CAROTENE_NS::BORDER_MODE_WRAP;
+        break;
+    case CV_HAL_BORDER_REFLECT_101:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT101;
+        break;
+    default:
+        delete ctx;
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    *context = (cvhalFilter2D*)(ctx);
+    return CV_HAL_ERROR_OK;
+}
+inline int TEGRA_MORPHFREE(cvhalFilter2D *context)
+{
+    if(context)
+    {
+        delete (MorphCtx*)context;
+        return CV_HAL_ERROR_OK;
+    }
+    else
+    {
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+}
+#define TEGRA_MORPHIMPL(context, src_data, src_step, dst_data, dst_step, width, height, src_full_width, src_full_height, src_roi_x, src_roi_y, dst_full_width, dst_full_height, dst_roi_x, dst_roi_y) \
+( \
+    (void)dst_full_width, (void)dst_full_height, (void)dst_roi_x, (void)dst_roi_y, \
+    context && CAROTENE_NS::isSupportedConfiguration() ? \
+    ((MorphCtx*)context)->operation == MORPH_ERODE ? \
+    CAROTENE_NS::erode(CAROTENE_NS::Size2D(width, height), ((MorphCtx*)context)->channels, \
+                       src_data, src_step, dst_data, dst_step, \
+                       ((MorphCtx*)context)->ksize, ((MorphCtx*)context)->anchor_x, ((MorphCtx*)context)->anchor_y, \
+                       ((MorphCtx*)context)->border, ((MorphCtx*)context)->border, ((MorphCtx*)context)->borderValues, \
+                       CAROTENE_NS::Margin(src_roi_x, src_full_width - width - src_roi_x, src_roi_y, src_full_height - height - src_roi_y)), \
+    CV_HAL_ERROR_OK : \
+    ((MorphCtx*)context)->operation == MORPH_DILATE ? \
+    CAROTENE_NS::dilate(CAROTENE_NS::Size2D(width, height), ((MorphCtx*)context)->channels, \
+                        src_data, src_step, dst_data, dst_step, \
+                        ((MorphCtx*)context)->ksize, ((MorphCtx*)context)->anchor_x, ((MorphCtx*)context)->anchor_y, \
+                        ((MorphCtx*)context)->border, ((MorphCtx*)context)->border, ((MorphCtx*)context)->borderValues, \
+                        CAROTENE_NS::Margin(src_roi_x, src_full_width - width - src_roi_x, src_roi_y, src_full_height - height - src_roi_y)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#undef cv_hal_morphInit
+#define cv_hal_morphInit TEGRA_MORPHINIT
+#undef cv_hal_morph
+#define cv_hal_morph TEGRA_MORPHIMPL
+#undef cv_hal_morphFree
+#define cv_hal_morphFree TEGRA_MORPHFREE
+
+
+
+#define TEGRA_RESIZE(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation) \
+( \
+    interpolation == CV_HAL_INTER_LINEAR ?
\ + CV_MAT_DEPTH(src_type) == CV_8U && CAROTENE_NS::isResizeLinearOpenCVSupported(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), ((src_type >> CV_CN_SHIFT) + 1)) && \ + inv_scale_x > 0 && inv_scale_y > 0 && \ + (dst_width - 0.5)/inv_scale_x - 0.5 < src_width && (dst_height - 0.5)/inv_scale_y - 0.5 < src_height && \ + (dst_width + 0.5)/inv_scale_x + 0.5 >= src_width && (dst_height + 0.5)/inv_scale_y + 0.5 >= src_height && \ + std::abs(dst_width / inv_scale_x - src_width) < 0.1 && std::abs(dst_height / inv_scale_y - src_height) < 0.1 ? \ + CAROTENE_NS::resizeLinearOpenCV(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)), \ + CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \ + interpolation == CV_HAL_INTER_AREA ? \ + CV_MAT_DEPTH(src_type) == CV_8U && CAROTENE_NS::isResizeAreaSupported(1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)) && \ + std::abs(dst_width / inv_scale_x - src_width) < 0.1 && std::abs(dst_height / inv_scale_y - src_height) < 0.1 ? \ + CAROTENE_NS::resizeAreaOpenCV(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)), \ + CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \ + /*nearest neighbour interpolation disabled due to rounding accuracy issues*/ \ + /*interpolation == CV_HAL_INTER_NEAREST ? \ + (src_type == CV_8UC1 || src_type == CV_8SC1) && CAROTENE_NS::isResizeNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height), 1) ? \ + CAROTENE_NS::resizeNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, 1), \ + CV_HAL_ERROR_OK : \ + (src_type == CV_8UC3 || src_type == CV_8SC3) && CAROTENE_NS::isResizeNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height), 3) ? \ + CAROTENE_NS::resizeNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, 3), \ + CV_HAL_ERROR_OK : \ + (src_type == CV_8UC4 || src_type == CV_8SC4 || src_type == CV_16UC2 || src_type == CV_16SC2 || src_type == CV_32SC1) && \ + CAROTENE_NS::isResizeNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height), 4) ? \ + CAROTENE_NS::resizeNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, 4), \ + CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED :*/ \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_WARPAFFINE(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue) \ +( \ + interpolation == CV_HAL_INTER_NEAREST ? \ + (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \ + CAROTENE_NS::isWarpAffineNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? 
\
+    CAROTENE_NS::warpAffineNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                           src_data, src_step, \
+                                           std::vector<float>(M+0,M+6).data(), \
+                                           dst_data, dst_step, \
+                                           borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                           (CAROTENE_NS::u8)borderValue[0]), \
+    CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    interpolation == CV_HAL_INTER_LINEAR ? \
+    (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \
+    CAROTENE_NS::isWarpAffineLinearSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? \
+    CAROTENE_NS::warpAffineLinear(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                  src_data, src_step, \
+                                  std::vector<float>(M+0,M+6).data(), \
+                                  dst_data, dst_step, \
+                                  borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                  (CAROTENE_NS::u8)borderValue[0]), \
+    CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#define TEGRA_WARPPERSPECTIVE(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue) \
+( \
+    interpolation == CV_HAL_INTER_NEAREST ? \
+    (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \
+    CAROTENE_NS::isWarpPerspectiveNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? \
+    CAROTENE_NS::warpPerspectiveNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                                src_data, src_step, \
+                                                std::vector<float>(M+0,M+9).data(), \
+                                                dst_data, dst_step, \
+                                                borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                                (CAROTENE_NS::u8)borderValue[0]), \
+    CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    interpolation == CV_HAL_INTER_LINEAR ? \
+    (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \
+    CAROTENE_NS::isWarpPerspectiveLinearSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? \
+    CAROTENE_NS::warpPerspectiveLinear(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                       src_data, src_step, \
+                                       std::vector<float>(M+0,M+9).data(), \
+                                       dst_data, dst_step, \
+                                       borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                       (CAROTENE_NS::u8)borderValue[0]), \
+    CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#undef cv_hal_resize
+#define cv_hal_resize TEGRA_RESIZE
+//warpAffine/warpPerspective disabled due to rounding accuracy issue
+//#undef cv_hal_warpAffine
+//#define cv_hal_warpAffine TEGRA_WARPAFFINE
+//#undef cv_hal_warpPerspective
+//#define cv_hal_warpPerspective TEGRA_WARPPERSPECTIVE
+
+
+#define TegraCvtColor_Invoker(name, func, ...) \
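+/* Expands to a cv::ParallelLoopBody whose operator() hands the row range   \
+   [range.start, range.end) to the named CAROTENE_NS row function, so the   \
+   cvtColor paths below can be striped across threads by parallel_for_. */  \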
+class TegraCvtColor_##name##_Invoker : public cv::ParallelLoopBody \
+{ \
+public: \
+    TegraCvtColor_##name##_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, int height_) : \
+        cv::ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), width(width_), height(height_) {} \
+    virtual void operator()(const cv::Range& range) const \
+    { \
+        CAROTENE_NS::func(CAROTENE_NS::Size2D(width, range.end-range.start), __VA_ARGS__); \
+    } \
+private: \
+    const uchar * src_data; \
+    size_t src_step; \
+    uchar * dst_data; \
+    size_t dst_step; \
+    int width, height; \
+    const TegraCvtColor_##name##_Invoker& operator= (const TegraCvtColor_##name##_Invoker&); \
+};
+
+TegraCvtColor_Invoker(rgb2bgr, rgb2bgr, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgb2bgrx, rgb2bgrx, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgb2rgbx, rgb2rgbx, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2bgr, rgbx2bgr, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2rgb, rgbx2rgb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2bgrx, rgbx2bgrx, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+#define TEGRA_CVTBGRTOBGR(src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue) \
+( \
+    depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
+    scn == 3 ? \
+    dcn == 3 ? \
+    swapBlue ? \
+    parallel_for_(Range(0, height), \
+                  TegraCvtColor_rgb2bgr_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                  (width * height) / static_cast<double>(1<<16)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    dcn == 4 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2bgrx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2rgbx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    scn == 4 ? \
+    dcn == 3 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2bgr_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2rgb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    dcn == 4 ? \
+    swapBlue ? \
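+    /* 4-channel to 4-channel with R and B swapped (e.g. RGBA -> BGRA) */ \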
+    parallel_for_(Range(0, height), \
+                  TegraCvtColor_rgbx2bgrx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                  (width * height) / static_cast<double>(1<<16)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+TegraCvtColor_Invoker(rgb2bgr565, rgb2bgr565, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgb2rgb565, rgb2rgb565, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2bgr565, rgbx2bgr565, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2rgb565, rgbx2rgb565, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+#define TEGRA_CVTBGRTOBGR565(src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits) \
+( \
+    greenBits == 6 && CAROTENE_NS::isSupportedConfiguration() ? \
+    scn == 3 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2bgr565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2rgb565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    scn == 4 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2bgr565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2rgb565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+TegraCvtColor_Invoker(rgb2gray, rgb2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(bgr2gray, bgr2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2gray, rgbx2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(bgrx2gray, bgrx2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+#define TEGRA_CVTBGRTOGRAY(src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue) \
+( \
+    depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
+    scn == 3 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgr2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    scn == 4 ? \
+    (swapBlue ? \
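+     /* the *2gray invokers above fix COLOR_SPACE_BT601, i.e. BT.601 luma weights */ \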
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgrx2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+TegraCvtColor_Invoker(gray2rgb, gray2rgb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(gray2rgbx, gray2rgbx, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+#define TEGRA_CVTGRAYTOBGR(src_data, src_step, dst_data, dst_step, width, height, depth, dcn) \
+( \
+    depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
+    dcn == 3 ? \
+    parallel_for_(Range(0, height), \
+                  TegraCvtColor_gray2rgb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                  (width * height) / static_cast<double>(1<<16)), \
+    CV_HAL_ERROR_OK : \
+    dcn == 4 ? \
+    parallel_for_(Range(0, height), \
+                  TegraCvtColor_gray2rgbx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                  (width * height) / static_cast<double>(1<<16)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+TegraCvtColor_Invoker(rgb2ycrcb, rgb2ycrcb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(bgr2ycrcb, bgr2ycrcb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2ycrcb, rgbx2ycrcb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(bgrx2ycrcb, bgrx2ycrcb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+#define TEGRA_CVTBGRTOYUV(src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr) \
+( \
+    isCbCr && depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
+    scn == 3 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgr2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    scn == 4 ? \
+    (swapBlue ? \
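+     /* reached only when isCbCr holds: carotene accelerates the YCrCb variant only */ \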
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgrx2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+TegraCvtColor_Invoker(rgb2hsv, rgb2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 180)
+TegraCvtColor_Invoker(bgr2hsv, bgr2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 180)
+TegraCvtColor_Invoker(rgbx2hsv, rgbx2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 180)
+TegraCvtColor_Invoker(bgrx2hsv, bgrx2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 180)
+TegraCvtColor_Invoker(rgb2hsvf, rgb2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 256)
+TegraCvtColor_Invoker(bgr2hsvf, bgr2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 256)
+TegraCvtColor_Invoker(rgbx2hsvf, rgbx2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 256)
+TegraCvtColor_Invoker(bgrx2hsvf, bgrx2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 256)
+#define TEGRA_CVTBGRTOHSV(src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV) \
+( \
+    isHSV && depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
+    scn == 3 ? \
+    (swapBlue ? \
+     isFullRange ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     isFullRange ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgr2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgr2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    scn == 4 ? \
+    (swapBlue ? \
+     isFullRange ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     isFullRange ? \
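+     /* the *hsvf invokers quantize hue to 0..255 (full range), the others to 0..179 */ \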
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgrx2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgrx2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#define TEGRA_CVT2PYUVTOBGR(src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx) \
+( \
+    CAROTENE_NS::isSupportedConfiguration() ? \
+    dcn == 3 ? \
+    uIdx == 0 ? \
+    (swapBlue ? \
+     CAROTENE_NS::yuv420i2rgb(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                              src_data, src_step, \
+                              src_data + src_step * dst_height, src_step, \
+                              dst_data, dst_step) : \
+     CAROTENE_NS::yuv420i2bgr(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                              src_data, src_step, \
+                              src_data + src_step * dst_height, src_step, \
+                              dst_data, dst_step)), \
+    CV_HAL_ERROR_OK : \
+    uIdx == 1 ? \
+    (swapBlue ? \
+     CAROTENE_NS::yuv420sp2rgb(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                               src_data, src_step, \
+                               src_data + src_step * dst_height, src_step, \
+                               dst_data, dst_step) : \
+     CAROTENE_NS::yuv420sp2bgr(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                               src_data, src_step, \
+                               src_data + src_step * dst_height, src_step, \
+                               dst_data, dst_step)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    dcn == 4 ? \
+    uIdx == 0 ? \
+    (swapBlue ? \
+     CAROTENE_NS::yuv420i2rgbx(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                               src_data, src_step, \
+                               src_data + src_step * dst_height, src_step, \
+                               dst_data, dst_step) : \
+     CAROTENE_NS::yuv420i2bgrx(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                               src_data, src_step, \
+                               src_data + src_step * dst_height, src_step, \
+                               dst_data, dst_step)), \
+    CV_HAL_ERROR_OK : \
+    uIdx == 1 ? \
+    (swapBlue ? \
+     CAROTENE_NS::yuv420sp2rgbx(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                src_data, src_step, \
+                                src_data + src_step * dst_height, src_step, \
+                                dst_data, dst_step) : \
+     CAROTENE_NS::yuv420sp2bgrx(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                src_data, src_step, \
+                                src_data + src_step * dst_height, src_step, \
+                                dst_data, dst_step)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#undef cv_hal_cvtBGRtoBGR
+#define cv_hal_cvtBGRtoBGR TEGRA_CVTBGRTOBGR
+#undef cv_hal_cvtBGRtoBGR5x5
+#define cv_hal_cvtBGRtoBGR5x5 TEGRA_CVTBGRTOBGR565
+#undef cv_hal_cvtBGRtoGray
+#define cv_hal_cvtBGRtoGray TEGRA_CVTBGRTOGRAY
+#undef cv_hal_cvtGraytoBGR
+#define cv_hal_cvtGraytoBGR TEGRA_CVTGRAYTOBGR
+#undef cv_hal_cvtBGRtoYUV
+#define cv_hal_cvtBGRtoYUV TEGRA_CVTBGRTOYUV
+#undef cv_hal_cvtBGRtoHSV
+#define cv_hal_cvtBGRtoHSV TEGRA_CVTBGRTOHSV
+#undef cv_hal_cvtTwoPlaneYUVtoBGR
+#define cv_hal_cvtTwoPlaneYUVtoBGR TEGRA_CVT2PYUVTOBGR
+
+#endif // OPENCV_IMGPROC_HAL_INTERFACE_H
+
+#endif
diff --git a/3rdparty/carotene/include/carotene/definitions.hpp b/3rdparty/carotene/include/carotene/definitions.hpp
new file mode 100644
index 0000000000..124a674d61
--- /dev/null
+++ b/3rdparty/carotene/include/carotene/definitions.hpp
@@ -0,0 +1,47 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#ifndef CAROTENE_DEFINITIONS_HPP +#define CAROTENE_DEFINITIONS_HPP + +#ifndef CAROTENE_NS +#define CAROTENE_NS carotene +#endif + +#endif diff --git a/3rdparty/carotene/include/carotene/functions.hpp b/3rdparty/carotene/include/carotene/functions.hpp new file mode 100644 index 0000000000..76d1328194 --- /dev/null +++ b/3rdparty/carotene/include/carotene/functions.hpp @@ -0,0 +1,2492 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_FUNCTIONS_HPP
+#define CAROTENE_FUNCTIONS_HPP
+
+#include <carotene/definitions.hpp>
+#include <carotene/types.hpp>
+
+namespace CAROTENE_NS {
+    /* If this returns false, none of the functions will work. */
+    bool isSupportedConfiguration();
+
+    /*
+        For each point `p` within `size`, do:
+        dst[p] = src0[p] + src1[p]
+    */
+    void add(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             u8 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const s8 *src0Base, ptrdiff_t src0Stride,
+             const s8 *src1Base, ptrdiff_t src1Stride,
+             s8 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const s16 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u16 * src0Base, ptrdiff_t src0Stride,
+             const u16 * src1Base, ptrdiff_t src1Stride,
+             u16 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const s32 * src0Base, ptrdiff_t src0Stride,
+             const s32 * src1Base, ptrdiff_t src1Stride,
+             s32 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u32 * src0Base, ptrdiff_t src0Stride,
+             const u32 * src1Base, ptrdiff_t src1Stride,
+             u32 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const f32 * src0Base, ptrdiff_t src0Stride,
+             const f32 * src1Base, ptrdiff_t src1Stride,
+             f32 * dstBase, ptrdiff_t dstStride);
+
+    /*
+        For each point `p` within `size`, do:
+        dst[p] = src0[p] - src1[p]
+    */
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             u8 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             f32 *dstBase, ptrdiff_t dstStride);
+
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void
sub(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] * alpha + src1[p] * beta + gamma + */ + void addWeighted(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + /* + For each point `p` within `size`, do: + dst[p] = min(src0[p], src1[p]) + */ + void min(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride); + + void 
min(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = max(src0[p], src1[p]) + */ + void max(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] * src1[p] * scale + + NOTE: ROUND_TO_ZERO convert policy is used + */ + void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f64 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] * scale / src1[p] + + NOTE: ROUND_TO_ZERO convert policy is used + 
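+
+        e.g. a saturating per-pixel ratio of two u8 images -- a sketch with
+        placeholder buffer names, assuming the saturating CONVERT_POLICY
+        enumerator from types.hpp:
+
+          CAROTENE_NS::div(CAROTENE_NS::Size2D(width, height),
+                           numerator, numStride,
+                           denominator, denStride,
+                           ratio, ratioStride,
+                           255.0f, CAROTENE_NS::CONVERT_POLICY_SATURATE);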
*/ + void div(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale); + + /* + For each point `p` within `size`, do: + dst[p] = scale / src[p] + + NOTE: ROUND_TO_ZERO convert policy is used + */ + void reciprocal(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale); + + /* + For each point `p` within `size`, set `dst[p]` to the median + of `src[p]` and the 8 points around it. If `srcMargin` is + zero on any side, get the neighbors on that side by replicating + the edge. + */ + bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels); + void medianFilter3x3(const Size2D &size, u32 numChannels, + const u8 *srcBase, ptrdiff_t srcStride, + const Margin &srcMargin, + u8 *dstBase, ptrdiff_t dstStride); + + /* + Apply a half Gaussian filter + half Scale, as one level of a Gaussian + pyramid. For all `p` within `dstSize`, set `dst[p]` to `f[2 * p]`, where + `f` is an image of size srcSize obtained by filtering src with the 5x5 + Gaussian kernel ([1 4 6 4 1]'*[1 4 6 4 1]/256) using the border mode + passed in, and round-to-zero rounding. + dstSize must be (srcSize.width / 2, srcSize.height / 2), rounded by any method. 
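+        E.g. for a 640x480 source the destination is 320x240, while a
+        99x99 source may map to either 49x49 or 50x50.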
+ */ + bool isGaussianPyramidDownRTZSupported(const Size2D &srcSize, const Size2D &dstSize, BORDER_MODE border); + void gaussianPyramidDownRTZ(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + /* Same as above, but uses round-half-up rounding. */ + + bool isGaussianPyramidDownU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidDown(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn); + + + bool isGaussianPyramidDownS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidDown(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn); + + bool isGaussianPyramidDownF32Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidDown(const Size2D &srcSize, + const f32 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + f32 *dstBase, ptrdiff_t dstStride, u8 cn); + + bool isGaussianPyramidUpU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidUp(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn); + + bool isGaussianPyramidUpS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidUp(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? trueValue : falseValue + */ + void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 trueValue = 255, u8 falseValue = 0); + + /* + For each point `p` within `size`, do: + dst[p] = lower <= src[p] && src[p] <= upper ? trueValue : falseValue + */ + void thresholdRange(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 lowerThreshold, u8 upperThreshold, + u8 trueValue = 255, u8 falseValue = 0); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? value : 0 + */ + void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value); + + void thresholdBinary(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value); + + void thresholdBinary(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value); + + void thresholdBinary(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value); + + void thresholdBinary(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value); + + void thresholdBinary(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? 
0 : value + */ + void thresholdBinaryInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value); + + void thresholdBinaryInv(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value); + + void thresholdBinaryInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value); + + void thresholdBinaryInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value); + + void thresholdBinaryInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value); + + void thresholdBinaryInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? threshold : src[p] + */ + void thresholdTruncate(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold); + + void thresholdTruncate(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold); + + void thresholdTruncate(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold); + + void thresholdTruncate(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold); + + void thresholdTruncate(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold); + + void thresholdTruncate(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? src[p] : 0 + */ + void thresholdToZero(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold); + + void thresholdToZero(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold); + + void thresholdToZero(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold); + + void thresholdToZero(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold); + + void thresholdToZero(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold); + + void thresholdToZero(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? 
0 : src[p] + */ + void thresholdToZeroInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold); + + void thresholdToZeroInv(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold); + + void thresholdToZeroInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold); + + void thresholdToZeroInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold); + + void thresholdToZeroInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold); + + void thresholdToZeroInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold); + + /* + For each point `p` within `size`, do: + dst[p] = abs(src0[p] - src1[p]) + */ + void absDiff(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u16 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = ~src[p] + */ + void bitwiseNot(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] & src1[p] + */ + void bitwiseAnd(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] | src1[p] + */ + void bitwiseOr(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] ^ src1[p] + */ + void bitwiseXor(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] == src1[p] ? 
255 : 0 + */ + void cmpEQ(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] != src1[p] ? 255 : 0 + */ + void cmpNE(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] > src1[p] ? 
255 : 0 + */ + void cmpGT(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] >= src1[p] ? 255 : 0 + */ + void cmpGE(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + Calculates dot product + */ + f64 dotProduct(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f64 dotProduct(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride); + + f64 dotProduct(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + /* + Calculates mean and stddev + */ + void meanStdDev(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev); + + void meanStdDev(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev); + + /* + For each point `p` within `size`, do: + dst[p] = sqrt(src0[p] ^ 2 + src1[p] ^ 2) + */ + void magnitude(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride); + + void magnitude(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + f32 *dstBase, ptrdiff_t dstStride); + + /* + Compute an integral image + */ + void integral(const Size2D 
&size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumBase, ptrdiff_t sumStride); + + /* + Compute an integral of squared image values + */ + void sqrIntegral(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sqsumBase, ptrdiff_t sqsumStride); + + /* + Find the min and max values among all pixels `p` within `src` + */ + void minMaxVals(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 * minVal, u8 * maxVal); + + void minMaxVals(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 * minVal, s16 * maxVal); + + void minMaxVals(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 * minVal, u16 * maxVal); + + void minMaxVals(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 * minVal, s32 * maxVal); + + void minMaxVals(const Size2D &size, + const u32 *srcBase, ptrdiff_t srcStride, + u32 * minVal, u32 * maxVal); + + /* + Fill the arrays `minLocPtr`, `maxLocPtr` with the locations of + the given values `minVal`, `maxVal` + */ + void fillMinMaxLocs(const Size2D & size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const u32 *srcBase, ptrdiff_t srcStride, + u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + /* + Find the min and max values among all pixels `p` within `src`, and their first occurrences + */ + void minMaxLoc(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 &minVal, size_t &minCol, size_t &minRow, + s8 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 &minVal, size_t &minCol, size_t &minRow, + u8 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 &minVal, size_t &minCol, size_t &minRow, + s16 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 &minVal, size_t &minCol, size_t &minRow, + u16 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 &minVal, size_t &minCol, size_t &minRow, + s32 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + const u8 * maskBase, ptrdiff_t maskStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol, 
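+
+    /*
+        minMaxVals and fillMinMaxLocs above compose into a two-pass search:
+        the first pass finds the extreme values, the second gathers where
+        they occur, up to the given capacities. A sketch (buffer sizes and
+        names are illustrative, not part of this interface):
+
+            u8 lo = 0, hi = 0;
+            minMaxVals(roi, img, imgStride, &lo, &hi);
+
+            size_t minLocs[64], maxLocs[64];
+            s32 nMin = 0, nMax = 0;
+            fillMinMaxLocs(roi, img, imgStride,
+                           lo, minLocs, nMin, 64,
+                           hi, maxLocs, nMax, 64);
+    */
+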
size_t &maxRow); + + /* + For each point `p` within `size`, do: + dst[p] += src[p] + */ + void accumulate(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = (dst[p] + ((src[p] ^ 2) >> shift)) + */ + void accumulateSquare(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + u32 shift); + + /* + For each point `p` within `size`, do: + dst[p] = (1 - alpha) * dst[p] + alpha * src[p] + */ + void accumulateWeighted(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + f32 alpha); + + /* + orient[p] = atan2(src0[p], src1[p]) + */ + void phase(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + u8 * orientBase, ptrdiff_t orientStride); + + void phase(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * orientBase, ptrdiff_t orientStride, + f32 scale); + + /* + Combine 2 planes to a single one + */ + void combine2(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void combine2(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void combine2(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void combine2(const Size2D &size, + const s64 * src0Base, ptrdiff_t src0Stride, + const s64 * src1Base, ptrdiff_t src1Stride, + s64 * dstBase, ptrdiff_t dstStride); + + /* + Combine 3 planes to a single one + */ + void combine3(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + const u8 * src2Base, ptrdiff_t src2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void combine3(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + const u16 * src2Base, ptrdiff_t src2Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void combine3(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + const s32 * src2Base, ptrdiff_t src2Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void combine3(const Size2D &size, + const s64 * src0Base, ptrdiff_t src0Stride, + const s64 * src1Base, ptrdiff_t src1Stride, + const s64 * src2Base, ptrdiff_t src2Stride, + s64 * dstBase, ptrdiff_t dstStride); + + /* + Combine 4 planes to a single one + */ + void combine4(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + const u8 * src2Base, ptrdiff_t src2Stride, + const u8 * src3Base, ptrdiff_t src3Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void combine4(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + const u16 * src2Base, ptrdiff_t src2Stride, + const u16 * src3Base, ptrdiff_t src3Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void combine4(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + const s32 * src2Base, ptrdiff_t src2Stride, + const s32 * src3Base, ptrdiff_t src3Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void combine4(const 
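+
+    /*
+        accumulateWeighted is a running exponential average, the usual
+        building block for background models: with alpha = 0.05f each new
+        frame contributes 5% to the accumulator. Sketch (grabFrame and the
+        buffers are hypothetical):
+
+            while (grabFrame(frame))
+                accumulateWeighted(roi, frame, frameStride,
+                                   background, backgroundStride, 0.05f);
+    */
+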
Size2D &size, + const s64 * src0Base, ptrdiff_t src0Stride, + const s64 * src1Base, ptrdiff_t src1Stride, + const s64 * src2Base, ptrdiff_t src2Stride, + const s64 * src3Base, ptrdiff_t src3Stride, + s64 * dstBase, ptrdiff_t dstStride); + + /* + Combine 3 planes to YUYV one + */ + void combineYUYV(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Combine 3 planes to UYVY one + */ + void combineUYVY(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to grayscale one + */ + void rgb2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to grayscale one + */ + void rgbx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGR image to grayscale one + */ + void bgr2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGRX image to grayscale one + */ + void bgrx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert grayscale image to RGB one + */ + void gray2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert grayscale image to RGBX one + */ + void gray2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to RGBX + */ + void rgb2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to RGB + */ + void rgbx2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to BGR + */ + void rgb2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to BGRX + */ + void rgbx2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to BGR + */ + void rgbx2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to BGRX + */ + void rgb2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to HSV + */ + void rgb2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert RGBX image to HSV + */ + void rgbx2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert BGR image to HSV + */ + void bgr2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert BGRX image to HSV + */ + void bgrx2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert RGBX image to BGR565 + RRRRrrrr GGGGgggg BBBBbbbb XXXXxxxx -> 
GggBBBBb RRRRrGGG + */ + void rgbx2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to BGR565 + RRRRrrrr GGGGgggg BBBBbbbb -> GggBBBBb RRRRrGGG + */ + void rgb2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to RGB565 + RRRRrrrr GGGGgggg BBBBbbbb XXXXxxxx -> GggRRRRr BBBBbGGG + */ + void rgbx2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to RGB565 + RRRRrrrr GGGGgggg BBBBbbbb -> GggRRRRr BBBBbGGG + */ + void rgb2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to YCrCb + */ + void rgb2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to YCrCb + */ + void rgbx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGR image to YCrCb + */ + void bgr2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGRX image to YCrCb + */ + void bgrx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to RGB + */ + void yuv420sp2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to RGBX + */ + void yuv420sp2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to RGB + */ + void yuv420i2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to RGBX + */ + void yuv420i2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to BGR + */ + void yuv420sp2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to BGRX + */ + void yuv420sp2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to BGR + */ + void yuv420i2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to BGRX + */ + void yuv420i2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] << shift + */ + void lshift(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + u32 shift); + + /* + For each point `p` within `size`, do sign-extending shift: + dst[p] = src[p] >> shift + */ + void rshift(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 shift, CONVERT_POLICY cpolicy); + + /* + For each point `p` within `size`, set 
`dst[p]` to the average + of `src[p]` and the 8 (or 24 for blur5x5) points around it + NOTE: the function cannot operate inplace + */ + bool isBlur3x3Supported(const Size2D &size, BORDER_MODE border); + void blur3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + bool isBlurU8Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void blur3x3(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue); + + void blur5x5(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue); + + /* + For each point `p` within `size`, set `dst[p]` to the average + of `src[p]` and the 8 points around it + NOTE: the function can operate inplace + */ + bool isBlurF32Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void blur3x3(const Size2D &size, s32 cn, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, f32 borderValue, Margin borderMargin); + + bool isBlurS32Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void blur3x3(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin); + + /* + For each point `p` within `size`, set `dst[p]` to the Gaussian smoothing + of `src[p]` and the 8 (24 for the 5x5 version) points around it + NOTE: the function cannot operate inplace + */ + bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border); + void gaussianBlur3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin = Margin()); + void gaussianBlur3x3Margin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue, Margin borderMargin = Margin()); + + bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void gaussianBlur5x5(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue, Margin borderMargin); + + void gaussianBlur5x5(const Size2D &size, s32 cn, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u16 borderValue, Margin borderMargin); + + void gaussianBlur5x5(const Size2D &size, s32 cn, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s16 borderValue, Margin borderMargin); + + void gaussianBlur5x5(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin); + + /* + Calculation of Sobel operator + NOTE: the function cannot operate inplace + */ + bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin = Margin()); + void Sobel3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + s32 dx, s32 dy, + BORDER_MODE border, u8 borderValue, Margin borderMargin = Margin()); + + /* + Calculation of Sobel operator for f32 data + NOTE: the function 
can operate inplace + */ + bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy); + void Sobel3x3(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + s32 dx, s32 dy, + BORDER_MODE borderType, f32 borderValue); + + /* + Calculation of Scharr operator + NOTE: the function cannot operate inplace + */ + bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin = Margin()); + void Scharr3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + s32 dx, s32 dy, + BORDER_MODE borderType, u8 borderValue, Margin borderMargin = Margin()); + + void ScharrDeriv(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + /* + Calculation of generic separable filtering operator + rowFilter/colFilter define filter weights + 0 - predefined 1 2 1 + 1 - predefined -1 0 1 + 2 - predefined 1 -2 1 + 3 - weights provided as xw/yw + */ + bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin = Margin()); + void SeparableFilter3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw, + BORDER_MODE border, u8 borderValue, Margin borderMargin = Margin()); + + /* + Extract a single plane from 2 channel image + */ + void extract2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi); + + /* + Extract a single plane from 3 channel image + */ + void extract3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi); + + /* + Extract a single plane from 4 channel image + */ + void extract4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi); + + /* + Split 2 channel image to separate planes + */ + void split2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst0Base, ptrdiff_t dst0Stride, + u8 * dst1Base, ptrdiff_t dst1Stride); + + void split2(const Size2D &size, + const u16* srcBase, ptrdiff_t srcStride, + u16 * dst0Base, ptrdiff_t dst0Stride, + u16 * dst1Base, ptrdiff_t dst1Stride); + + void split2(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dst0Base, ptrdiff_t dst0Stride, + s32 * dst1Base, ptrdiff_t dst1Stride); + + void split2(const Size2D &size, + const s64 * srcBase, ptrdiff_t srcStride, + s64 * dst0Base, ptrdiff_t dst0Stride, + s64 * dst1Base, ptrdiff_t dst1Stride); + + /* + Split 3 channel image to separate planes + */ + void split3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst0Base, ptrdiff_t dst0Stride, + u8 * dst1Base, ptrdiff_t dst1Stride, + u8 * dst2Base, ptrdiff_t dst2Stride); + + void split3(const Size2D &size, + const u16* srcBase, ptrdiff_t srcStride, + u16 * dst0Base, ptrdiff_t dst0Stride, + u16 * dst1Base, ptrdiff_t dst1Stride, + u16 * dst2Base, ptrdiff_t dst2Stride); + + void split3(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dst0Base, ptrdiff_t dst0Stride, + s32 * dst1Base, ptrdiff_t dst1Stride, + s32 * dst2Base, ptrdiff_t dst2Stride); + + void split3(const Size2D &size, + const s64 * srcBase, ptrdiff_t srcStride, + s64 * dst0Base, ptrdiff_t dst0Stride, + s64 * dst1Base, ptrdiff_t dst1Stride, + s64 * dst2Base, ptrdiff_t dst2Stride); + 
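+
+    /*
+        Most non-trivial kernels in this header pair an is*Supported
+        predicate with the kernel itself; callers are expected to test the
+        predicate first and fall back to a generic implementation when it
+        returns false. Sketch for the u8 Sobel above (src is u8, dst is s16;
+        fallbackSobel is hypothetical):
+
+            if (isSobel3x3Supported(roi, BORDER_MODE_REPLICATE, 1, 0))
+                Sobel3x3(roi, src, srcStride, dst, dstStride,
+                         1, 0, BORDER_MODE_REPLICATE, 0, Margin());
+            else
+                fallbackSobel(roi, src, srcStride, dst, dstStride);
+    */
+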
+ /* + Split 4 channel image to separate planes + */ + void split4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst0Base, ptrdiff_t dst0Stride, + u8 * dst1Base, ptrdiff_t dst1Stride, + u8 * dst2Base, ptrdiff_t dst2Stride, + u8 * dst3Base, ptrdiff_t dst3Stride); + + void split4(const Size2D &size, + const u16* srcBase, ptrdiff_t srcStride, + u16 * dst0Base, ptrdiff_t dst0Stride, + u16 * dst1Base, ptrdiff_t dst1Stride, + u16 * dst2Base, ptrdiff_t dst2Stride, + u16 * dst3Base, ptrdiff_t dst3Stride); + + void split4(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dst0Base, ptrdiff_t dst0Stride, + s32 * dst1Base, ptrdiff_t dst1Stride, + s32 * dst2Base, ptrdiff_t dst2Stride, + s32 * dst3Base, ptrdiff_t dst3Stride); + + void split4(const Size2D &size, + const s64 * srcBase, ptrdiff_t srcStride, + s64 * dst0Base, ptrdiff_t dst0Stride, + s64 * dst1Base, ptrdiff_t dst1Stride, + s64 * dst2Base, ptrdiff_t dst2Stride, + s64 * dst3Base, ptrdiff_t dst3Stride); + + /* + Split 4 channel image to 3 channel image and 1 channel image + */ + void split4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst3Base, ptrdiff_t dst3Stride, + u8 * dst1Base, ptrdiff_t dst1Stride); + + /* + Flip image using specified flip mode + */ + bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize); + void flip(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode, u32 elemSize); + + /* + For each point `p` within `size`, set `dst[p]` to the minimum (erode) + or maximum (dilate) of `src[p]` and the 8 points around it + NOTE: the function cannot operate inplace + */ + bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border); + + void erode3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + void dilate3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + void erode(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin); + + void dilate(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin); + + /* + Resize a source image using "nearest neighbor" interpolation type + + wr = src_width / dst_width + hr = src_height / dst_height + */ + bool isResizeNearestNeighborSupported(const Size2D &ssize, u32 elemSize); + void resizeNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 elemSize); + + /* + Resize a source image using "area" interpolation type + + wr = src_width / dst_width + hr = src_height / dst_height + */ + bool isResizeAreaSupported(f32 wr, f32 hr, u32 channels); + void resizeAreaOpenCV(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels); + void resizeArea(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, 
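+
+    /*
+        The wr/hr arguments duplicate information already present in
+        ssize/dsize: they are the precomputed scale ratios defined above,
+        e.g.
+
+            f32 wr = (f32)ssize.width  / dsize.width;
+            f32 hr = (f32)ssize.height / dsize.height;
+
+        so a 2x downscale in both axes uses wr = hr = 2.0f.
+    */
+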
u32 channels); + + /* + Resize a source image using "linear" interpolation type + + wr = src_width / dst_width + hr = src_height / dst_height + */ + bool isResizeLinearOpenCVSupported(const Size2D &ssize, const Size2D &dsize, u32 channels); + bool isResizeLinearSupported(const Size2D &ssize, const Size2D &dsize, + f32 wr, f32 hr, u32 channels); + void resizeLinearOpenCV(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels); + void resizeLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels); + + /* + For each point `p` within `size`, set `dst[p]` to the convolution + of `src[p]` and the (ksize * ksize - 1) points around it + The function uses OpenVX semantics (so, in order to use this function + in OpenCV you should flip the kernel in both directions) + NOTE: the function cannot operate inplace + */ + bool isConvolutionSupported(const Size2D &size, const Size2D &ksize, BORDER_MODE border); + void convolution(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue, + const Size2D & ksize, s16 * kernelBase, u32 scale); + + /* + For each point `p` within `dstSize`, do convolution + of the tmpl points and the size*size square of src points starting at `src[p]`. + Src should be of size (dstSize+size-1)*(dstSize+size-1) + NOTE: the function cannot operate inplace + */ + bool isMatchTemplateSupported(const Size2D &tmplSize); + void matchTemplate(const Size2D &srcSize, + const u8 * srcBase, ptrdiff_t srcStride, + const Size2D &tmplSize, + const u8 * tmplBase, ptrdiff_t tmplStride, + f32 * dstBase, ptrdiff_t dstStride, + bool normalize); + + /* + Calculation of Laplacian operator + + 1 1 1 + 1 -8 1 + 1 1 1 + + NOTE: the function cannot operate inplace + */ + bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border); + void Laplacian3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + /* + OpenCV-like calculation of Laplacian operator + + kernel 1 kernel 3 kernel 5 + 0 1 0 2 0 2 1 2 2 2 1 + 1 -4 1 0 -8 0 2 0 -4 0 2 + 0 1 0 2 0 2 2 -4 -12 -4 2 + 2 0 -4 0 2 + 1 2 2 2 1 + + NOTE: the function cannot operate inplace + */ + bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border); + void Laplacian1OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + void Laplacian3OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + void Laplacian5OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + /* + Detect image edges using Canny algorithm + These functions estimate derivatives using the Sobel algorithm + */ + bool isCanny3x3Supported(const Size2D &size); + void Canny3x3L1(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin); + + void Canny3x3L2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin); + + /* + Detect image edges using 
Canny algorithm + These functions don't estimate derivatives themselves and thus require + precomputed derivative estimates instead of the source image + */ + void Canny3x3L1(const Size2D &size, s32 cn, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh); + + void Canny3x3L2(const Size2D &size, s32 cn, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh); + + /* + Performs detection of FAST features + */ + void FAST(const Size2D &size, + u8 *srcBase, ptrdiff_t srcStride, + KeypointStore *keypoints, + u8 threshold, bool nonmax_suppression); + + /* + Remap a source image using a table and the specified + extrapolation method + */ + bool isRemapNearestNeighborSupported(const Size2D &ssize); + void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + bool isRemapLinearSupported(const Size2D &ssize); + void remapLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + /* + Perform an affine transform on an input image + + src_x = dst_x * m[0] + dst_y * m[2] + m[4] + src_y = dst_x * m[1] + dst_y * m[3] + m[5] + */ + bool isWarpAffineNearestNeighborSupported(const Size2D &ssize); + void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + bool isWarpAffineLinearSupported(const Size2D &ssize); + void warpAffineLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + /* + Perform a perspective transform on an input image + + src_x = dst_x * m[0] + dst_y * m[3] + m[6] + src_y = dst_x * m[1] + dst_y * m[4] + m[7] + w = dst_x * m[2] + dst_y * m[5] + m[8] + + src_x = w == 0 ? 0 : src_x / w + src_y = w == 0 ? 
0 : src_y / w + */ + bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize); + void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + bool isWarpPerspectiveLinearSupported(const Size2D &ssize); + void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + /* + Convert data from source to destination type + */ + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const 
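+
+    /*
+        The m[] coefficients above describe the inverse (destination to
+        source) mapping, with the coefficients of dst_x stored first. For
+        warpAffine, a transform that reads each destination pixel from
+        (dst_x + tx, dst_y + ty) in the source is therefore (tx/ty are
+        hypothetical):
+
+            f32 m[6] = { 1.0f, 0.0f,    // m[0], m[1]
+                         0.0f, 1.0f,    // m[2], m[3]
+                         tx,   ty };    // m[4], m[5]
+
+        since src_x = dst_x * m[0] + dst_y * m[2] + m[4] = dst_x + tx.
+    */
+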
Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + /* + Convert data from source to destination type with scaling + dst = saturate_cast(src * alpha + beta) + */ + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t 
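+
+    /*
+        A common use of convertScale is normalisation, e.g. mapping u8 data
+        into [0, 1] floats via the u8 -> f32 overload above:
+
+            convertScale(roi, src8, src8Stride,
+                         dstF32, dstF32Stride, 1.0 / 255.0, 0.0);
+
+        Since alpha and beta are f64, the same overloads also cover
+        zero-centering and other affine remaps.
+    */
+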
srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + /* + Reduce a matrix to a vector by applying the given operation to each column + */ + void reduceColSum(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase); + + void reduceColMax(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase); + + void reduceColMin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase); + + void reduceColSum(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase); + + void reduceColMax(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase); + + void reduceColMin(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase); + + /* + For each point `p` within `size`, do: + dst[p] = (rng1[p] <= src[p] && src[p] <= rng2[p]) ? 
255 : 0 + */ + + void inRange(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + const u8 * rng1Base, ptrdiff_t rng1Stride, + const u8 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + const s8 * rng1Base, ptrdiff_t rng1Stride, + const s8 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + const u16 * rng1Base, ptrdiff_t rng1Stride, + const u16 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + const s16 * rng1Base, ptrdiff_t rng1Stride, + const s16 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + const s32 * rng1Base, ptrdiff_t rng1Stride, + const s32 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + const f32 * rng1Base, ptrdiff_t rng1Stride, + const f32 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Count the number of non-zero elements + */ + s32 countNonZero(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const f64 * srcBase, ptrdiff_t srcStride); + + /* + Calculates sum of all image pixel values and squared values + */ + bool isSumSupported(u32 channels); + + void sum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumdst, u32 channels); + + void sum(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, u32 channels); + + bool isSqsumSupported(u32 channels); + + void sqsum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, f64 * sqsumdst, u32 channels); + + /* + Calculates norm + */ + s32 normInf(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + f32 normInf(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride); + + f64 normL1(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + f64 normL1(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + s32 normL2(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 normL2(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const s16 * 
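+
+    /*
+        inRange and countNonZero combine naturally into a "how many pixels
+        fall inside this band" query. Note that rng1/rng2 are per-pixel
+        bounds, so constant thresholds require pre-filled bound images
+        (loImg/hiImg below are illustrative):
+
+            inRange(roi, img, imgStride,
+                    loImg, loStride, hiImg, hiStride,
+                    mask, maskStride);
+            s32 hits = countNonZero(roi, mask, maskStride);
+    */
+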
srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + /* + Calculates norm of per element difference + */ + s32 diffNormInf(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f32 diffNormInf(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + s32 diffNormL1(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f64 diffNormL1(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + s32 diffNormL2(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f64 diffNormL2(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + /* + * Pyramidal Lucas-Kanade Optical Flow level processing + */ + void pyrLKOptFlowLevel(const Size2D &size, s32 cn, + const u8 *prevData, ptrdiff_t prevStride, + const s16 *prevDerivData, ptrdiff_t prevDerivStride, + const u8 *nextData, ptrdiff_t nextStride, + u32 ptCount, + const f32 *prevPts, f32 *nextPts, + u8 *status, f32 *err, + const Size2D &winSize, + u32 terminationCount, f64 terminationEpsilon, + u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals, + f32 minEigThreshold); +} + +#endif diff --git a/3rdparty/carotene/include/carotene/types.hpp b/3rdparty/carotene/include/carotene/types.hpp new file mode 100644 index 0000000000..81b03d649a --- /dev/null +++ b/3rdparty/carotene/include/carotene/types.hpp @@ -0,0 +1,125 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#ifndef CAROTENE_TYPES_HPP +#define CAROTENE_TYPES_HPP + +#include +#include +#include + +#ifndef UINT32_MAX + #define UINT32_MAX (4294967295U) +#endif + +namespace CAROTENE_NS { + using std::size_t; + using std::ptrdiff_t; + + typedef int8_t s8; + typedef uint8_t u8; + typedef int16_t s16; + typedef uint16_t u16; + typedef int32_t s32; + typedef uint32_t u32; + typedef float f32; + typedef int64_t s64; + typedef uint64_t u64; + typedef double f64; + + typedef ptrdiff_t stride_t; + + enum CONVERT_POLICY + { + CONVERT_POLICY_WRAP, + CONVERT_POLICY_SATURATE + }; + + enum BORDER_MODE + { + BORDER_MODE_UNDEFINED, + BORDER_MODE_CONSTANT, + BORDER_MODE_REPLICATE, + BORDER_MODE_REFLECT, + BORDER_MODE_REFLECT101, + BORDER_MODE_WRAP + }; + + enum FLIP_MODE + { + FLIP_HORIZONTAL_MODE = 1, + FLIP_VERTICAL_MODE = 2, + FLIP_BOTH_MODE = FLIP_HORIZONTAL_MODE | FLIP_VERTICAL_MODE + }; + + enum COLOR_SPACE + { + COLOR_SPACE_BT601, + COLOR_SPACE_BT709 + }; + + struct Size2D { + Size2D() : width(0), height(0) {} + Size2D(size_t width_, size_t height_) : width(width_), height(height_) {} + + size_t width; + size_t height; + + inline size_t total() const + { + return width * height; + } + }; + + struct Margin { + Margin() : left(0), right(0), top(0), bottom(0) {} + Margin(size_t left_, size_t right_, size_t top_, size_t bottom_) + : left(left_), right(right_), top(top_), bottom(bottom_) {} + + // these are measured in elements + size_t left, right, top, bottom; + }; + + struct KeypointStore { + virtual void push(f32 kpX, f32 kpY, f32 kpSize, f32 kpAngle=-1, f32 kpResponse=0, s32 kpOctave=0, s32 kpClass_id=-1) = 0; + virtual ~KeypointStore() {}; + }; +} + +#endif diff --git a/3rdparty/carotene/src/absdiff.cpp b/3rdparty/carotene/src/absdiff.cpp new file mode 100644 index 0000000000..02008ceb3e --- /dev/null +++ b/3rdparty/carotene/src/absdiff.cpp @@ -0,0 +1,241 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template <typename T> +struct AbsDiff +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, + const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::vec128 & v_dst) const + { + v_dst = internal::vabdq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, + const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::vec64 & v_dst) const + { + v_dst = internal::vabd(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = src0[0] >= src1[0] ? src0[0] - src1[0] : src1[0] - src0[0]; + } +}; + +template <typename T> +struct AbsDiffSigned +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, + const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::vec128 & v_dst) const + { + typename internal::VecTraits<T>::vec128 v_min = internal::vminq(v_src0, v_src1); + typename internal::VecTraits<T>::vec128 v_max = internal::vmaxq(v_src0, v_src1); + v_dst = internal::vqsubq(v_max, v_min); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, + const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::vec64 & v_dst) const + { + typename internal::VecTraits<T>::vec64 v_min = internal::vmin(v_src0, v_src1); + typename internal::VecTraits<T>::vec64 v_max = internal::vmax(v_src0, v_src1); + v_dst = internal::vqsub(v_max, v_min); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = internal::saturate_cast<T>(src0[0] >= src1[0] ? 
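+        /*
+            The widening casts below mirror the vector path above: for signed
+            types the plain difference can overflow (e.g. INT32_MIN minus a
+            positive value), so the scalar tail computes the difference in
+            s64 and saturates, just as the vector code saturates via
+            vqsub(max, min).
+        */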
(s64)src0[0] - src1[0] : (s64)src1[0] - src0[0]); + } +}; + +} // namespace + +#endif + +void absDiff(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiff<u8>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void absDiff(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u16 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiff<u16>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void absDiff(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiffSigned<s8>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void absDiff(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiffSigned<s16>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void absDiff(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + s32 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiffSigned<s32>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void absDiff(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiff<f32>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/accumulate.cpp b/3rdparty/carotene/src/accumulate.cpp new file mode 100644 index 0000000000..ee9ce22d35 --- /dev/null +++ b/3rdparty/carotene/src/accumulate.cpp @@ -0,0 +1,408 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. 
+ * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + + +#include "common.hpp" +#include "vtransform.hpp" + +#include + +namespace CAROTENE_NS { + +void accumulate(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + internal::prefetch(dst + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vld1q_s16(dst + j); + int16x8_t v_dst1 = vld1q_s16(dst + j + 8); + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + v_dst0 = vqaddq_s16(v_dst0, v_src0); + v_dst1 = vqaddq_s16(v_dst1, v_src1); + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src)); + int16x8_t v_dst = vld1q_s16(dst + j); + v_dst = vqaddq_s16(v_dst, v_src16); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + dst[j] = internal::saturate_cast(src[j] + dst[j]); + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +#ifdef CAROTENE_NEON + +namespace { + +template +void accumulateSquareConst(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride) +{ + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + internal::prefetch(dst + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8); + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0); + v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))), + vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0)))); + + v_srclo = vget_low_s16(v_src1); + v_srchi = vget_high_s16(v_src1); + v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))), + vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1)))); + + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); + int16x8_t v_dst = vld1q_s16(dst + j); + int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src); + v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))), + vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst)))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + s32 srcVal = src[j]; + dst[j] = internal::saturate_cast(dst[j] + ((srcVal * srcVal) >> shift)); + } + } +} + +template <> +void accumulateSquareConst<0>(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + internal::prefetch(dst + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8); + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0); + v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))), + vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0)))); + + v_srclo = vget_low_s16(v_src1); + v_srchi = vget_high_s16(v_src1); + v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))), + vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1)))); + + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); + int16x8_t v_dst = vld1q_s16(dst + j); + int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src); + v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))), + vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst)))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + s32 srcVal = src[j]; + dst[j] = internal::saturate_cast(dst[j] + srcVal * srcVal); + } + } +} + +typedef void (* accumulateSquareConstFunc)(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride); + +} // namespace + +#endif + +void accumulateSquare(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + u32 shift) +{ + if (shift >= 16) + { + for (size_t i = 0; i < size.height; ++i) + { + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + std::memset(dst, 0, sizeof(s16) * size.width); + } + return; + } + + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + // this ugly contruction is needed to avoid: + // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant + // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1); + + accumulateSquareConstFunc funcs[16] = + { + accumulateSquareConst<0>, + accumulateSquareConst<1>, + accumulateSquareConst<2>, + accumulateSquareConst<3>, + accumulateSquareConst<4>, + accumulateSquareConst<5>, + accumulateSquareConst<6>, + accumulateSquareConst<7>, + accumulateSquareConst<8>, + accumulateSquareConst<9>, + accumulateSquareConst<10>, + accumulateSquareConst<11>, + accumulateSquareConst<12>, + accumulateSquareConst<13>, + accumulateSquareConst<14>, + accumulateSquareConst<15> + }, func = funcs[shift]; + + func(size, srcBase, srcStride, dstBase, dstStride); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)shift; +#endif +} + +#ifdef CAROTENE_NEON + +namespace { + +struct AccumulateWeightedHalf +{ + typedef u8 type; + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + v_dst = vhaddq_u8(v_src0, v_src1); + } + + void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1, + uint8x8_t & v_dst) const + { + v_dst = 
vhadd_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = ((u16)(src0[0]) + src1[0]) >> 1; + } +}; + +struct AccumulateWeighted +{ + typedef u8 type; + + float alpha, beta; + float32x4_t v_alpha, v_beta; + + explicit AccumulateWeighted(float _alpha) : + alpha(_alpha), beta(1 - _alpha) + { + v_alpha = vdupq_n_f32(alpha); + v_beta = vdupq_n_f32(beta); + } + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p)))); + float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p)))); + uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p)))); + v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p)))); + uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + + v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)); + } + + void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1, + uint8x8_t & v_dst) const + { + uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1); + + float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))); + float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))); + uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + + v_dst = vmovn_u16(_v_dst); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = beta * src1[0] + alpha * src0[0]; + } +}; + +} // namespace + +#endif + +void accumulateWeighted(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + f32 alpha) +{ + if (alpha == 0.0f) + return; + if (alpha == 1.0f) + { + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + std::memcpy(dst, src, sizeof(u8) * size.width); + } + return; + } + + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + // in this case we can use the following scheme: + // dst[p] = (src[p] + dst[p]) >> 1 + // which is faster + if (alpha == 0.5f) + { + internal::vtransform(size, + srcBase, srcStride, + dstBase, dstStride, + dstBase, dstStride, + AccumulateWeightedHalf()); + + return; + } + + internal::vtransform(size, + srcBase, srcStride, + dstBase, dstStride, + dstBase, dstStride, + AccumulateWeighted(alpha)); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)alpha; +#endif +} + +} //namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/add.cpp 
b/3rdparty/carotene/src/add.cpp new file mode 100644 index 0000000000..e8ace53122 --- /dev/null +++ b/3rdparty/carotene/src/add.cpp @@ -0,0 +1,475 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template +struct AddWrap +{ + typedef T type; + + void operator() (const typename internal::VecTraits::vec128 & v_src0, + const typename internal::VecTraits::vec128 & v_src1, + typename internal::VecTraits::vec128 & v_dst) const + { + v_dst = internal::vaddq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits::vec64 & v_src0, + const typename internal::VecTraits::vec64 & v_src1, + typename internal::VecTraits::vec64 & v_dst) const + { + v_dst = internal::vadd(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = (T)((WT)src0[0] + (WT)src1[0]); + } +}; + +template +struct AddSaturate +{ + typedef T type; + + void operator() (const typename internal::VecTraits::vec128 & v_src0, + const typename internal::VecTraits::vec128 & v_src1, + typename internal::VecTraits::vec128 & v_dst) const + { + v_dst = internal::vqaddq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits::vec64 & v_src0, + const typename internal::VecTraits::vec64 & v_src1, + typename internal::VecTraits::vec64 & v_dst) const + { + v_dst = internal::vqadd(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = internal::saturate_cast((WT)src0[0] + (WT)src1[0]); + } +}; + +} // namespace + +#endif + +void add(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u16 * dst = internal::getRowPtr((u16 *)dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16); + uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16); + vst1q_u16(dst + j, vaddl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10))); + vst1q_u16(dst + j + 8, vaddl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10))); + vst1q_u16(dst + j + 16, vaddl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11))); + vst1q_u16(dst + j + 24, vaddl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11))); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src0 = vld1_u8(src0 + j); + uint8x8_t v_src1 = vld1_u8(src1 + j); + vst1q_u16(dst + j, vaddl_u8(v_src0, v_src1)); + } + + for (; j < size.width; j++) + dst[j] = (u16)src0[j] + (u16)src1[j]; + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void add(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (policy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8); + int16x8_t v_dst0 = vqaddq_s16(v_src00, v_src10); + int16x8_t v_dst1 = vqaddq_s16(v_src01, v_src11); + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vqaddq_s16(v_src0, v_src1); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + dst[j] = internal::saturate_cast((s32)src0[j] + (s32)src1[j]); + } + else + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8); + int16x8_t v_dst0 = vaddq_s16(v_src00, v_src10); + int16x8_t v_dst1 = vaddq_s16(v_src01, v_src11); + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vaddq_s16(v_src0, v_src1); + vst1q_s16(dst + j, 
v_dst); + } + + for (; j < size.width; j++) + dst[j] = (s16)((s32)src0[j] + (s32)src1[j]); + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; 
+ (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/add_weighted.cpp b/3rdparty/carotene/src/add_weighted.cpp new file mode 100644 index 0000000000..1f89fb5372 --- /dev/null +++ b/3rdparty/carotene/src/add_weighted.cpp @@ -0,0 +1,265 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +using namespace internal; + +template struct TypeTraits; +template <> struct TypeTraits< u8> { typedef u16 wide; typedef u8 unsign; typedef uint8x16_t vec128; }; +template <> struct TypeTraits< s8> { typedef s16 wide; typedef u8 unsign; typedef int8x16_t vec128; }; +template <> struct TypeTraits { typedef u32 wide; typedef u8 narrow; typedef u16 unsign; typedef uint16x8_t vec128; }; +template <> struct TypeTraits { typedef s32 wide; typedef s8 narrow; typedef u16 unsign; typedef int16x8_t vec128; }; +template <> struct TypeTraits { typedef u64 wide; typedef u16 narrow; typedef u32 unsign; typedef uint32x4_t vec128; }; +template <> struct TypeTraits { typedef s64 wide; typedef s16 narrow; typedef u32 unsign; typedef int32x4_t vec128; }; +template <> struct TypeTraits { typedef f64 wide; typedef float32x4_t vec128; }; + +template struct wAdd +{ + typedef T type; + + f32 alpha, beta, gamma; + typedef typename TypeTraits::wide wtype; + wAdd wideAdd; + wAdd(f32 _alpha, f32 _beta, f32 _gamma): + alpha(_alpha), beta(_beta), gamma(_gamma), + wideAdd(_alpha, _beta, _gamma) {} + + void operator() (const typename VecTraits::vec128 & v_src0, + const typename VecTraits::vec128 & v_src1, + typename VecTraits::vec128 & v_dst) const + { + typename VecTraits::vec128 vrl, vrh; + wideAdd(vmovl( vget_low(v_src0)), vmovl( vget_low(v_src1)), vrl); + wideAdd(vmovl(vget_high(v_src0)), vmovl(vget_high(v_src1)), vrh); + + v_dst = vcombine(vqmovn(vrl), vqmovn(vrh)); + } + + void operator() (const typename VecTraits::vec64 & v_src0, + const typename VecTraits::vec64 & v_src1, + typename VecTraits::vec64 & v_dst) const + { + typename VecTraits::vec128 vr; + wideAdd(vmovl(v_src0), vmovl(v_src1), vr); + + v_dst = vqmovn(vr); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = saturate_cast(alpha*src0[0] + beta*src1[0] + gamma); + } +}; + +template <> struct wAdd +{ + typedef s32 type; + + f32 alpha, beta, gamma; + float32x4_t valpha, vbeta, vgamma; + wAdd(f32 _alpha, f32 _beta, f32 _gamma): + alpha(_alpha), beta(_beta), gamma(_gamma) + { + valpha = vdupq_n_f32(_alpha); + vbeta = vdupq_n_f32(_beta); + vgamma = vdupq_n_f32(_gamma + 0.5); + } + + void operator() (const typename VecTraits::vec128 & v_src0, + const typename VecTraits::vec128 & v_src1, + typename VecTraits::vec128 & v_dst) const + { + float32x4_t vs1 = vcvtq_f32_s32(v_src0); + float32x4_t vs2 = vcvtq_f32_s32(v_src1); + + vs1 = vmlaq_f32(vgamma, vs1, valpha); + vs1 = vmlaq_f32(vs1, vs2, vbeta); + v_dst = vcvtq_s32_f32(vs1); + } + + void operator() (const typename VecTraits::vec64 & v_src0, + const typename VecTraits::vec64 & v_src1, + typename VecTraits::vec64 & v_dst) const + { + float32x2_t vs1 = vcvt_f32_s32(v_src0); + float32x2_t vs2 = vcvt_f32_s32(v_src1); + + vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha)); + vs1 = vmla_f32(vs1, vs2, vget_low(vbeta)); + v_dst = vcvt_s32_f32(vs1); + } + + void operator() (const s32 * src0, const s32 * src1, s32 * dst) const + { + dst[0] = saturate_cast(alpha*src0[0] + beta*src1[0] + gamma); + } +}; + +template <> struct wAdd +{ + typedef u32 type; + + f32 alpha, beta, gamma; + float32x4_t valpha, vbeta, vgamma; + wAdd(f32 _alpha, f32 _beta, f32 _gamma): + alpha(_alpha), beta(_beta), gamma(_gamma) + { + valpha = vdupq_n_f32(_alpha); + vbeta = vdupq_n_f32(_beta); + vgamma = vdupq_n_f32(_gamma + 0.5); + } + + void operator() (const typename VecTraits::vec128 
& v_src0, + const typename VecTraits::vec128 & v_src1, + typename VecTraits::vec128 & v_dst) const + { + float32x4_t vs1 = vcvtq_f32_u32(v_src0); + float32x4_t vs2 = vcvtq_f32_u32(v_src1); + + vs1 = vmlaq_f32(vgamma, vs1, valpha); + vs1 = vmlaq_f32(vs1, vs2, vbeta); + v_dst = vcvtq_u32_f32(vs1); + } + + void operator() (const typename VecTraits::vec64 & v_src0, + const typename VecTraits::vec64 & v_src1, + typename VecTraits::vec64 & v_dst) const + { + float32x2_t vs1 = vcvt_f32_u32(v_src0); + float32x2_t vs2 = vcvt_f32_u32(v_src1); + + vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha)); + vs1 = vmla_f32(vs1, vs2, vget_low(vbeta)); + v_dst = vcvt_u32_f32(vs1); + } + + void operator() (const u32 * src0, const u32 * src1, u32 * dst) const + { + dst[0] = saturate_cast(alpha*src0[0] + beta*src1[0] + gamma); + } +}; + +template <> struct wAdd +{ + typedef f32 type; + + f32 alpha, beta, gamma; + float32x4_t valpha, vbeta, vgamma; + wAdd(f32 _alpha, f32 _beta, f32 _gamma): + alpha(_alpha), beta(_beta), gamma(_gamma) + { + valpha = vdupq_n_f32(_alpha); + vbeta = vdupq_n_f32(_beta); + vgamma = vdupq_n_f32(_gamma + 0.5); + } + + void operator() (const typename VecTraits::vec128 & v_src0, + const typename VecTraits::vec128 & v_src1, + typename VecTraits::vec128 & v_dst) const + { + float32x4_t vs1 = vmlaq_f32(vgamma, v_src0, valpha); + v_dst = vmlaq_f32(vs1, v_src1, vbeta); + } + + void operator() (const typename VecTraits::vec64 & v_src0, + const typename VecTraits::vec64 & v_src1, + typename VecTraits::vec64 & v_dst) const + { + float32x2_t vs1 = vmla_f32(vget_low(vgamma), v_src0, vget_low(valpha)); + v_dst = vmla_f32(vs1, v_src1, vget_low(vbeta)); + + } + + void operator() (const f32 * src0, const f32 * src1, f32 * dst) const + { + dst[0] = alpha*src0[0] + beta*src1[0] + gamma; + } +}; + +} // namespace + +#define IMPL_ADDWEIGHTED(type) \ +void addWeighted(const Size2D &size, \ + const type * src0Base, ptrdiff_t src0Stride, \ + const type * src1Base, ptrdiff_t src1Stride, \ + type * dstBase, ptrdiff_t dstStride, \ + f32 alpha, f32 beta, f32 gamma) \ +{ \ + internal::assertSupportedConfiguration(); \ + wAdd wgtAdd(alpha, \ + beta, \ + gamma); \ + internal::vtransform(size, \ + src0Base, src0Stride, \ + src1Base, src1Stride, \ + dstBase, dstStride, \ + wgtAdd); \ +} + +#else + +#define IMPL_ADDWEIGHTED(type) \ +void addWeighted(const Size2D &, \ + const type *, ptrdiff_t, \ + const type *, ptrdiff_t, \ + type *, ptrdiff_t, \ + f32, f32, f32) \ +{ \ + internal::assertSupportedConfiguration(); \ +} + +#endif + +IMPL_ADDWEIGHTED(u8) +IMPL_ADDWEIGHTED(s8) +IMPL_ADDWEIGHTED(u16) +IMPL_ADDWEIGHTED(s16) +IMPL_ADDWEIGHTED(u32) +IMPL_ADDWEIGHTED(s32) +IMPL_ADDWEIGHTED(f32) + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/bitwise.cpp b/3rdparty/carotene/src/bitwise.cpp new file mode 100644 index 0000000000..ee00775111 --- /dev/null +++ b/3rdparty/carotene/src/bitwise.cpp @@ -0,0 +1,225 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +struct BitwiseAnd +{ + typedef u8 type; + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + v_dst = vandq_u8(v_src0, v_src1); + } + + void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1, + uint8x8_t & v_dst) const + { + v_dst = vand_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = src0[0] & src1[0]; + } +}; + +struct BitwiseOr +{ + typedef u8 type; + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + v_dst = vorrq_u8(v_src0, v_src1); + } + + void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1, + uint8x8_t & v_dst) const + { + v_dst = vorr_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = src0[0] | src1[0]; + } +}; + +struct BitwiseXor +{ + typedef u8 type; + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + v_dst = veorq_u8(v_src0, v_src1); + } + + void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1, + uint8x8_t & v_dst) const + { + v_dst = veor_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = src0[0] ^ src1[0]; + } +}; + +#endif + +void bitwiseNot(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16); + uint8x16_t v_dst0 = vmvnq_u8(v_src0), v_dst1 = vmvnq_u8(v_src1); + vst1q_u8(dst + j, v_dst0); + vst1q_u8(dst + j + 16, v_dst1); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + uint8x8_t v_dst = vmvn_u8(v_src); + vst1_u8(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + dst[j] = ~src[j]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bitwiseAnd(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, BitwiseAnd()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bitwiseOr(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, BitwiseOr()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bitwiseXor(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, BitwiseXor()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/blur.cpp b/3rdparty/carotene/src/blur.cpp new file mode 100644 index 0000000000..798cce5a71 --- /dev/null +++ b/3rdparty/carotene/src/blur.cpp @@ -0,0 +1,1337 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include + +#include "common.hpp" +#include "saturate_cast.hpp" + +namespace CAROTENE_NS { + +bool isBlur3x3Supported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 8 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE); +} + +void blur3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isBlur3x3Supported(size, border)); +#ifdef CAROTENE_NEON + const int16x8_t v_scale = vmovq_n_s16(3640); + const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3); + const uint16x8_t v_zero = vdupq_n_u16(0); + const uint8x8_t v_border = vdup_n_u8(borderValue); + + uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero; + uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero; + + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + s16 prevx = 0, currx = 0, nextx = 0; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8); + + // perform vertical convolution + for ( ; x <= bwidth; x += 8) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 8 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx = borderValue; + else + prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue); + + currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? 
srow0[x3] : borderValue); + } + + // make shift + if (x) + { + tprev = tcurr; + tcurr = tnext; + } + + // and calculate next value + tnext = vaddw_u8(vaddl_u8(x0, x1), x2); + + // make extrapolation for the first elements + if (!x) + { + // make border + if (border == BORDER_MODE_CONSTANT) + tcurr = v_border_x3; + else if (border == BORDER_MODE_REPLICATE) + tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0)); + + continue; + } + + // combine 3 "shifted" vectors + t0 = vextq_u16(tprev, tcurr, 7); + t1 = tcurr; + t2 = vextq_u16(tcurr, tnext, 1); + + // and add them + t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2)); + + int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), v_scale); + uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0)); + vst1_u8(drow + x - 8, it0); + } + + x -= 8; + if (x == width) + --x; + + for ( ; x < width; ++x) + { + // make extrapolation for the last elements + if (x + 1 >= width) + { + if (border == BORDER_MODE_CONSTANT) + nextx = borderValue * 3; + else if (border == BORDER_MODE_REPLICATE) + nextx = srow2[x] + srow1[x] + srow0[x]; + } + else + nextx = (srow2 ? srow2[x + 1] : borderValue) + + srow1[x + 1] + + (srow0 ? srow0[x + 1] : borderValue); + + f32 val = (prevx + currx + nextx) * (1 / 9.f) + 0.5f; + drow[x] = internal::saturate_cast((s32)val); + + // make shift + prevx = currx; + currx = nextx; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +bool isBlurU8Supported(const Size2D &size, s32 cn, BORDER_MODE border) +{ + return isSupportedConfiguration() && + cn > 0 && cn <= 4 && + size.width*cn >= 8 && size.height >= 2 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REPLICATE); +} + +void blur3x3(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue) +{ + internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON +//#define FLOAT_VARIANT_1_9 +#ifdef FLOAT_VARIANT_1_9 + float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0); + float32x4_t v0_5 = vdupq_n_f32 (.5); +#else + const int16x8_t vScale = vmovq_n_s16(3640); +#endif + + size_t colsn = size.width*cn; + + std::vector _tmp; + u8 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 2*cn, borderValue); + tmp = &_tmp[cn]; + } + + uint16x8_t tprev = vdupq_n_u16(0x0); + uint16x8_t tcurr = tprev; + uint16x8_t tnext = tprev; + uint16x8_t t0, t1, t2; + if(cn == 1) + { + for( size_t y = 0; y < size.height; y++ ) + { + const u8* srow0; + const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8* srow2; + u8* drow = internal::getRowPtr(dstBase, dstStride, y); + if (borderType == BORDER_MODE_REFLECT101) { + srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1); + srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2); + } else if (borderType == BORDER_MODE_CONSTANT) { + srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE + srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1); + } + + // do vertical convolution + size_t x = 0; + const size_t bcols = y + 2 < size.height ? 
colsn : (colsn - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = vld1_u8(srow2 + x); + + tprev = tcurr; + tcurr = tnext; + tnext = vaddw_u8(vaddl_u8(x0, x1), x2); + + if(!x) { + tcurr = tnext; + + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + else // borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REPLICATE + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7); + } + continue; + } + + t0 = vextq_u16(tprev, tcurr, 7); + t1 = tcurr; + t2 = vextq_u16(tcurr, tnext, 1); + + t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2)); + +#ifdef FLOAT_VARIANT_1_9 + uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0)); + uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0)); + float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1)); + float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2)); + tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5)); + tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5)); + t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2)); + vst1_u8(drow + x - 8, vmovn_u16(t0)); +#else + int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale); + uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0)); + vst1_u8(drow + x - 8, it0); +#endif + } + + x -= 8; + if(x == colsn){ + x--; + } + s16 prevx, rowx, nextx; + prevx = srow2[x-1] + srow1[x-1] + srow0[x-1]; + rowx = srow2[x] + srow1[x] + srow0[x]; + for( ; x < colsn; x++ ) + { + if(x+1 >= colsn) { + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + nextx = borderValue; + } else if (borderType == BORDER_MODE_REFLECT101) + { + nextx = srow2[x-1] + srow1[x-1] + srow0[x-1]; + } else { + nextx = srow2[x] + srow1[x] + srow0[x]; + } + } else { + nextx = srow2[x+1] + srow1[x+1] + srow0[x+1]; + } + *(drow+x) = internal::saturate_cast((prevx + rowx + nextx)*(1/9.)); + prevx = rowx; + rowx = nextx; + } + } + } + else + { + for( size_t y = 0; y < size.height; y++ ) + { + const u8* srow0; + const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8* srow2; + u8* drow = internal::getRowPtr(dstBase, dstStride, y); + if (borderType == BORDER_MODE_REFLECT101) { + srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1); + srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2); + } else if (borderType == BORDER_MODE_CONSTANT) { + srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE + srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1); + } + + // do vertical convolution + size_t x = 0; + const size_t bcols = y + 2 < size.height ? 
colsn : (colsn - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = vld1_u8(srow2 + x); + + tprev = tcurr; + tcurr = tnext; + tnext = vaddw_u8(vaddl_u8(x0, x1), x2); + + if(!x) { + tcurr = tnext; + + // make border + switch(cn) + { + case 2: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 6); + } + else + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + break; + case 3: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tcurr, 7); + } + else + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 7); + } + break; + case 4: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 4); + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType != BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7); + } + break; + } + continue; + } + + if(cn==2) + t0 = vextq_u16(tprev, tcurr, 6); + else if(cn==3) + t0 = vextq_u16(tprev, tcurr, 5); + else if(cn==4) + t0 = vextq_u16(tprev, tcurr, 4); + + t1 = tcurr; + + if(cn==2) + t2 = vextq_u16(tcurr, tnext, 2); + else if(cn==3) + t2 = vextq_u16(tcurr, tnext, 3); + else if(cn==4) + t2 = vextq_u16(tcurr, tnext, 4); + + t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2)); + +#ifdef FLOAT_VARIANT_1_9 + uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0)); + uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0)); + float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1)); + float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2)); + tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5)); + tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5)); + t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2)); + vst1_u8(drow + x - 8, vmovn_u16(t0)); +#else + int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale); + uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0)); + vst1_u8(drow + x - 8, it0); +#endif + } + + x -= 8; + if(x == colsn){ + x -= cn; + } + s16 prevx[4], rowx[4], nextx[4]; + for( s32 k = 0; k < cn; k++ ) + { + prevx[(k + x%cn)%cn] = srow2[x+k-cn] + srow1[x+k-cn] + srow0[x+k-cn]; + rowx[(k + x%cn)%cn] = srow2[x+k] + srow1[x+k] + srow0[x+k]; + } + for( ; x < colsn; x++ ) + { + size_t xx = x%cn; + if(x+cn >= colsn) { + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + nextx[xx] = borderValue; + } else if (borderType == 
BORDER_MODE_REFLECT101) + { + nextx[xx] = srow2[x-cn] + srow1[x-cn] + srow0[x-cn]; + } else { + nextx[xx] = srow2[x] + srow1[x] + srow0[x]; + } + } else { + nextx[xx] = srow2[x+cn] + srow1[x+cn] + srow0[x+cn]; + } + *(drow+x) = internal::saturate_cast((prevx[xx] + rowx[xx] + nextx[xx])*(1/9.)); + prevx[xx] = rowx[xx]; + rowx[xx] = nextx[xx]; + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +void blur5x5(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue) +{ + internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON +#define FLOAT_VARIANT_1_25 +#ifdef FLOAT_VARIANT_1_25 + float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f); + float32x4_t v0_5 = vdupq_n_f32 (.5f); +#else + const int16x8_t vScale = vmovq_n_s16(1310); +#endif + size_t colsn = size.width*cn; + + std::vector _tmp; + u8 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 2*cn, borderValue); + tmp = &_tmp[cn]; + } + + uint16x8_t tprev = vdupq_n_u16(0x0); + uint16x8_t tcurr = tprev; + uint16x8_t tnext = tprev; + uint16x8_t t0, t1, t2, t3, t4; + for( size_t y = 0; y < size.height; y++ ) + { + const u8 *srow0, *srow1; + const u8 *srow2 = internal::getRowPtr(srcBase, srcStride, y); + const u8 *srow3, *srow4; + u8 *drow = internal::getRowPtr(dstBase, dstStride, y); + if (borderType == BORDER_MODE_REFLECT101) { + srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 2-y); + srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1); + srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2); + srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-4-y); + } else if (borderType == BORDER_MODE_CONSTANT) { + srow0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp; + srow1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + srow3 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + srow4 = y < size.height-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp; + } else if (borderType == BORDER_MODE_REFLECT) { + srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 1-y); + srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1); + srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-3-y); + } else { // BORDER_MODE_REPLICATE + srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0); + srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1); + srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : size.height-1); + } + + // do vertical convolution + size_t x = 0; + const size_t bcols = y + 3 < size.height ? 
colsn : (colsn - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + internal::prefetch(srow3 + x); + internal::prefetch(srow4 + x); + + uint8x8_t x0 = vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = vld1_u8(srow2 + x); + uint8x8_t x3 = vld1_u8(srow3 + x); + uint8x8_t x4 = vld1_u8(srow4 + x); + + tprev = tcurr; + tcurr = tnext; + tnext = vaddw_u8(vaddq_u16(vaddl_u8(x0, x1), vaddl_u8(x2, x3)), x4); + + if(!x) { + tcurr = tnext; + + if(borderType == BORDER_MODE_REFLECT101 && size.width < 3) + { + x = 8; + break; + } + + // make border + switch(cn) + { + case 1: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7); + } + else + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7); + } + break; + case 2: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 4); + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 4); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + else + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + break; + case 3: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 2); + tcurr = vsetq_lane_u16(borderValue, tcurr, 3); + tcurr = vsetq_lane_u16(borderValue, tcurr, 4); + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 6),tcurr, 2); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 7),tprev, 3); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tprev, 5); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 6); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 7); + s16 lane8 = srow4[8] + srow3[8] + srow2[8] + srow1[8] + srow0[8]; + tcurr = vsetq_lane_u16(lane8,tprev, 4); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 2); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 3); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 4); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6); + tcurr = 
vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7); + } + else + { + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 2); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 3); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 4); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7); + } + break; + case 4: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 0); + tcurr = vsetq_lane_u16(borderValue, tcurr, 1); + tcurr = vsetq_lane_u16(borderValue, tcurr, 2); + tcurr = vsetq_lane_u16(borderValue, tcurr, 3); + tcurr = vsetq_lane_u16(borderValue, tcurr, 4); + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + s16 lane8 = srow4[ 8] + srow3[ 8] + srow2[ 8] + srow1[ 8] + srow0[ 8]; + s16 lane9 = srow4[ 9] + srow3[ 9] + srow2[ 9] + srow1[ 9] + srow0[ 9]; + s16 lane10 = srow4[10] + srow3[10] + srow2[10] + srow1[10] + srow0[10]; + s16 lane11 = srow4[11] + srow3[11] + srow2[11] + srow1[11] + srow0[11]; + tprev = vsetq_lane_u16( lane8,tcurr, 0); + tprev = vsetq_lane_u16( lane9,tprev, 1); + tprev = vsetq_lane_u16(lane10,tprev, 2); + tcurr = vsetq_lane_u16(lane11,tprev, 3); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tcurr = vcombine_u16(vget_high_u16(tcurr),vget_low_u16(tcurr));//swap 64-bit parts + } + else + { + tcurr = vcombine_u16(vget_low_u16(tcurr),vget_low_u16(tcurr));//double 64-bit part + } + break; + } + continue; + } + switch(cn) + { + case 1: + t0 = vextq_u16(tprev, tcurr, 6); + t1 = vextq_u16(tprev, tcurr, 7); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 1); + t4 = vextq_u16(tcurr, tnext, 2); + break; + case 2: + t0 = vextq_u16(tprev, tcurr, 4); + t1 = vextq_u16(tprev, tcurr, 6); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 2); + t4 = vextq_u16(tcurr, tnext, 4); + break; + case 3: + t0 = vextq_u16(tprev, tcurr, 2); + t1 = vextq_u16(tprev, tcurr, 5); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 3); + t4 = vextq_u16(tcurr, tnext, 6); + break; + case 4: + t0 = tprev; + t1 = vextq_u16(tprev, tcurr, 4); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 4); + t4 = tnext; + break; + default: + internal::assertSupportedConfiguration(false);//Unsupported channels number + return; + } + t0 = vqaddq_u16(vqaddq_u16(vqaddq_u16(t0, t1), vqaddq_u16(t2, t3)), t4); + +#ifdef FLOAT_VARIANT_1_25 + uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0)); + uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0)); + float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1)); + float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2)); + tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5)); + tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5)); + t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2)); + vst1_u8(drow + x - 8, vmovn_u16(t0)); +#else + int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale); + uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0)); + vst1_u8(drow + x - 8, it0); +#endif + } + + x -= 8; + if(x == colsn){ + x -= cn; + } + s16 pprevx[4], prevx[4], rowx[4], nextx[4], nnextx[4]; + ptrdiff_t px = x / cn; + for( s32 k = 0; k < cn; k++ ) + { + ptrdiff_t ploc; + ploc = internal::borderInterpolate(px-2, size.width, borderType); + pprevx[k] = ploc < 0 ? 
5*borderValue : + srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k]; + + ploc = internal::borderInterpolate(px-1, size.width, borderType); + prevx[k] = ploc < 0 ? 5*borderValue : + srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k]; + + rowx[k] = srow4[px*cn+k] + srow3[px*cn+k] + srow2[px*cn+k] + srow1[px*cn+k] + srow0[px*cn+k]; + + ploc = internal::borderInterpolate(px+1, size.width, borderType); + nextx[k] = ploc < 0 ? 5*borderValue : + srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k]; + } + x = px*cn; + for( ; x < colsn; x+=cn, px++ ) + { + for( s32 k = 0; k < cn; k++ ) + { + ptrdiff_t ploc = internal::borderInterpolate(px+2, size.width, borderType); + nnextx[k] = ploc < 0 ? 5*borderValue : + srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k]; + *(drow+x+k) = internal::saturate_cast<u8>((pprevx[k] + prevx[k] + rowx[k] + nextx[k] +nnextx[k])*(1/25.)); + pprevx[k] = prevx[k]; + prevx[k] = rowx[k]; + rowx[k] = nextx[k]; + nextx[k] = nnextx[k]; + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +bool isBlurF32Supported(const Size2D &size, s32 cn, BORDER_MODE border) +{ + return isSupportedConfiguration() && + cn > 0 && cn <= 4 && + size.width*cn >= 4 && size.height >= 2 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REPLICATE || + border == BORDER_MODE_WRAP); +} + +void blur3x3(const Size2D &size, s32 cn, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, f32 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isBlurF32Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector<f32> _tmp; + f32 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 2*cn, borderValue); + tmp = &_tmp[cn]; + } + + ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //2-line buffer + std::vector<f32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(f32))); + f32* lanea = internal::alignPtr(&_buf[cn], 32); + f32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32); + + f32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32); + f32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = borderValue; + lanea[colsn+k] = borderValue; + laneA[-cn+k] = borderValue; + laneA[colsn+k] = borderValue; + laneb[-cn+k] = borderValue; + laneb[colsn+k] = borderValue; + laneB[-cn+k] = borderValue; + laneB[colsn+k] = borderValue; + } + + size_t i = 0; + f32* dsta = internal::getRowPtr(dstBase, dstStride, 0); + for (; i < size.height-1; i+=2) + { + //vertical convolution + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ?
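+ // a row index below -borderMargin.top no longer addresses a real row of the parent + // image; that only happens with BORDER_MODE_CONSTANT, where the read is redirected + // to tmp, a spare row prefilled with borderValue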
internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i); + const f32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1); + const f32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); + internal::prefetch(ln0 + x); + internal::prefetch(ln3 + x); +box3x3f32_vert: + float32x4_t v1 = vld1q_f32(ln1 + x); + float32x4_t v2 = vld1q_f32(ln2 + x); + float32x4_t v0 = vld1q_f32(ln0 + x); + float32x4_t v3 = vld1q_f32(ln3 + x); + + float32x4_t v = vaddq_f32(v1, v2); + float32x4_t w0 = vaddq_f32(v, v0); + float32x4_t w1 = vaddq_f32(v, v3); + + vst1q_f32(lanea + x, w0); + vst1q_f32(laneb + x, w1); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_vert; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + laneb[-cn+k] = laneb[idx_l + k]; + laneb[colsn+k] = laneb[idx_r + k]; + } + + //horizontal convolution (2 lines from previous iteration) + if (i > 0) + { + f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3f32_horiz: + float32x4_t lane0a = vld1q_f32(laneA + x - cn); + float32x4_t lane2a = vld1q_f32(laneA + x + cn); + float32x4_t lane1a = vld1q_f32(laneA + x); + + float32x4_t lane0b = vld1q_f32(laneB + x - cn); + float32x4_t lane2b = vld1q_f32(laneB + x + cn); + float32x4_t lane1b = vld1q_f32(laneB + x); + + float32x4_t va = vaddq_f32(lane0a, lane2a); + float32x4_t vb = vaddq_f32(lane0b, lane2b); + float32x4_t wa = vaddq_f32(va, lane1a); + float32x4_t wb = vaddq_f32(vb, lane1b); + + vst1q_f32(dsta + x, wa); + vst1q_f32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_horiz; + } + dsta = internal::getRowPtr(dstBase, dstStride, i); + } + + std::swap(lanea, laneA); + std::swap(laneb, laneB); + } + + //last line + if(i < size.height) + { + //vertical convolution + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i); + const f32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? 
internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln0 + x); + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); +box3x3f32_vert_ll: + float32x4_t v0 = vld1q_f32(ln0+x); + float32x4_t v1 = vld1q_f32(ln1+x); + float32x4_t v2 = vld1q_f32(ln2+x); + + float32x4_t v = vaddq_f32(v0, v1); + float32x4_t w = vaddq_f32(v, v2); + + vst1q_f32(lanea + x, w); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_vert_ll; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + } + + //horizontal convolution (last 3 lines) + x = 0; + f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + f32* dstc = internal::getRowPtr(dstBase, dstStride, i); + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); + internal::prefetch(lanea + x + cn); +box3x3f32_horiz_ll: + float32x4_t lane0a = vld1q_f32(laneA + x - cn); + float32x4_t lane2a = vld1q_f32(laneA + x + cn); + float32x4_t lane1a = vld1q_f32(laneA + x); + + float32x4_t lane0b = vld1q_f32(laneB + x - cn); + float32x4_t lane2b = vld1q_f32(laneB + x + cn); + float32x4_t lane1b = vld1q_f32(laneB + x); + + float32x4_t lane0c = vld1q_f32(lanea + x - cn); + float32x4_t lane2c = vld1q_f32(lanea + x + cn); + float32x4_t lane1c = vld1q_f32(lanea + x); + + float32x4_t va = vaddq_f32(lane0a, lane2a); + float32x4_t vb = vaddq_f32(lane0b, lane2b); + float32x4_t vc = vaddq_f32(lane0c, lane2c); + float32x4_t wa = vaddq_f32(va, lane1a); + float32x4_t wb = vaddq_f32(vb, lane1b); + float32x4_t wc = vaddq_f32(vc, lane1c); + + vst1q_f32(dsta + x, wa); + vst1q_f32(dstb + x, wb); + vst1q_f32(dstc + x, wc); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_horiz_ll; + } + } + else + { + //horizontal convolution (last 2 lines) + f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3f32_horiz_last2: + float32x4_t lane0a = vld1q_f32(laneA + x - cn); + float32x4_t lane2a = vld1q_f32(laneA + x + cn); + float32x4_t lane1a = vld1q_f32(laneA + x); + + float32x4_t lane0b = vld1q_f32(laneB + x - cn); + float32x4_t lane2b = vld1q_f32(laneB + x + cn); + float32x4_t lane1b = vld1q_f32(laneB + x); + + float32x4_t va = vaddq_f32(lane0a, lane2a); + float32x4_t vb = vaddq_f32(lane0b, lane2b); + float32x4_t wa = vaddq_f32(va, lane1a); + float32x4_t wb = vaddq_f32(vb, lane1b); + + vst1q_f32(dsta + x, wa); + vst1q_f32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_horiz_last2; + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +bool isBlurS32Supported(const Size2D &size, s32 cn, BORDER_MODE border) +{ + return isSupportedConfiguration() && + cn > 0 && cn <= 4 && + size.width*cn >= 4 && size.height >= 2 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REPLICATE || + border == BORDER_MODE_WRAP); +} + +void blur3x3(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isBlurS32Supported(size, cn, borderType)); +#ifdef 
CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector<s32> _tmp; + s32 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 2*cn, borderValue); + tmp = &_tmp[cn]; + } + + ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //2-line buffer + std::vector<s32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(s32))); + s32* lanea = internal::alignPtr(&_buf[cn], 32); + s32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32); + + s32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32); + s32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = borderValue; + lanea[colsn+k] = borderValue; + laneA[-cn+k] = borderValue; + laneA[colsn+k] = borderValue; + laneb[-cn+k] = borderValue; + laneb[colsn+k] = borderValue; + laneB[-cn+k] = borderValue; + laneB[colsn+k] = borderValue; + } + + size_t i = 0; + s32* dsta = internal::getRowPtr(dstBase, dstStride, 0); + for (; i < size.height-1; i+=2) + { + //vertical convolution + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i); + const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1); + const s32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ?
internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); + internal::prefetch(ln0 + x); + internal::prefetch(ln3 + x); +box3x3s32_vert: + int32x4_t v1 = vld1q_s32(ln1 + x); + int32x4_t v2 = vld1q_s32(ln2 + x); + int32x4_t v0 = vld1q_s32(ln0 + x); + int32x4_t v3 = vld1q_s32(ln3 + x); + + int32x4_t v = vaddq_s32(v1, v2); + int32x4_t w0 = vaddq_s32(v, v0); + int32x4_t w1 = vaddq_s32(v, v3); + + vst1q_s32(lanea + x, w0); + vst1q_s32(laneb + x, w1); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_vert; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + laneb[-cn+k] = laneb[idx_l + k]; + laneb[colsn+k] = laneb[idx_r + k]; + } + + //horizontal convolution (2 lines from previous iteration) + if (i > 0) + { + s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3s32_horiz: + int32x4_t lane0a = vld1q_s32(laneA + x - cn); + int32x4_t lane2a = vld1q_s32(laneA + x + cn); + int32x4_t lane1a = vld1q_s32(laneA + x); + + int32x4_t lane0b = vld1q_s32(laneB + x - cn); + int32x4_t lane2b = vld1q_s32(laneB + x + cn); + int32x4_t lane1b = vld1q_s32(laneB + x); + + int32x4_t va = vaddq_s32(lane0a, lane2a); + int32x4_t vb = vaddq_s32(lane0b, lane2b); + int32x4_t wa = vaddq_s32(va, lane1a); + int32x4_t wb = vaddq_s32(vb, lane1b); + + vst1q_s32(dsta + x, wa); + vst1q_s32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_horiz; + } + dsta = internal::getRowPtr(dstBase, dstStride, i); + } + + std::swap(lanea, laneA); + std::swap(laneb, laneB); + } + //last line + if(i < size.height) + { + //vertical convolution + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i); + const s32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? 
internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln0 + x); + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); +box3x3s32_vert_ll: + int32x4_t v0 = vld1q_s32(ln0+x); + int32x4_t v1 = vld1q_s32(ln1+x); + int32x4_t v2 = vld1q_s32(ln2+x); + + int32x4_t v = vaddq_s32(v0, v1); + int32x4_t w = vaddq_s32(v, v2); + + vst1q_s32(lanea + x, w); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_vert_ll; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + } + + //horizontal convolution (last 3 lines) + x = 0; + s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + s32* dstc = internal::getRowPtr(dstBase, dstStride, i); + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); + internal::prefetch(lanea + x + cn); +box3x3s32_horiz_ll: + int32x4_t lane0a = vld1q_s32(laneA + x - cn); + int32x4_t lane2a = vld1q_s32(laneA + x + cn); + int32x4_t lane1a = vld1q_s32(laneA + x); + + int32x4_t lane0b = vld1q_s32(laneB + x - cn); + int32x4_t lane2b = vld1q_s32(laneB + x + cn); + int32x4_t lane1b = vld1q_s32(laneB + x); + + int32x4_t lane0c = vld1q_s32(lanea + x - cn); + int32x4_t lane2c = vld1q_s32(lanea + x + cn); + int32x4_t lane1c = vld1q_s32(lanea + x); + + int32x4_t va = vaddq_s32(lane0a, lane2a); + int32x4_t vb = vaddq_s32(lane0b, lane2b); + int32x4_t vc = vaddq_s32(lane0c, lane2c); + int32x4_t wa = vaddq_s32(va, lane1a); + int32x4_t wb = vaddq_s32(vb, lane1b); + int32x4_t wc = vaddq_s32(vc, lane1c); + + vst1q_s32(dsta + x, wa); + vst1q_s32(dstb + x, wb); + vst1q_s32(dstc + x, wc); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_horiz_ll; + } + } + else + { + //horizontal convolution (last 2 lines) + s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3s32_horiz_last2: + int32x4_t lane0a = vld1q_s32(laneA + x - cn); + int32x4_t lane2a = vld1q_s32(laneA + x + cn); + int32x4_t lane1a = vld1q_s32(laneA + x); + + int32x4_t lane0b = vld1q_s32(laneB + x - cn); + int32x4_t lane2b = vld1q_s32(laneB + x + cn); + int32x4_t lane1b = vld1q_s32(laneB + x); + + int32x4_t va = vaddq_s32(lane0a, lane2a); + int32x4_t vb = vaddq_s32(lane0b, lane2b); + int32x4_t wa = vaddq_s32(va, lane1a); + int32x4_t wb = vaddq_s32(vb, lane1b); + + vst1q_s32(dsta + x, wa); + vst1q_s32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_horiz_last2; + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +} //namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/canny.cpp b/3rdparty/carotene/src/canny.cpp new file mode 100644 index 0000000000..f61bc23e9b --- /dev/null +++ b/3rdparty/carotene/src/canny.cpp @@ -0,0 +1,773 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include "saturate_cast.hpp" +#include <cstring> +#include <cstdlib> + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON +namespace { +struct RowFilter3x3Canny +{ + inline RowFilter3x3Canny(const ptrdiff_t borderxl, const ptrdiff_t borderxr) + { + vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL)); + vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ?
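+ // vtbl1_u8 shuffle masks: a 0xFF index selects no source byte and yields zero, so + // vfmask builds the left-edge pixel pair in the top two lanes (replicating src[0] or + // passing src[-1], src[0]) and vtmask below reorders the tail load depending on + // whether a real pixel exists past the right border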
0x0707060504030201ULL : 0x0706050403020100ULL)); + lookLeft = offsetk - borderxl; + lookRight = offsetk - borderxr; + } + + inline void operator()(const u8* src, s16* dstx, s16* dsty, ptrdiff_t width) + { + uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); + ptrdiff_t i = 0; + for (; i < width - 8 + lookRight; i += 8) + { + internal::prefetch(src + i); + uint8x8_t l18u = vld1_u8(src + i + 1); + + uint8x8_t l2 = l18u; + uint8x8_t l0 = vext_u8(l, l18u, 6); + int16x8_t l1x2 = vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l18u, 7), 1)); + + l = l18u; + + int16x8_t l02 = vreinterpretq_s16_u16(vaddl_u8(l2, l0)); + int16x8_t ldx = vreinterpretq_s16_u16(vsubl_u8(l2, l0)); + int16x8_t ldy = vaddq_s16(l02, l1x2); + + vst1q_s16(dstx + i, ldx); + vst1q_s16(dsty + i, ldy); + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); + + int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail2, tail0)); + int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1)); + int16x8_t taildx = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0)); + int16x8_t taildy = vqaddq_s16(tail02, tail1x2); + + vst1q_s16(dstx + (width - 8), taildx); + vst1q_s16(dsty + (width - 8), taildy); + } + } + + uint8x8_t vfmask; + uint8x8_t vtmask; + enum { offsetk = 1}; + ptrdiff_t lookLeft; + ptrdiff_t lookRight; +}; + +template <bool L2gradient> +inline void ColFilter3x3Canny(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width) +{ + ptrdiff_t j = 0; + for (; j <= width - 8; j += 8) + { + ColFilter3x3CannyL1Loop: + int16x8_t line0x = vld1q_s16(src0 + j); + int16x8_t line1x = vld1q_s16(src1 + j); + int16x8_t line2x = vld1q_s16(src2 + j); + int16x8_t line0y = vld1q_s16(src0 + j + width); + int16x8_t line2y = vld1q_s16(src2 + j + width); + + int16x8_t l02 = vaddq_s16(line0x, line2x); + int16x8_t l1x2 = vshlq_n_s16(line1x, 1); + int16x8_t dy = vsubq_s16(line2y, line0y); + int16x8_t dx = vaddq_s16(l1x2, l02); + + int16x8_t dya = vabsq_s16(dy); + int16x8_t dxa = vabsq_s16(dx); + int16x8_t norm = vaddq_s16(dya, dxa); + + int32x4_t normh = vmovl_s16(vget_high_s16(norm)); + int32x4_t norml = vmovl_s16(vget_low_s16(norm)); + + vst1q_s16(dsty + j, dy); + vst1q_s16(dstx + j, dx); + vst1q_s32(mag + j + 4, normh); + vst1q_s32(mag + j, norml); + } + if (j != width) + { + j = width - 8; + goto ColFilter3x3CannyL1Loop; + } +} +template <> +inline void ColFilter3x3Canny<true>(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width) +{ + ptrdiff_t j = 0; + for (; j <= width - 8; j += 8) + { + ColFilter3x3CannyL2Loop: + int16x8_t line0x = vld1q_s16(src0 + j); + int16x8_t line1x = vld1q_s16(src1 + j); + int16x8_t line2x = vld1q_s16(src2 + j); + int16x8_t line0y = vld1q_s16(src0 + j + width); + int16x8_t line2y = vld1q_s16(src2 + j + width); + + int16x8_t l02 = vaddq_s16(line0x, line2x); + int16x8_t l1x2 = vshlq_n_s16(line1x, 1); + int16x8_t dy = vsubq_s16(line2y, line0y); + int16x8_t dx = vaddq_s16(l1x2, l02); + + int32x4_t norml = vmull_s16(vget_low_s16(dx), vget_low_s16(dx)); + int32x4_t normh = vmull_s16(vget_high_s16(dy), vget_high_s16(dy)); + + norml = vmlal_s16(norml, vget_low_s16(dy), vget_low_s16(dy)); + normh = vmlal_s16(normh, vget_high_s16(dx), vget_high_s16(dx)); + + vst1q_s16(dsty + j, dy); +
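+ // L2 variant: magnitude = dx*dx + dy*dy, widened to 32 bit with vmull_s16/vmlal_s16 + // and computed separately on the low and high 64-bit halves of each vector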
vst1q_s16(dstx + j, dx); + vst1q_s32(mag + j, norml); + vst1q_s32(mag + j + 4, normh); + } + if (j != width) + { + j = width - 8; + goto ColFilter3x3CannyL2Loop; + } +} + +template <bool L2gradient> +inline void NormCanny(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm) +{ + ptrdiff_t j = 0; + if (colscn >= 8) + { + int16x8_t vx = vld1q_s16(_dx); + int16x8_t vy = vld1q_s16(_dy); + for (; j <= colscn - 16; j+=8) + { + internal::prefetch(_dx); + internal::prefetch(_dy); + + int16x8_t vx2 = vld1q_s16(_dx + j + 8); + int16x8_t vy2 = vld1q_s16(_dy + j + 8); + + int16x8_t vabsx = vabsq_s16(vx); + int16x8_t vabsy = vabsq_s16(vy); + + int16x8_t norm = vaddq_s16(vabsx, vabsy); + + int32x4_t normh = vmovl_s16(vget_high_s16(norm)); + int32x4_t norml = vmovl_s16(vget_low_s16(norm)); + + vst1q_s32(_norm + j + 4, normh); + vst1q_s32(_norm + j + 0, norml); + + vx = vx2; + vy = vy2; + } + int16x8_t vabsx = vabsq_s16(vx); + int16x8_t vabsy = vabsq_s16(vy); + + int16x8_t norm = vaddq_s16(vabsx, vabsy); + + int32x4_t normh = vmovl_s16(vget_high_s16(norm)); + int32x4_t norml = vmovl_s16(vget_low_s16(norm)); + + vst1q_s32(_norm + j + 4, normh); + vst1q_s32(_norm + j + 0, norml); + } + for (; j < colscn; j++) + _norm[j] = std::abs(s32(_dx[j])) + std::abs(s32(_dy[j])); +} + +template <> +inline void NormCanny<true>(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm) +{ + ptrdiff_t j = 0; + if (colscn >= 8) + { + int16x8_t vx = vld1q_s16(_dx); + int16x8_t vy = vld1q_s16(_dy); + + for (; j <= colscn - 16; j+=8) + { + internal::prefetch(_dx); + internal::prefetch(_dy); + + int16x8_t vxnext = vld1q_s16(_dx + j + 8); + int16x8_t vynext = vld1q_s16(_dy + j + 8); + + int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx)); + int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy)); + + norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy)); + normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx)); + + vst1q_s32(_norm + j + 0, norml); + vst1q_s32(_norm + j + 4, normh); + + vx = vxnext; + vy = vynext; + } + int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx)); + int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy)); + + norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy)); + normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx)); + + vst1q_s32(_norm + j + 0, norml); + vst1q_s32(_norm + j + 4, normh); + } + for (; j < colscn; j++) + _norm[j] = s32(_dx[j])*_dx[j] + s32(_dy[j])*_dy[j]; +} + +template <bool L2gradient> +inline void prepareThresh(f64 low_thresh, f64 high_thresh, + s32 &low, s32 &high) +{ + if (low_thresh > high_thresh) + std::swap(low_thresh, high_thresh); +#if defined __GNUC__ + low = (s32)low_thresh; + high = (s32)high_thresh; + low -= (low > low_thresh); + high -= (high > high_thresh); +#else + low = internal::round(low_thresh); + high = internal::round(high_thresh); + f32 ldiff = (f32)(low_thresh - low); + f32 hdiff = (f32)(high_thresh - high); + low -= (ldiff < 0); + high -= (hdiff < 0); +#endif +} +template <> +inline void prepareThresh<true>(f64 low_thresh, f64 high_thresh, + s32 &low, s32 &high) +{ + if (low_thresh > high_thresh) + std::swap(low_thresh, high_thresh); + if (low_thresh > 0) low_thresh *= low_thresh; + if (high_thresh > 0) high_thresh *= high_thresh; +#if defined __GNUC__ + low = (s32)low_thresh; + high = (s32)high_thresh; + low -= (low > low_thresh); + high -= (high > high_thresh); +#else + low = internal::round(low_thresh); + high = internal::round(high_thresh); + f32 ldiff = (f32)(low_thresh - low); + f32 hdiff = (f32)(high_thresh - high); + low -=
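+ // both branches amount to floor(threshold): take the truncated/rounded integer and + // subtract 1 whenever it ended up above the exact value; the squaring above lets the + // L2 path compare squared magnitudes against squared thresholds with no sqrt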
(ldiff < 0); + high -= (hdiff < 0); +#endif +} + +template <bool L2gradient, bool channels1> +struct _normEstimator +{ + ptrdiff_t magstep; + ptrdiff_t dxOffset; + ptrdiff_t dyOffset; + ptrdiff_t shxOffset; + ptrdiff_t shyOffset; + std::vector<u8> buffer; + const ptrdiff_t offsetk; + ptrdiff_t borderyt, borderyb; + RowFilter3x3Canny sobelRow; + + inline _normEstimator(const Size2D &size, s32, Margin borderMargin, + ptrdiff_t &mapstep, s32** mag_buf, u8* &map): + offsetk(1), + sobelRow(std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left), + std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right)) + { + mapstep = size.width + 2; + magstep = size.width + 2 + size.width * (4 * sizeof(s16)/sizeof(s32)); + dxOffset = mapstep * sizeof(s32)/sizeof(s16); + dyOffset = dxOffset + size.width * 1; + shxOffset = dxOffset + size.width * 2; + shyOffset = dxOffset + size.width * 3; + buffer.resize( (size.width+2)*(size.height+2) + magstep*3*sizeof(s32) ); + mag_buf[0] = (s32*)&buffer[0]; + mag_buf[1] = mag_buf[0] + magstep; + mag_buf[2] = mag_buf[1] + magstep; + memset(mag_buf[0], 0, mapstep * sizeof(s32)); + + map = (u8*)(mag_buf[2] + magstep); + memset(map, 1, mapstep); + memset(map + mapstep*(size.height + 1), 1, mapstep); + borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top); + borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom); + } + inline void firstRow(const Size2D &size, s32, + const u8 *srcBase, ptrdiff_t srcStride, + s16*, ptrdiff_t, + s16*, ptrdiff_t, + s32** mag_buf) + { + //sobelH row #0 + const u8* _src = internal::getRowPtr(srcBase, srcStride, 0); + sobelRow(_src, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shyOffset, size.width); + //sobelH row #1 + _src = internal::getRowPtr(srcBase, srcStride, 1); + sobelRow(_src, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shyOffset, size.width); + + mag_buf[1][0] = mag_buf[1][size.width+1] = 0; + if (borderyt == 0) + { + //sobelH row #-1 + _src = internal::getRowPtr(srcBase, srcStride, -1); + sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width); + + ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, + ((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width); + } + else + { + ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, + ((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width); + } + } + inline void nextRow(const Size2D &size, s32, + const u8 *srcBase, ptrdiff_t srcStride, + s16*, ptrdiff_t, + s16*, ptrdiff_t, + const ptrdiff_t &mapstep, s32** mag_buf, + size_t i, const s16* &_x, const s16* &_y) + { + mag_buf[2][0] = mag_buf[2][size.width+1] = 0; + if (i < size.height - borderyb) + { + const u8* _src = internal::getRowPtr(srcBase, srcStride, i+1); + //sobelH row #i+1 + sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width); + + ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[2]) + shxOffset, + ((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width); + } + else if (i < size.height) + { + ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, + ((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width); + } + else + memset(mag_buf[2], 0, mapstep*sizeof(s32)); + _x = ((s16*)mag_buf[1]) + dxOffset; + _y =
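+ // mag_buf[] acts as a 3-row ring buffer (previous / current / next row of magnitudes); + // _x and _y expose the Sobel derivatives of the middle row, the row about to be + // scanned for local maxima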
((s16*)mag_buf[1]) + dyOffset; + } +}; +template <bool L2gradient> +struct _normEstimator<L2gradient, false> +{ + std::vector<u8> buffer; + + inline _normEstimator(const Size2D &size, s32 cn, Margin, + ptrdiff_t &mapstep, s32** mag_buf, u8* &map) + { + mapstep = size.width + 2; + buffer.resize( (size.width+2)*(size.height+2) + cn*mapstep*3*sizeof(s32) ); + mag_buf[0] = (s32*)&buffer[0]; + mag_buf[1] = mag_buf[0] + mapstep*cn; + mag_buf[2] = mag_buf[1] + mapstep*cn; + memset(mag_buf[0], 0, /* cn* */mapstep * sizeof(s32)); + + map = (u8*)(mag_buf[2] + mapstep*cn); + memset(map, 1, mapstep); + memset(map + mapstep*(size.height + 1), 1, mapstep); + } + inline void firstRow(const Size2D &size, s32 cn, + const u8 *, ptrdiff_t, + s16* dxBase, ptrdiff_t dxStride, + s16* dyBase, ptrdiff_t dyStride, + s32** mag_buf) + { + s32* _norm = mag_buf[1] + 1; + + s16* _dx = internal::getRowPtr(dxBase, dxStride, 0); + s16* _dy = internal::getRowPtr(dyBase, dyStride, 0); + + NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm); + + if(cn > 1) + { + for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn) + { + size_t maxIdx = jn; + for(s32 k = 1; k < cn; ++k) + if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k; + _norm[j] = _norm[maxIdx]; + _dx[j] = _dx[maxIdx]; + _dy[j] = _dy[maxIdx]; + } + } + + _norm[-1] = _norm[size.width] = 0; + } + inline void nextRow(const Size2D &size, s32 cn, + const u8 *, ptrdiff_t, + s16* dxBase, ptrdiff_t dxStride, + s16* dyBase, ptrdiff_t dyStride, + const ptrdiff_t &mapstep, s32** mag_buf, + size_t i, const s16* &_x, const s16* &_y) + { + s32* _norm = mag_buf[(i > 0) + 1] + 1; + if (i < size.height) + { + s16* _dx = internal::getRowPtr(dxBase, dxStride, i); + s16* _dy = internal::getRowPtr(dyBase, dyStride, i); + + NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm); + + if(cn > 1) + { + for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn) + { + size_t maxIdx = jn; + for(s32 k = 1; k < cn; ++k) + if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k; + _norm[j] = _norm[maxIdx]; + _dx[j] = _dx[maxIdx]; + _dy[j] = _dy[maxIdx]; + } + } + + _norm[-1] = _norm[size.width] = 0; + } + else + memset(_norm-1, 0, /* cn* */mapstep*sizeof(s32)); + + _x = internal::getRowPtr(dxBase, dxStride, i-1); + _y = internal::getRowPtr(dyBase, dyStride, i-1); + } +}; + +template <bool L2gradient, bool channels1> +inline void Canny3x3(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin) +{ + s32 low, high; + prepareThresh<L2gradient>(low_thresh, high_thresh, low, high); + + ptrdiff_t mapstep; + s32* mag_buf[3]; + u8* map; + _normEstimator<L2gradient, channels1> normEstimator(size, cn, borderMargin, mapstep, mag_buf, map); + + size_t maxsize = std::max<size_t>( 1u << 10, size.width * size.height / 10 ); + std::vector<u8*> stack( maxsize ); + u8 **stack_top = &stack[0]; + u8 **stack_bottom = &stack[0]; + + /* sector numbers + (Top-Left Origin) + + 1 2 3 + * * * + * * * + 0*******0 + * * * + * * * + 3 2 1 + */ + + #define CANNY_PUSH(d) *(d) = u8(2), *stack_top++ = (d) + #define CANNY_POP(d) (d) = *--stack_top + + //i == 0 + normEstimator.firstRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mag_buf); + // calculate magnitude and angle of gradient, perform non-maxima suppression.
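+ // sector test in fixed point, CANNY_SHIFT = 15: TG22 = round(tan(22.5 deg) * 2^15) = 13573, + // so |dy|*2^15 < |dx|*TG22 means the gradient lies within 22.5 deg of horizontal, while + // tg67x = tg22x + |dx|*2^16 = |dx|*2^15*(tan(22.5 deg) + 2) = |dx|*2^15*tan(67.5 deg) + // marks the boundary to the vertical sector; everything in between is diagonal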
+ // fill the map with one of the following values: + // 0 - the pixel might belong to an edge + // 1 - the pixel can not belong to an edge + // 2 - the pixel does belong to an edge + for (size_t i = 1; i <= size.height; i++) + { + const s16 *_x, *_y; + normEstimator.nextRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mapstep, mag_buf, i, _x, _y); + + u8* _map = map + mapstep*i + 1; + _map[-1] = _map[size.width] = 1; + + s32* _mag = mag_buf[1] + 1; // take the central row + ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1]; + ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1]; + + if ((stack_top - stack_bottom) + size.width > maxsize) + { + ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom); + maxsize = maxsize * 3/2; + stack.resize(maxsize); + stack_bottom = &stack[0]; + stack_top = stack_bottom + sz; + } + + s32 prev_flag = 0; + for (ptrdiff_t j = 0; j < (ptrdiff_t)size.width; j++) + { + #define CANNY_SHIFT 15 + const s32 TG22 = (s32)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5); + + s32 m = _mag[j]; + + if (m > low) + { + s32 xs = _x[j]; + s32 ys = _y[j]; + s32 x = abs(xs); + s32 y = abs(ys) << CANNY_SHIFT; + + s32 tg22x = x * TG22; + + if (y < tg22x) + { + if (m > _mag[j-1] && m >= _mag[j+1]) goto __push; + } + else + { + s32 tg67x = tg22x + (x << (CANNY_SHIFT+1)); + if (y > tg67x) + { + if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __push; + } + else + { + s32 s = (xs ^ ys) < 0 ? -1 : 1; + if(m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __push; + } + } + } + prev_flag = 0; + _map[j] = u8(1); + continue; + __push: + if (!prev_flag && m > high && _map[j-mapstep] != 2) + { + CANNY_PUSH(_map + j); + prev_flag = 1; + } + else + _map[j] = 0; + } + + // scroll the ring buffer + _mag = mag_buf[0]; + mag_buf[0] = mag_buf[1]; + mag_buf[1] = mag_buf[2]; + mag_buf[2] = _mag; + } + + // now track the edges (hysteresis thresholding) + while (stack_top > stack_bottom) + { + u8* m; + if ((size_t)(stack_top - stack_bottom) + 8u > maxsize) + { + ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom); + maxsize = maxsize * 3/2; + stack.resize(maxsize); + stack_bottom = &stack[0]; + stack_top = stack_bottom + sz; + } + + CANNY_POP(m); + + if (!m[-1]) CANNY_PUSH(m - 1); + if (!m[1]) CANNY_PUSH(m + 1); + if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1); + if (!m[-mapstep]) CANNY_PUSH(m - mapstep); + if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1); + if (!m[mapstep-1]) CANNY_PUSH(m + mapstep - 1); + if (!m[mapstep]) CANNY_PUSH(m + mapstep); + if (!m[mapstep+1]) CANNY_PUSH(m + mapstep + 1); + } + + // the final pass, form the final image + uint8x16_t v2 = vmovq_n_u8(2); + const u8* ptrmap = map + mapstep + 1; + for (size_t i = 0; i < size.height; i++, ptrmap += mapstep) + { + u8* _dst = internal::getRowPtr(dstBase, dstStride, i); + ptrdiff_t j = 0; + for (; j < (ptrdiff_t)size.width - 16; j += 16) + { + internal::prefetch(ptrmap); + uint8x16_t vmap = vld1q_u8(ptrmap + j); + uint8x16_t vdst = vceqq_u8(vmap, v2); + vst1q_u8(_dst+j, vdst); + } + for (; j < (ptrdiff_t)size.width; j++) + _dst[j] = (u8)-(ptrmap[j] >> 1); + } +} + +} // namespace +#endif + +bool isCanny3x3Supported(const Size2D &size) +{ + return isSupportedConfiguration() && + size.height >= 2 && size.width >= 9; +} + +void Canny3x3L1(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin) +{ + internal::assertSupportedConfiguration(isCanny3x3Supported(size)); +#ifdef CAROTENE_NEON + Canny3x3<false, true>(size, 1, + srcBase, srcStride, + dstBase, dstStride, + NULL, 0, +
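+ // dx/dy buffers are not used on this path: with channels1 == true the Sobel + // derivatives are computed internally, row by row, by the _normEstimator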
NULL, 0, + low_thresh, high_thresh, + borderMargin); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)low_thresh; + (void)high_thresh; + (void)borderMargin; +#endif +} + +void Canny3x3L2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin) +{ + internal::assertSupportedConfiguration(isCanny3x3Supported(size)); +#ifdef CAROTENE_NEON + Canny3x3<true, true>(size, 1, + srcBase, srcStride, + dstBase, dstStride, + NULL, 0, + NULL, 0, + low_thresh, high_thresh, + borderMargin); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)low_thresh; + (void)high_thresh; + (void)borderMargin; +#endif +} + +void Canny3x3L1(const Size2D &size, s32 cn, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Canny3x3<false, false>(size, cn, + NULL, 0, + dstBase, dstStride, + dxBase, dxStride, + dyBase, dyStride, + low_thresh, high_thresh, + Margin()); +#else + (void)size; + (void)cn; + (void)dstBase; + (void)dstStride; + (void)dxBase; + (void)dxStride; + (void)dyBase; + (void)dyStride; + (void)low_thresh; + (void)high_thresh; +#endif +} + +void Canny3x3L2(const Size2D &size, s32 cn, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Canny3x3<true, false>(size, cn, + NULL, 0, + dstBase, dstStride, + dxBase, dxStride, + dyBase, dyStride, + low_thresh, high_thresh, + Margin()); +#else + (void)size; + (void)cn; + (void)dstBase; + (void)dstStride; + (void)dxBase; + (void)dxStride; + (void)dyBase; + (void)dyStride; + (void)low_thresh; + (void)high_thresh; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/channel_extract.cpp b/3rdparty/carotene/src/channel_extract.cpp new file mode 100644 index 0000000000..fda8f6e153 --- /dev/null +++ b/3rdparty/carotene/src/channel_extract.cpp @@ -0,0 +1,486 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +void extract2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; dj < roiw32; sj += 64, dj += 32) + { + internal::prefetch(src + sj); + + uint8x16x2_t v_src = vld2q_u8(src + sj); + vst1q_u8(dst + dj, v_src.val[coi]); + + v_src = vld2q_u8(src + sj + 32); + vst1q_u8(dst + dj + 16, v_src.val[coi]); + } +#endif + + for (; dj < roiw8; sj += 16, dj += 8) + { + uint8x8x2_t v_src = vld2_u8(src + sj); + vst1_u8(dst + dj, v_src.val[coi]); + } + + for (; dj < size.width; sj += 2, ++dj) + { + dst[dj] = src[sj + coi]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)coi; +#endif +} + +void extract3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; dj < roiw32; sj += 96, dj += 32) + { + internal::prefetch(src + sj); + + uint8x16x3_t v_src = vld3q_u8(src + sj); + vst1q_u8(dst + dj, v_src.val[coi]); + + v_src = vld3q_u8(src + sj + 48); + vst1q_u8(dst + dj + 16, v_src.val[coi]); + } +#endif + + for (; dj < roiw8; sj += 24, dj += 8) + { + uint8x8x3_t v_src = vld3_u8(src + sj); + vst1_u8(dst + dj, v_src.val[coi]); + } + + for (; dj < size.width; sj += 3, ++dj) + { + dst[dj] = src[sj + coi]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)coi; +#endif +} + +void extract4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
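+ // each vld4_u8 iteration consumes 8 destination pixels (32 interleaved source bytes), + // so the vector loops stop while a full block still remains and the scalar loop + // below handles the last 0..7 pixels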
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; dj < roiw32; sj += 128, dj += 32) + { + internal::prefetch(src + sj); + + uint8x16x4_t v_src = vld4q_u8(src + sj); + vst1q_u8(dst + dj, v_src.val[coi]); + + v_src = vld4q_u8(src + sj + 64); + vst1q_u8(dst + dj + 16, v_src.val[coi]); + } +#endif + + for (; dj < roiw8; sj += 32, dj += 8) + { + uint8x8x4_t v_src = vld4_u8(src + sj); + vst1_u8(dst + dj, v_src.val[coi]); + } + + for (; dj < size.width; sj += 4, ++dj) + { + dst[dj] = src[sj + coi]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)coi; +#endif +} + +#define FILL_LINES2(macro,type) \ + macro##_LINE(type,0) \ + macro##_LINE(type,1) +#define FILL_LINES3(macro,type) \ + FILL_LINES2(macro,type) \ + macro##_LINE(type,2) +#define FILL_LINES4(macro,type) \ + FILL_LINES3(macro,type) \ + macro##_LINE(type,3) + +#define FARG_LINE(type, n) , type * dst##n##Base, ptrdiff_t dst##n##Stride + +#ifdef CAROTENE_NEON + +#define VROW_LINE(type, n) type * dst##n = internal::getRowPtr(dst##n##Base, dst##n##Stride, i); +#define VST1Q_LINE(type, n) vst1q_##type(dst##n + dj, v_src.val[n]); +#define VST1_LINE(type, n) vst1_##type(dst##n + dj, v_src.val[n]); +#define SST_LINE(type, n) dst##n[dj] = src[sj + n]; + +#define MUL2(val) (val << 1) +#define MUL3(val) (MUL2(val) + val) +#define MUL4(val) (val << 2) + +#define CONTDST2 srcStride == dst0Stride && \ + srcStride == dst1Stride && +#define CONTDST3 srcStride == dst0Stride && \ + srcStride == dst1Stride && \ + srcStride == dst2Stride && +#define CONTDST4 srcStride == dst0Stride && \ + srcStride == dst1Stride && \ + srcStride == dst2Stride && \ + srcStride == dst3Stride && + +#if __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define SPLIT_ASM2(sgn, bits) __asm__ ( \ + "vld2." #bits " {d0, d2}, [%[in0]] \n\t" \ + "vld2." #bits " {d1, d3}, [%[in1]] \n\t" \ + "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \ + "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \ + : \ + : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3" \ + ); +#define SPLIT_ASM3(sgn, bits) __asm__ ( \ + "vld3." #bits " {d0, d2, d4}, [%[in0]] \n\t" \ + "vld3." #bits " {d1, d3, d5}, [%[in1]] \n\t" \ + "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \ + "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \ + "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \ + : \ + : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5" \ + ); +#define SPLIT_ASM4(sgn, bits) __asm__ ( \ + "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \ + "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \ + "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \ + "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \ + "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \ + "vst1." 
#bits " {d6-d7}, [%[out3]] \n\t" \ + : \ + : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5","d6","d7" \ + ); + +#define SPLIT_QUAD(sgn, bits, n) { \ + internal::prefetch(src + sj); \ + SPLIT_ASM##n(sgn, bits) \ + } + +#else + +#define SPLIT_QUAD(sgn, bits, n) { \ + internal::prefetch(src + sj); \ + vec128 v_src = vld##n##q_##sgn##bits(src + sj); \ + FILL_LINES##n(VST1Q, sgn##bits) \ + } + +#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride \ + FILL_LINES##n(FARG, sgn##bits) ) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTDST##n \ + dst0Stride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec128 vec128; \ + size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \ + typedef internal::VecTraits::vec64 vec64; \ + size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \ + FILL_LINES##n(VROW, sgn##bits) \ + size_t sj = 0u, dj = 0u; \ + \ + for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits)) \ + SPLIT_QUAD(sgn, bits, n) \ + \ + if (dj < roiw8) \ + { \ + vec64 v_src = vld##n##_##sgn##bits(src + sj); \ + FILL_LINES##n(VST1, sgn##bits) \ + sj += MUL##n(8)/sizeof(sgn##bits); \ + dj += 8/sizeof(sgn##bits); \ + } \ + \ + for (; dj < size.width; sj += n, ++dj) \ + { \ + FILL_LINES##n(SST, sgn##bits) \ + } \ + } \ +} + +#define SPLIT64(sgn,n) void split##n(const Size2D &_size, \ + const sgn##64 * srcBase, ptrdiff_t srcStride \ + FILL_LINES##n(FARG, sgn##64) ) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTDST##n \ + dst0Stride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec64 vec64; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i); \ + FILL_LINES##n(VROW, sgn##64) \ + size_t sj = 0u, dj = 0u; \ + \ + for (; dj < size.width; sj += n, ++dj) \ + { \ + vec64 v_src = vld##n##_##sgn##64(src + sj); \ + FILL_LINES##n(VST1, sgn##64) \ + } \ + } \ +} + +#if __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define ALPHA_QUAD(sgn, bits) { \ + internal::prefetch(src + sj); \ + __asm__ ( \ + "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \ + "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \ + "vst3." #bits " {d0, d2, d4}, [%[out3_1]] \n\t" \ + "vst3." #bits " {d1, d3, d5}, [%[out3_2]] \n\t" \ + "vst1." 
#bits " {d6-d7}, [%[out1]] \n\t" \ + : \ + : [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + 32/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5","d6","d7" \ + ); \ + } + +#else + +#define ALPHA_QUAD(sgn, bits) { \ + internal::prefetch(src + sj); \ + union { vec128_4 v4; vec128_3 v3; } vals; \ + vals.v4 = vld4q_##sgn##bits(src + sj); \ + vst3q_##sgn##bits(dst3 + d3j, vals.v3); \ + vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \ + } + +#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride, \ + sgn##bits * dst3Base, ptrdiff_t dst3Stride, \ + sgn##bits * dst1Base, ptrdiff_t dst1Stride) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dst3Stride && \ + srcStride == dst1Stride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec128 vec128_4; \ + typedef internal::VecTraits::vec128 vec128_3; \ + size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \ + typedef internal::VecTraits::vec64 vec64_4; \ + typedef internal::VecTraits::vec64 vec64_3; \ + size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \ + sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i); \ + sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i); \ + size_t sj = 0u, d3j = 0u, d1j = 0u; \ + \ + for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits), \ + d1j += 16/sizeof(sgn##bits)) \ + ALPHA_QUAD(sgn, bits) \ + \ + if (d1j < roiw8) \ + { \ + union { vec64_4 v4; vec64_3 v3; } vals; \ + vals.v4 = vld4_##sgn##bits(src + sj); \ + vst3_u8(dst3 + d3j, vals.v3); \ + vst1_u8(dst1 + d1j, vals.v4.val[3]); \ + sj += MUL4(8)/sizeof(sgn##bits); \ + d3j += MUL3(8)/sizeof(sgn##bits); \ + d1j += 8/sizeof(sgn##bits); \ + } \ + \ + for (; d1j < size.width; sj += 4, d3j += 3, ++d1j) \ + { \ + dst3[d3j+0] = src[sj + 0]; \ + dst3[d3j+1] = src[sj + 1]; \ + dst3[d3j+2] = src[sj + 2]; \ + dst1[d1j] = src[sj + 3]; \ + } \ + } \ +} + +#else + +#define VOID_LINE(type, n) (void)dst##n##Base; (void)dst##n##Stride; + +#define SPLIT(sgn,bits,n) void split##n(const Size2D &size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride \ + FILL_LINES##n(FARG, sgn##bits) ) \ +{ \ + internal::assertSupportedConfiguration(); \ + (void)size; \ + (void)srcBase; \ + (void)srcStride; \ + FILL_LINES##n(VOID, sgn##bits) \ +} + +#define SPLIT64(sgn,n) SPLIT(sgn,64,n) + +#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride, \ + sgn##bits * dst3Base, ptrdiff_t dst3Stride, \ + sgn##bits * dst1Base, ptrdiff_t dst1Stride) \ +{ \ + internal::assertSupportedConfiguration(); \ + (void)size; \ + (void)srcBase; \ + (void)srcStride; \ + (void)dst3Base; \ + (void)dst3Stride; \ + (void)dst1Base; \ + (void)dst1Stride; \ +} + +#endif //CAROTENE_NEON + +SPLIT(u, 8,2) +SPLIT(u, 8,3) +SPLIT(u, 8,4) +SPLIT(u,16,2) +SPLIT(u,16,3) +SPLIT(u,16,4) +SPLIT(s,32,2) +SPLIT(s,32,3) +SPLIT(s,32,4) + +SPLIT64(s, 2) +SPLIT64(s, 3) +SPLIT64(s, 4) + +SPLIT4ALPHA(u,8) + +} // namespace CAROTENE_NS diff --git 
a/3rdparty/carotene/src/channels_combine.cpp b/3rdparty/carotene/src/channels_combine.cpp new file mode 100644 index 0000000000..32b71470e2 --- /dev/null +++ b/3rdparty/carotene/src/channels_combine.cpp @@ -0,0 +1,389 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#define FILL_LINES2(macro,type) \ + macro##_LINE(type,0) \ + macro##_LINE(type,1) +#define FILL_LINES3(macro,type) \ + FILL_LINES2(macro,type) \ + macro##_LINE(type,2) +#define FILL_LINES4(macro,type) \ + FILL_LINES3(macro,type) \ + macro##_LINE(type,3) + +#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride + +#ifdef CAROTENE_NEON + +#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i); +#define PREF_LINE(type, n) internal::prefetch(src##n + sj); +#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj); +#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj); +#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj); +#define SLD_LINE(type, n) dst[dj + n] = src##n[sj]; + +#define MUL2(val) (val << 1) +#define MUL3(val) (MUL2(val) + val) +#define MUL4(val) (val << 2) + +#define CONTSRC2 dstStride == src0Stride && \ + dstStride == src1Stride && +#define CONTSRC3 dstStride == src0Stride && \ + dstStride == src1Stride && \ + dstStride == src2Stride && +#define CONTSRC4 dstStride == src0Stride && \ + dstStride == src1Stride && \ + dstStride == src2Stride && \ + dstStride == src3Stride && + +#if __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define MERGE_ASM2(sgn, bits) __asm__ ( \ + "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ + "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ + "vst2." #bits " {d0, d2}, [%[out0]] \n\t" \ + "vst2." #bits " {d1, d3}, [%[out1]] \n\t" \ + : \ + : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \ + [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3" \ + ); +#define MERGE_ASM3(sgn, bits) __asm__ ( \ + "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ + "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ + "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \ + "vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \ + "vst3." #bits " {d1, d3, d5}, [%[out1]] \n\t" \ + : \ + : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \ + [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5" \ + ); +#define MERGE_ASM4(sgn, bits) __asm__ ( \ + "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ + "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ + "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \ + "vld1." #bits " {d6-d7}, [%[in3]] \n\t" \ + "vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \ + "vst4." 
#bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \ + : \ + : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \ + [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5","d6","d7" \ + ); + +#define MERGE_QUAD(sgn, bits, n) { \ + FILL_LINES##n(PREF, sgn##bits) \ + MERGE_ASM##n(sgn, bits) \ + } + +#else + +#define MERGE_QUAD(sgn, bits, n) { \ + vec128 v_dst; \ + /*FILL_LINES##n(PREF, sgn##bits) \ + FILL_LINES##n(VLD1Q, sgn##bits)*/ \ + FILL_LINES##n(PRLD, sgn##bits) \ + vst##n##q_##sgn##bits(dst + dj, v_dst); \ + } + +#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \ + FILL_LINES##n(FARG, sgn##bits), \ + sgn##bits * dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTSRC##n \ + dstStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec128 vec128; \ + size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \ + typedef internal::VecTraits::vec64 vec64; \ + size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + FILL_LINES##n(VROW, sgn##bits) \ + sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \ + size_t sj = 0u, dj = 0u; \ + \ + for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \ + MERGE_QUAD(sgn, bits, n) \ + \ + if ( sj < roiw8 ) \ + { \ + vec64 v_dst; \ + FILL_LINES##n(VLD1, sgn##bits) \ + vst##n##_##sgn##bits(dst + dj, v_dst); \ + sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \ + } \ + \ + for (; sj < size.width; ++sj, dj += n) \ + { \ + FILL_LINES##n(SLD, sgn##bits) \ + } \ + } \ +} + +#define COMBINE64(sgn,n) void combine##n(const Size2D &_size \ + FILL_LINES##n(FARG, sgn##64), \ + sgn##64 * dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTSRC##n \ + dstStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec64 vec64; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + FILL_LINES##n(VROW, sgn##64) \ + sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \ + size_t sj = 0u, dj = 0u; \ + \ + for (; sj < size.width; ++sj, dj += n) \ + { \ + vec64 v_dst; \ + FILL_LINES##n(VLD1, sgn##64) \ + vst##n##_##sgn##64(dst + dj, v_dst); \ + /*FILL_LINES##n(SLD, sgn##64)*/ \ + } \ + } \ +} + +#else + +#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride; + +#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \ + FILL_LINES##n(FARG, sgn##bits), \ + sgn##bits * dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + (void)size; \ + FILL_LINES##n(VOID, sgn##bits) \ + (void)dstBase; \ + (void)dstStride; \ +} +#define COMBINE64(sgn,n) COMBINE(sgn,64,n) + +#endif //CAROTENE_NEON + +COMBINE(u, 8,2) +COMBINE(u, 8,3) +COMBINE(u, 8,4) +COMBINE(u,16,2) +COMBINE(u,16,3) +COMBINE(u,16,4) +COMBINE(s,32,2) +COMBINE(s,32,3) +COMBINE(s,32,4) +COMBINE64(s, 2) +COMBINE64(s, 3) +COMBINE64(s, 4) + +void combineYUYV(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride) 
+{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; i += 1) + { + const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i); + const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i); + const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t syj = 0u, sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; sj < roiw32; sj += 32, syj += 64, dj += 128) + { + internal::prefetch(srcy + syj); + internal::prefetch(srcu + sj); + internal::prefetch(srcv + sj); + + uint8x16x2_t v_y = vld2q_u8(srcy + syj); + uint8x16x4_t v_dst; + v_dst.val[0] = v_y.val[0]; + v_dst.val[1] = vld1q_u8(srcu + sj); + v_dst.val[2] = v_y.val[1]; + v_dst.val[3] = vld1q_u8(srcv + sj); + vst4q_u8(dst + dj, v_dst); + + v_y = vld2q_u8(srcy + syj + 32); + v_dst.val[0] = v_y.val[0]; + v_dst.val[1] = vld1q_u8(srcu + sj + 16); + v_dst.val[2] = v_y.val[1]; + v_dst.val[3] = vld1q_u8(srcv + sj + 16); + vst4q_u8(dst + dj + 64, v_dst); + } +#endif + + for (; sj < roiw8; sj += 8, syj += 16, dj += 32) + { + uint8x8x2_t v_y = vld2_u8(srcy + syj); + uint8x8x4_t v_dst; + v_dst.val[0] = v_y.val[0]; + v_dst.val[1] = vld1_u8(srcu + sj); + v_dst.val[2] = v_y.val[1]; + v_dst.val[3] = vld1_u8(srcv + sj); + vst4_u8(dst + dj, v_dst); + } + + for (; sj < size.width; ++sj, syj += 2, dj += 4) + { + dst[dj] = srcy[syj]; + dst[dj + 1] = srcu[sj]; + dst[dj + 2] = srcy[syj + 1]; + dst[dj + 3] = srcv[sj]; + } + } +#else + (void)size; + (void)srcyBase; + (void)srcyStride; + (void)srcuBase; + (void)srcuStride; + (void)srcvBase; + (void)srcvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void combineUYVY(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
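+ // Note (added commentary, not part of the original patch): the roiw
+ // bounds are computed with an explicit >= guard so the unsigned
+ // subtraction cannot wrap when a row is narrower than one vector.
+ // With roiw8 = width - 7, the loop condition sj < roiw8 guarantees
+ // sj + 8 <= width, so a full 8-lane load/store never runs past the row;
+ // roiw32 plays the same role for the 32-pixel main loop.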
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i); + const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i); + const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t syj = 0u, sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; sj < roiw32; sj += 32, syj += 64, dj += 128) + { + internal::prefetch(srcy + syj); + internal::prefetch(srcu + sj); + internal::prefetch(srcv + sj); + + uint8x16x2_t v_y = vld2q_u8(srcy + syj); + uint8x16x4_t v_dst; + v_dst.val[0] = vld1q_u8(srcu + sj); + v_dst.val[1] = v_y.val[0]; + v_dst.val[2] = vld1q_u8(srcv + sj); + v_dst.val[3] = v_y.val[1]; + vst4q_u8(dst + dj, v_dst); + + v_y = vld2q_u8(srcy + syj + 32); + v_dst.val[0] = vld1q_u8(srcu + sj + 16); + v_dst.val[1] = v_y.val[0]; + v_dst.val[2] = vld1q_u8(srcv + sj + 16); + v_dst.val[3] = v_y.val[1]; + vst4q_u8(dst + dj + 64, v_dst); + } +#endif + + for (; sj < roiw8; sj += 8, syj += 16, dj += 32) + { + uint8x8x2_t v_y = vld2_u8(srcy + syj); + uint8x8x4_t v_dst; + v_dst.val[0] = vld1_u8(srcu + sj); + v_dst.val[1] = v_y.val[0]; + v_dst.val[2] = vld1_u8(srcv + sj); + v_dst.val[3] = v_y.val[1]; + vst4_u8(dst + dj, v_dst); + } + + for (; sj < size.width; ++sj, syj += 2, dj += 4) + { + dst[dj] = srcu[sj]; + dst[dj + 1] = srcy[syj]; + dst[dj + 2] = srcv[sj]; + dst[dj + 3] = srcy[syj + 1]; + } + } +#else + (void)size; + (void)srcyBase; + (void)srcyStride; + (void)srcuBase; + (void)srcuStride; + (void)srcvBase; + (void)srcvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/cmp.cpp b/3rdparty/carotene/src/cmp.cpp new file mode 100644 index 0000000000..eda121985e --- /dev/null +++ b/3rdparty/carotene/src/cmp.cpp @@ -0,0 +1,340 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); } +inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); } +inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); } + +template <typename Op, int elsize> struct vtail +{ + static inline void compare(const typename Op::type * src0, const typename Op::type * src1, + u8 * dst, const Op & op, + size_t &x, size_t width) + { + // do nothing since there cannot be enough data + (void)src0; + (void)src1; + (void)dst; + (void)op; + (void)x; + (void)width; + } +}; +template <typename Op> struct vtail<Op, 2> +{ + static inline void compare(const typename Op::type * src0, const typename Op::type * src1, + u8 * dst, const Op & op, + size_t &x, size_t width) + { + typedef typename Op::type type; + typedef typename internal::VecTraits<type>::vec128 vec128; + typedef typename internal::VecTraits<type>::unsign::vec128 uvec128; + // There are no more than 15 elements in the tail, so we can handle an 8-element vector only once + if( x + 8 < width) + { + vec128 v_src0, v_src1; + uvec128 v_dst; + + v_src0 = internal::vld1q(src0 + x); + v_src1 = internal::vld1q(src1 + x); + op(v_src0, v_src1, v_dst); + internal::vst1(dst + x, internal::vmovn(v_dst)); + x+=8; + } + } +}; +template <typename Op> struct vtail<Op, 1> +{ + static inline void compare(const typename Op::type * src0, const typename Op::type * src1, + u8 * dst, const Op & op, + size_t &x, size_t width) + { + typedef typename Op::type type; + typedef typename internal::VecTraits<type>::vec128 vec128; + typedef typename internal::VecTraits<type>::unsign::vec128 uvec128; + typedef typename internal::VecTraits<type>::vec64 vec64; + typedef typename internal::VecTraits<type>::unsign::vec64 uvec64; + // There are no more than 31 elements in the tail, so we can handle 16+8, 16, or 8 elements at once + if( x + 16 < width) + { + vec128 v_src0, v_src1; + uvec128 v_dst; + + v_src0 = internal::vld1q(src0 + x); + v_src1 = internal::vld1q(src1 + x); + op(v_src0, v_src1, v_dst); + internal::vst1q(dst + x, v_dst); + x+=16; + } + if( x + 8 < width) + { + vec64 v_src0, v_src1; + uvec64 v_dst; + + v_src0 = internal::vld1(src0 + x); + v_src1 = internal::vld1(src1 + x); + op(v_src0, v_src1, v_dst); + internal::vst1(dst + x, v_dst); + x+=8; + } + } +}; + +template <typename Op> +void vcompare(Size2D size, + const typename Op::type * src0Base, ptrdiff_t src0Stride, + const typename Op::type * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, const Op & op) +{ + typedef typename Op::type type; + typedef typename internal::VecTraits<type>::vec128 vec128; + typedef typename internal::VecTraits<type>::unsign::vec128 uvec128; + + if (src0Stride == src1Stride && src0Stride == dstStride && + src0Stride == (ptrdiff_t)(size.width * sizeof(type))) + { + size.width *= size.height; + size.height = 1; + } + + const u32
step_base = 32 / sizeof(type); + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; + + for (size_t y = 0; y < size.height; ++y) + { + const type * src0 = internal::getRowPtr(src0Base, src0Stride, y); + const type * src1 = internal::getRowPtr(src1Base, src1Stride, y); + u8 * dst = internal::getRowPtr(dstBase, dstStride, y); + size_t x = 0; + + for( ; x < roiw_base; x += step_base ) + { + internal::prefetch(src0 + x); + internal::prefetch(src1 + x); + + vec128 v_src00 = internal::vld1q(src0 + x), v_src01 = internal::vld1q(src0 + x + 16 / sizeof(type)); + vec128 v_src10 = internal::vld1q(src1 + x), v_src11 = internal::vld1q(src1 + x + 16 / sizeof(type)); + uvec128 v_dst0; + uvec128 v_dst1; + + op(v_src00, v_src10, v_dst0); + op(v_src01, v_src11, v_dst1); + + vnst(dst + x, v_dst0, v_dst1); + } + + vtail<Op, sizeof(type)>::compare(src0, src1, dst, op, x, size.width); + + for (; x < size.width; ++x) + { + op(src0 + x, src1 + x, dst + x); + } + } +} + +template <typename T> +struct OpCmpEQ +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::unsign::vec128 & v_dst) const + { + v_dst = internal::vceqq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::unsign::vec64 & v_dst) const + { + v_dst = internal::vceq(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, u8 * dst) const + { + dst[0] = src0[0] == src1[0] ? 255 : 0; + } +}; + +template <typename T> +struct OpCmpNE +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::unsign::vec128 & v_dst) const + { + v_dst = internal::vmvnq(internal::vceqq(v_src0, v_src1)); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::unsign::vec64 & v_dst) const + { + v_dst = internal::vmvn(internal::vceq(v_src0, v_src1)); + } + + void operator() (const T * src0, const T * src1, u8 * dst) const + { + dst[0] = src0[0] == src1[0] ? 0 : 255; + } +}; + +template <typename T> +struct OpCmpGT +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::unsign::vec128 & v_dst) const + { + v_dst = internal::vcgtq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::unsign::vec64 & v_dst) const + { + v_dst = internal::vcgt(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, u8 * dst) const + { + dst[0] = src0[0] > src1[0] ?
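+ // Note (added commentary, not part of the original patch): the
+ // scalar tails produce 255/0 so that they match the all-ones /
+ // all-zeros u8 lanes generated by the NEON compare intrinsics
+ // (vceq/vcgt/vcge) once the wider masks are narrowed by vnst.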
255 : 0; + } +}; + +template <typename T> +struct OpCmpGE +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::unsign::vec128 & v_dst) const + { + v_dst = internal::vcgeq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::unsign::vec64 & v_dst) const + { + v_dst = internal::vcge(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, u8 * dst) const + { + dst[0] = src0[0] >= src1[0] ? 255 : 0; + } +}; + +} + +#define IMPL_CMPOP(op, type) \ +void cmp##op(const Size2D &size, \ + const type * src0Base, ptrdiff_t src0Stride, \ + const type * src1Base, ptrdiff_t src1Stride, \ + u8 *dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + vcompare(size, \ + src0Base, src0Stride, \ + src1Base, src1Stride, \ + dstBase, dstStride, \ + OpCmp##op<type>()); \ +} + +#else + +#define IMPL_CMPOP(op, type) \ +void cmp##op(const Size2D &size, \ + const type * src0Base, ptrdiff_t src0Stride, \ + const type * src1Base, ptrdiff_t src1Stride, \ + u8 *dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + (void)size; \ + (void)src0Base; \ + (void)src0Stride; \ + (void)src1Base; \ + (void)src1Stride; \ + (void)dstBase; \ + (void)dstStride; \ +} + +#endif + +IMPL_CMPOP(EQ, u8) +IMPL_CMPOP(EQ, s8) +IMPL_CMPOP(EQ, u16) +IMPL_CMPOP(EQ, s16) +IMPL_CMPOP(EQ, u32) +IMPL_CMPOP(EQ, s32) +IMPL_CMPOP(EQ, f32) + +IMPL_CMPOP(NE, u8) +IMPL_CMPOP(NE, s8) +IMPL_CMPOP(NE, u16) +IMPL_CMPOP(NE, s16) +IMPL_CMPOP(NE, u32) +IMPL_CMPOP(NE, s32) +IMPL_CMPOP(NE, f32) + +IMPL_CMPOP(GT, u8) +IMPL_CMPOP(GT, s8) +IMPL_CMPOP(GT, u16) +IMPL_CMPOP(GT, s16) +IMPL_CMPOP(GT, u32) +IMPL_CMPOP(GT, s32) +IMPL_CMPOP(GT, f32) + +IMPL_CMPOP(GE, u8) +IMPL_CMPOP(GE, s8) +IMPL_CMPOP(GE, u16) +IMPL_CMPOP(GE, s16) +IMPL_CMPOP(GE, u32) +IMPL_CMPOP(GE, s32) +IMPL_CMPOP(GE, f32) + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/colorconvert.cpp b/3rdparty/carotene/src/colorconvert.cpp new file mode 100644 index 0000000000..ea2db6043a --- /dev/null +++ b/3rdparty/carotene/src/colorconvert.cpp @@ -0,0 +1,2846 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include "saturate_cast.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +enum +{ + SHIFT = 14, + SHIFT_DELTA = 1 << (SHIFT - 1), + + R2Y_BT601 = 4899, + G2Y_BT601 = 9617, + B2Y_BT601 = 1868, + + R2Y_BT709 = 3483, + G2Y_BT709 = 11718, + B2Y_BT709 = 1183, +}; + +inline uint8x8_t convertToGray(const uint16x8_t & v_r, + const uint16x8_t & v_g, + const uint16x8_t & v_b, + const uint16x4_t & v_r2y, + const uint16x4_t & v_g2y, + const uint16x4_t & v_b2y) +{ + uint32x4_t v_dst0 = vmull_u16(vget_low_u16(v_g), v_g2y); + uint32x4_t v_dst1 = vmull_u16(vget_high_u16(v_g), v_g2y); + + v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_r), v_r2y); + v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_r), v_r2y); + + v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_b), v_b2y); + v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_b), v_b2y); + + uint8x8_t v_gray = vqmovn_u16(vcombine_u16(vrshrn_n_u32(v_dst0, SHIFT), + vrshrn_n_u32(v_dst1, SHIFT))); + + return v_gray; +} + +} // namespace + +#endif + +void rgb2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
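+ // Note (added commentary, not part of the original patch): the
+ // conversion weights are Q14 fixed point and sum exactly to
+ // 1 << SHIFT (4899 + 9617 + 1868 = 16384 for BT.601 and
+ // 3483 + 11718 + 1183 = 16384 for BT.709), so
+ //   gray = (R*R2Y + G*G2Y + B*B2Y + SHIFT_DELTA) >> SHIFT
+ // is a rounded weighted average that cannot exceed 255 for u8 inputs.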
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 24, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0-d2}, [%[in]] @RGB \n\t" + "vmovl.u8 q2, d0 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d2 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 48, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x3_t v_src0 = vld3q_u8(src + sj); + // 0 + uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_r = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x3_t v_src = vld3_u8(src + sj); + uint16x8_t v_r = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_b = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 24; dj += 8; + } +#endif + + for (; dj < size.width; sj += 3, dj++) + { + u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y; + dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 32, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0-d3}, [%[in]] @RGBA \n\t" + "vmovl.u8 q2, d0 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d2 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 64, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x4_t v_src0 = vld4q_u8(src + sj); + + // 0 + uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_r = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x4_t v_src = vld4_u8(src + sj); + uint16x8_t v_r = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_b = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 32; dj += 8; + } +#endif + + for (; dj < size.width; sj += 4, dj++) + { + u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y; + dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgr2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 24, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0-d2}, [%[in]] @BGR \n\t" + "vmovl.u8 q2, d2 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d0 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 48, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x3_t v_src0 = vld3q_u8(src + sj); + + // 0 + uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_b = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x3_t v_src = vld3_u8(src + sj); + uint16x8_t v_b = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_r = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 24; dj += 8; + } +#endif + + for (; dj < size.width; sj += 3, dj++) + { + u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y; + dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgrx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 32, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0-d3}, [%[in]] @BGRA \n\t" + "vmovl.u8 q2, d2 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d0 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 64, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x4_t v_src0 = vld4q_u8(src + sj); + + // 0 + uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_b = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x4_t v_src = vld4_u8(src + sj); + uint16x8_t v_b = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_r = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 32; dj += 8; + } +#endif + + for (; dj < size.width; sj += 4, dj++) + { + u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y; + dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gray2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + + for (; sj < roiw16; sj += 16, dj += 48) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d0-d1}, [%[in0]] \n\t" + "vmov.8 q1, q0 \n\t" + "vmov.8 q2, q0 \n\t" + "vmov.8 q3, q1 \n\t" + "vst3.8 {d2, d4, d6}, [%[out0]] \n\t" + "vst3.8 {d3, d5, d7}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), [out1] "r" (dst + dj + 24), + [in0] "r" (src + sj) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + uint8x16x3_t vRgb1; + vRgb1.val[0] = vld1q_u8(src + sj); + + vRgb1.val[1] = vRgb1.val[0]; + vRgb1.val[2] = vRgb1.val[0]; + + vst3q_u8(dst + dj, vRgb1); +#endif + } + + if (sj < roiw8) + { +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d0}, [%[in]] \n\t" + "vmov.8 d1, d0 \n\t" + "vmov.8 d2, d0 \n\t" + "vst3.8 {d0-d2}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj) + : "d0","d1","d2" + ); +#else + uint8x8x3_t vRgb2; + vRgb2.val[0] = vld1_u8(src + sj); + vRgb2.val[1] = vRgb2.val[0]; + vRgb2.val[2] = vRgb2.val[0]; + + vst3_u8(dst + dj, vRgb2); +#endif + sj += 8; dj += 24; + } + + for (; sj < size.width; sj++, dj += 3) + { + dst[dj+0] = src[sj]; + dst[dj+1] = src[sj]; + dst[dj+2] = src[sj]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gray2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register uint8x16_t vc255 asm ("q4") = vmovq_n_u8(255); +#else + uint8x16x4_t vRgba; + uint8x8x4_t vRgba2; + vRgba.val[3] = vmovq_n_u8(255); + vRgba2.val[3] = vget_low_u8(vRgba.val[3]); +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + + for (; sj < roiw16; sj += 16, dj += 64) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d0-d1}, [%[in0]] \n\t" + "vmov.8 q1, q0 \n\t" + "vmov.8 q2, q0 \n\t" + "vmov.8 q3, q1 \n\t" + "vst4.8 {d2, d4, d6, d8}, [%[out0]] \n\t" + "vst4.8 {d3, d5, d7, d9}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), [out1] "r" (dst + dj + 32), + [in0] "r" (src + sj), + "w" (vc255) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + vRgba.val[0] = vld1q_u8(src + sj); + + vRgba.val[1] = vRgba.val[0]; + vRgba.val[2] = vRgba.val[0]; + + vst4q_u8(dst + dj, vRgba); +#endif + } + + if (sj < roiw8) + { +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d5}, [%[in]] \n\t" + "vmov.8 d6, d5 \n\t" + "vmov.8 d7, d5 \n\t" + "vst4.8 {d5-d8}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (vc255) + : "d5","d6","d7" + ); +#else + vRgba2.val[0] = vld1_u8(src + sj); + vRgba2.val[1] = vRgba2.val[0]; + vRgba2.val[2] = vRgba2.val[0]; + + vst4_u8(dst + dj, vRgba2); +#endif + sj += 8; dj += 32; + } + + for (; sj < size.width; sj++, dj += 4) + { + dst[dj+0] = src[sj]; + dst[dj+1] = src[sj]; + dst[dj+2] = src[sj]; + dst[dj+3] = 255; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; +#if defined(__GNUC__) && defined(__arm__) + register uint8x8_t vc255_0 asm ("d3") = vmov_n_u8(255); +#else + size_t roiw16 = size.width >= 15 ? 
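+ // Note (added commentary, not part of the original patch): the unions
+ // below pun uint8x16x4_t / uint8x16x3_t so the alpha lane can be set
+ // once outside the loop (val[3] = 255); each iteration then performs a
+ // 3-channel deinterleaving load into v3 and a 4-channel store from v4,
+ // appending alpha without any per-pixel shuffling.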
size.width - 15 : 0; + union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0; + v_dst0.v4.val[3] = vdupq_n_u8(255); + union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst; + v_dst.v4.val[3] = vdup_n_u8(255); +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 24, dj += 32, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0, d1, d2}, [%[in0]] \n\t" + "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj), + "w" (vc255_0) + : "d0","d1","d2" + ); + } +#else + for (; j < roiw16; sj += 48, dj += 64, j += 16) + { + internal::prefetch(src + sj); + v_dst0.v3 = vld3q_u8(src + sj); + vst4q_u8(dst + dj, v_dst0.v4); + } + + if (j < roiw8) + { + v_dst.v3 = vld3_u8(src + sj); + vst4_u8(dst + dj, v_dst.v4); + sj += 24; dj += 32; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 3, dj += 4) + { + dst[dj] = src[sj]; + dst[dj + 1] = src[sj + 1]; + dst[dj + 2] = src[sj + 2]; + dst[dj + 3] = 255; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0; + union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t" + "vst3.8 {d0, d1, d2}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2","d3" + ); + } +#else + for (; j < roiw16; sj += 64, dj += 48, j += 16) + { + internal::prefetch(src + sj); + v_dst0.v4 = vld4q_u8(src + sj); + vst3q_u8(dst + dj, v_dst0.v3); + } + + if (j < roiw8) + { + v_dst.v4 = vld4_u8(src + sj); + vst3_u8(dst + dj, v_dst.v3); + sj += 32; dj += 24; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + dst[dj] = src[sj]; + dst[dj + 1] = src[sj + 1]; + dst[dj + 2] = src[sj + 2]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0, d1, d2}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst3.8 {d0, d1, d2}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2" + ); + } +#else + for (; j < roiw16; sj += 48, dj += 48, j += 16) + { + internal::prefetch(src + sj); + uint8x16x3_t vals0 = vld3q_u8(src + sj); + + std::swap(vals0.val[0], vals0.val[2]); + + vst3q_u8(dst + dj, vals0); + } + + if (j < roiw8) + { + uint8x8x3_t vals = vld3_u8(src + sj); + std::swap(vals.val[0], vals.val[2]); + vst3_u8(dst + dj, vals); + sj += 24; dj += 24; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + u8 b = src[sj + 2];//Handle src == dst case + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = b; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 32, dj += 32, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2","d3" + ); + } +#else + for (; j < roiw16; sj += 64, dj += 64, j += 16) + { + internal::prefetch(src + sj); + uint8x16x4_t vals0 = vld4q_u8(src + sj); + + std::swap(vals0.val[0], vals0.val[2]); + + vst4q_u8(dst + dj, vals0); + } + + if (j < roiw8) + { + uint8x8x4_t vals = vld4_u8(src + sj); + std::swap(vals.val[0], vals.val[2]); + vst4_u8(dst + dj, vals); + sj += 32; dj += 32; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 4, dj += 4) + { + u8 b = src[sj + 2];//Handle src == dst case + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = b; + dst[dj + 3] = src[sj + 3]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst3.8 {d0, d1, d2}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2","d3" + ); + } +#else + for (; j < roiw16; sj += 64, dj += 48, j += 16) + { + internal::prefetch(src + sj); + union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0; + vals0.v4 = vld4q_u8(src + sj); + std::swap(vals0.v3.val[0], vals0.v3.val[2]); + vst3q_u8(dst + dj, vals0.v3); + } + + if (j < roiw8) + { + union { uint8x8x4_t v4; uint8x8x3_t v3; } vals; + vals.v4 = vld4_u8(src + sj); + std::swap(vals.v3.val[0], vals.v3.val[2]); + vst3_u8(dst + dj, vals.v3); + sj += 32; dj += 24; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = src[sj + 2]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if defined(__GNUC__) && defined(__arm__) + register uint8x8_t vc255 asm ("d3") = vmov_n_u8(255); +#else + union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0; + vals0.v4.val[3] = vmovq_n_u8(255); + union { uint8x8x4_t v4; uint8x8x3_t v3; } vals8; + vals8.v4.val[3] = vmov_n_u8(255); +#endif + +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 24, dj += 32, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0, d1, d2}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj), + "w" (vc255) + : "d0","d1","d2" + ); + } +#else + for (; j < roiw16; sj += 48, dj += 64, j += 16) + { + internal::prefetch(src + sj); + vals0.v3 = vld3q_u8(src + sj); + std::swap(vals0.v4.val[0], vals0.v4.val[2]); + vst4q_u8(dst + dj, vals0.v4); + } + + if (j < roiw8) + { + vals8.v3 = vld3_u8(src + sj); + std::swap(vals8.v4.val[0], vals8.v4.val[2]); + vst4_u8(dst + dj, vals8.v4); + sj += 24; dj += 32; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 3, dj += 4) + { + dst[dj + 3] = 255; + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = src[sj + 2]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +namespace { + +#ifdef CAROTENE_NEON +inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const uint8x8_t vB, + const s32 hrange ) +{ + const s32 hsv_shift = 12; + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + + uint8x8_t vMin = vmin_u8(vR, vG); + uint8x8_t vMax = vmax_u8(vR, vG); + + uint16x8_t vR_u16 = vmovl_u8(vR); + uint16x8_t vG_u16 = vmovl_u8(vG); + + vMax = vmax_u8(vMax, vB); + vMin = vmin_u8(vMin, vB); + uint16x8_t vB_u16 = vmovl_u8(vB); + + uint16x8_t vDiff = vsubl_u8(vMax, vMin); + + uint16x8_t vV = vmovl_u8(vMax); + uint16x8_t vDiffx2 = vaddq_u16(vDiff, vDiff); + uint32x4_t vDiffL = vmovl_u16(vget_low_u16(vDiff)); + uint32x4_t vDiffH = vmovl_u16(vget_high_u16(vDiff)); + + uint16x8_t vVEqR = vceqq_u16(vR_u16, vV); + uint16x8_t vVEqG = vceqq_u16(vG_u16, vV); + + int16x8_t vG_B = vsubq_s16(vreinterpretq_s16_u16(vG_u16), vreinterpretq_s16_u16(vB_u16)); + uint16x8_t vInvR = vmvnq_u16(vVEqR); + int16x8_t vB_R = vsubq_s16(vreinterpretq_s16_u16(vB_u16), vreinterpretq_s16_u16(vR_u16)); + int16x8_t vR_G = vsubq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vG_u16)); + + uint16x8_t vMask2 = vandq_u16(vVEqG, vInvR); + vR_u16 = vandq_u16(vreinterpretq_u16_s16(vG_B), vVEqR); + int16x8_t vH2 = vaddq_s16(vB_R, vreinterpretq_s16_u16(vDiffx2)); + + vVEqR = vmvnq_u16(vVEqG); + vB_R = vaddq_s16(vreinterpretq_s16_u16(vDiffx2), vreinterpretq_s16_u16(vDiffx2)); + vG_B = vandq_s16(vreinterpretq_s16_u16(vInvR), vreinterpretq_s16_u16(vVEqR)); + vInvR = vandq_u16(vreinterpretq_u16_s16(vH2), vMask2); + vR_G = vaddq_s16(vR_G, vB_R); + int16x8_t vH = vaddq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vInvR)); + + uint32x4_t vV_L = vmovl_u16(vget_low_u16(vV)); + vR_G = vandq_s16(vR_G, vG_B); + uint32x4_t vV_H = vmovl_u16(vget_high_u16(vV)); + int16x8_t vDiff4 = vaddq_s16(vH, vR_G); + + int32x4_t vc6 = vdupq_n_s32(v6); + uint32x4_t vLine1 = vmulq_u32(vDiffL, vreinterpretq_u32_s32(vc6)); + uint32x4_t vLine2 = vmulq_u32(vDiffH, vreinterpretq_u32_s32(vc6)); + + float32x4_t vF1 = vcvtq_f32_u32(vV_L); + float32x4_t vF2 = 
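+ // Note (added commentary, not part of the original patch): NEON has no
+ // vector divide, so the saturation and hue divisions here are carried
+ // out with vrecpe (a coarse 1/x estimate) refined by one vrecps
+ // Newton-Raphson step (vrecps(e, x) = 2 - e*x, refined = e * vrecps),
+ // after which the results are scaled by the fixed-point div tables.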
vcvtq_f32_u32(vV_H); + float32x4_t vHF1 = vcvtq_f32_u32(vLine1); + float32x4_t vHF2 = vcvtq_f32_u32(vLine2); + + float32x4_t vXInv1 = vrecpeq_f32(vF1); + float32x4_t vXInv2 = vrecpeq_f32(vF2); + float32x4_t vXInv3 = vrecpeq_f32(vHF1); + float32x4_t vXInv4 = vrecpeq_f32(vHF2); + + float32x4_t vSt1 = vrecpsq_f32(vXInv1, vF1); + float32x4_t vSt2 = vrecpsq_f32(vXInv2, vF2); + float32x4_t vSt3 = vrecpsq_f32(vXInv3, vHF1); + float32x4_t vSt4 = vrecpsq_f32(vXInv4, vHF2); + + vF1 = vmulq_f32(vXInv1, vSt1); + vF2 = vmulq_f32(vXInv2, vSt2); + vHF1 = vmulq_f32(vXInv3, vSt3); + vHF2 = vmulq_f32(vXInv4, vSt4); + + float32x4_t vDivTab = vdupq_n_f32(vsdiv_table); + vSt1 = vmulq_f32(vF1, vDivTab); + vSt2 = vmulq_f32(vF2, vDivTab); + vDivTab = vdupq_n_f32(vhdiv_table); + vSt3 = vmulq_f32(vHF1, vDivTab); + vSt4 = vmulq_f32(vHF2, vDivTab); + + float32x4_t bias = vdupq_n_f32(0.5f); + + vSt1 = vaddq_f32(vSt1, bias); + vSt2 = vaddq_f32(vSt2, bias); + vSt3 = vaddq_f32(vSt3, bias); + vSt4 = vaddq_f32(vSt4, bias); + + uint32x4_t vRes1 = vcvtq_u32_f32(vSt1); + uint32x4_t vRes2 = vcvtq_u32_f32(vSt2); + uint32x4_t vRes3 = vcvtq_u32_f32(vSt3); + uint32x4_t vRes4 = vcvtq_u32_f32(vSt4); + + int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4)); + int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4)); + + uint32x4_t vDiff_Res1 = vmulq_u32(vDiffL, vRes1); + uint32x4_t vDiff_Res2 = vmulq_u32(vDiffH, vRes2); + uint32x4_t vDiff_Res3 = vmulq_u32(vreinterpretq_u32_s32(vH_L), vRes3); + uint32x4_t vDiff_Res4 = vmulq_u32(vreinterpretq_u32_s32(vH_H), vRes4); + + int32x4_t vShift = vdupq_n_s32(vshift); + uint32x4_t vAddRes1 = vaddq_u32(vDiff_Res1, vreinterpretq_u32_s32(vShift)); + uint32x4_t vAddRes2 = vaddq_u32(vDiff_Res2, vreinterpretq_u32_s32(vShift)); + uint32x4_t vAddRes3 = vaddq_u32(vDiff_Res3, vreinterpretq_u32_s32(vShift)); + uint32x4_t vAddRes4 = vaddq_u32(vDiff_Res4, vreinterpretq_u32_s32(vShift)); + int16x4_t vShrRes1 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes1), 8); + int16x4_t vShrRes2 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes2), 8); + int16x4_t vShrRes3 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes3), 8); + int16x4_t vShrRes4 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes4), 8); + + int16x8_t vc0 = vdupq_n_s16((s16)v0); + int8x8_t vShrRes1_s8 = vshrn_n_s16(vcombine_s16(vShrRes1, vShrRes2), 4); + uint16x8_t vCltRes_u16 = vcltq_s16(vcombine_s16(vShrRes3, vShrRes4), vc0); + int8x8_t vShrRes2_s8 = vshrn_n_s16(vcombine_s16(vShrRes3, vShrRes4), 4); + + int8x8_t vCltRes_s8 = vmovn_s16(vreinterpretq_s16_u16(vCltRes_u16)); + int8x8_t vcHRange = vdup_n_s8((s8)vhrange); + uint8x8_t vHResAdd = vand_u8(vreinterpret_u8_s8(vCltRes_s8), vreinterpret_u8_s8(vcHRange)); + int8x8_t vHRes = vadd_s8(vShrRes2_s8, vreinterpret_s8_u8(vHResAdd)); + + uint8x8x3_t vHsv; + vHsv.val[0] = vreinterpret_u8_s8(vHRes); + vHsv.val[1] = vreinterpret_u8_s8(vShrRes1_s8); + vHsv.val[2] = vMax; + + return vHsv; +} + +const u8 fastSaturate8u[] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255 +}; + +inline void convertToHSV(const s32 r, const s32 g, const s32 b, + const s32 &hrange, const s32 &hsv_shift, + u8* dst) +{ + s32 h, s, v = b; + s32 vmin = b, diff; + s32 vr, vg; + + v += fastSaturate8u[g-v+256]; + v += fastSaturate8u[r-v+256]; + vmin -= fastSaturate8u[vmin-g+256]; + vmin -= fastSaturate8u[vmin-r+256]; + + diff = v - vmin; + vr = v == r ? -1 : 0; + vg = v == g ? -1 : 0; + + s = (s32(diff * (255 << hsv_shift) * (1.0f/(f32)v)) + (1 << (hsv_shift-1))) >> hsv_shift; + h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); + h = ((h * s32((hrange << hsv_shift)/(6.f*diff) + 0.5)) + (1 << (hsv_shift-1))) >> hsv_shift; + h += h < 0 ? 
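+ /*
+  * Fixed-point layout: hsv_shift = 12 and adding (1 << (hsv_shift-1))
+  * before the shift rounds to nearest.  Worked example for pure green
+  * (r = 0, g = 255, b = 0, hrange = 180):
+  *     v = 255, vmin = 0, diff = 255, vr = 0, vg = -1
+  *     s = 255 (fully saturated)
+  *     h = b - r + 2*diff = 510, then (510 * 482 + 2048) >> 12 = 60,
+  * where 482 ~= (180 << 12) / (6 * 255) + 0.5 -- the conventional H = 60
+  * for green at hrange 180.
+  */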
hrange : 0; + + dst[0] = internal::saturate_cast(h); + dst[1] = (u8)s; + dst[2] = (u8)v; +} + +#define CONVERT_TO_HSV_ASM(loadop, rreg, breg) \ + __asm__ ( \ + #loadop ", [%[in]] @RGB \n\t" \ + "vmin.u8 d3, d0, d1 @VMin (d3) \n\t" \ + "vmax.u8 d6, d0, d1 @V (d6) \n\t" \ + "vmovl.u8 q2, " #rreg " @V16_R (d4,d5) \n\t" \ + "vmovl.u8 q4, d1 @V16_G (d8,d9) \n\t" \ + "vmax.u8 d6, d6, d2 \n\t" \ + "vmin.u8 d3, d3, d2 \n\t" \ + "vmovl.u8 q0, " #breg " @V16_B (d0,d1) \n\t" \ + "vsubl.u8 q8, d6, d3 @V16_Diff (d16,d17) \n\t" \ + \ + "vmovl.u8 q5, d6 @V16_V (d10,d11) \n\t" \ + "vadd.s16 q10, q8, q8 @V16_Diff_2 (d20,d21) \n\t" \ + "vmovl.u16 q9, d16 @V32_Diff_L (d18,d19) \n\t" \ + "vmovl.u16 q11, d17 @V32_Diff_H (d22,d23) \n\t" \ + "vceq.u16 q12, q2, q5 @V==R(d24,d25) \n\t" \ + "vceq.u16 q13, q4, q5 @V==G(d26,d27) \n\t" \ + \ + "vsub.s16 q8, q4, q0 @V16_G-B (d16,d17) \n\t" \ + "vmvn.u16 q15, q12 @V16~R \n\t" \ + "vsub.s16 q6, q0, q2 @V16_B-R (d12,d13) \n\t" \ + "vsub.s16 q7, q2, q4 @V16_R-G (d14,d15) \n\t" \ + "vand.u16 q1, q13, q15 @VMask2 \n\t" \ + "vand.u16 q2, q8, q12 @V16_H(d4,d5) \n\t" \ + "vadd.s16 q4, q6, q10 @V16_H2 \n\t" \ + "vmvn.u16 q12, q13 @V16~G \n\t" \ + "vadd.s16 q6, q10, q10 @VDiff16_4 (d12,d13) \n\t" \ + "vand.u16 q8, q15, q12 @VMask3 \n\t" \ + "vand.u16 q15, q4, q1 @vH2(d30,d31) \n\t" \ + "vadd.s16 q7, q7, q6 @V16_H3 (d14,d15) \n\t" \ + "vadd.s16 q14, q2, q15 @vH16 \n\t" \ + "vmovl.u16 q12, d10 @V32_V_L \n\t" \ + "vand.s16 q7, q7, q8 @vH16 \n\t" \ + "vmovl.u16 q13, d11 @V32_V_H \n\t" \ + "vadd.s16 q2, q14, q7 @V16_Diff_4 \n\t" \ + \ + "vdup.32 q4, %[v6] \n\t" \ + "vmul.u32 q14, q9, q4 \n\t" \ + "vmul.u32 q15, q11, q4 \n\t" \ + "vcvt.f32.u32 q4, q12 @VF1 (d8,d9) \n\t" \ + "vcvt.f32.u32 q8, q13 @VF2 \n\t" \ + "vcvt.f32.u32 q0, q14 @HF1 \n\t" \ + "vcvt.f32.u32 q1, q15 @HF2 \n\t" \ + "vrecpe.f32 q12, q4 @Vxinv \n\t" \ + "vrecpe.f32 q13, q8 @Vxinv \n\t" \ + "vrecpe.f32 q5, q0 @Vxinv \n\t" \ + "vrecpe.f32 q7, q1 @Vxinv \n\t" \ + "vrecps.f32 q14, q12, q4 @Vst1 \n\t" \ + "vrecps.f32 q15, q13, q8 @Vst1 \n\t" \ + "vrecps.f32 q10, q5, q0 @Vst1 \n\t" \ + "vrecps.f32 q6, q7, q1 @Vst1 \n\t" \ + "vmul.f32 q4, q12, q14 \n\t" \ + "vmul.f32 q8, q13, q15 \n\t" \ + "vmul.f32 q0, q5, q10 \n\t" \ + "vmul.f32 q1, q7, q6 \n\t" \ + "vdup.32 q12, %[vsdiv_table] \n\t" \ + "vmul.f32 q14, q4, q12 \n\t" \ + "vmul.f32 q15, q8, q12 \n\t" \ + "vdup.32 q12, %[vhdiv_table] \n\t" \ + "vmul.f32 q10, q0, q12 \n\t" \ + "vmul.f32 q6, q1, q12 \n\t" \ + \ + "vdup.32 q12, %[bias] \n\t" \ + \ + "vadd.f32 q7, q14, q12 \n\t" \ + "vadd.f32 q13, q15, q12 \n\t" \ + "vcvt.u32.f32 q4, q7 \n\t" \ + "vcvt.u32.f32 q8, q13 \n\t" \ + \ + "vadd.f32 q14, q10, q12 \n\t" \ + "vadd.f32 q7, q6, q12 \n\t" \ + "vcvt.u32.f32 q0, q14 \n\t" \ + "vcvt.u32.f32 q1, q7 @Vres \n\t" \ + \ + "vmovl.s16 q7, d4 @V32_H_L (d14,d15) \n\t" \ + "vmovl.s16 q5, d5 @V32_H_H (d10,d11) \n\t" \ + "vmul.u32 q14, q9, q4 \n\t" \ + "vmul.u32 q15, q11, q8 \n\t" \ + "vmul.u32 q10, q7, q0 \n\t" \ + "vmul.u32 q6, q5, q1 \n\t" \ + \ + "vdup.32 q12, %[vshift] \n\t" \ + "vadd.u32 q13, q14, q12 \n\t" \ + "vadd.u32 q8, q15, q12 \n\t" \ + "vadd.u32 q0, q10, q12 \n\t" \ + "vadd.u32 q1, q6, q12 \n\t" \ + "vshrn.s32 d8, q13, #8 \n\t" \ + "vshrn.s32 d9, q8, #8 \n\t" \ + "vshrn.s32 d10, q0, #8 \n\t" \ + "vshrn.s32 d11, q1, #8 \n\t" \ + \ + "vdup.16 q8, %[v0] \n\t" \ + "vshrn.s16 d5, q4, #4 \n\t" \ + "vclt.s16 q9, q5, q8 \n\t" \ + "vshrn.s16 d4, q5, #4 \n\t" \ + \ + "vmovn.s16 d9, q9 \n\t" \ + "vdup.8 d7, %[vhrange] \n\t" \ + "vand.u8 d10, d9, d7 \n\t" \ + "vadd.s8 d4, d4, d10 \n\t" \ + 
"vst3.8 {d4-d6}, [%[out]] @HSV \n\t" \ + : /*no output*/ \ + : [out] "r" (dst + dj), [in] "r" (src + sj), \ + [vsdiv_table] "r" (vsdiv_table), \ + [vshift] "r" (vshift), \ + [vhdiv_table] "r" (vhdiv_table), \ + [v6] "r" (v6), [vhrange] "r" (vhrange), \ + [v0] "r" (v0), [bias] "r" (bias) \ + : "d0","d1","d2","d3","d4","d5","d6","d7", \ + "d8","d9","d10","d11","d12","d13","d14","d15", \ + "d16","d17","d18","d19","d20","d21","d22","d23", \ + "d24","d25","d26","d27","d28","d29","d30","d31" \ + ); + +#if __GNUC_MINOR__ < 7 + +#define YCRCB_CONSTS \ + register int16x4_t vcYR asm ("d31") = vmov_n_s16(4899); \ + register int16x4_t vcYG asm ("d30") = vmov_n_s16(9617); \ + register int16x4_t vcYB asm ("d29") = vmov_n_s16(1868); \ + register int16x4_t vcCrG asm ("d28") = vmov_n_s16(6860); \ + register int16x4_t vcCrB asm ("d27") = vmov_n_s16(1332); \ + register int16x4_t vcCbR asm ("d26") = vmov_n_s16(2765); \ + register int16x4_t vcCbG asm ("d25") = vmov_n_s16(5427); + +#else + +#define YCRCB_CONSTS \ + const s16 convertCoeffs[] = { 4899, 4899, 4899, 4899, \ + 9617, 9617, 9617, 9617, \ + 1868, 1868, 1868, 1868, \ + 6860, 6860, 6860, 6860, \ + 1332, 1332, 1332, 1332, \ + 2765, 2765, 2765, 2765, \ + 5427, 5427, 5427, 5427 }; \ + const int16x8_t vcYRG = vld1q_s16(convertCoeffs); /*YR and YG*/ \ + const int16x4_t vcYB = vld1_s16(convertCoeffs + 8); /*YB*/ \ + const int16x8_t vcCrGB = vld1q_s16(convertCoeffs + 12); /*CrG and CrB*/ \ + const int16x8_t vcCbRG = vld1q_s16(convertCoeffs + 20); /*CbR and CbG*/ + +#endif + +#define CONVERTTOYCRCB(loadcmd, rreg, greg, breg) \ + __asm__ ( \ + #loadcmd ", [%[in]] @RGB \n\t" \ + "vmovl.u8 q2, " #rreg " @R (d4,d5) \n\t" \ + "vmovl.u8 q3, " #greg " @G (d6,d7) \n\t" \ + "vmovl.u8 q4, " #breg " @B (d8,d9) \n\t" \ + \ + "vshll.u16 q7, d4, #13 @Cr(q7,q8): R \n\t" \ + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" \ + "vshll.u16 q9, d8, #13 @Cb(q9,q10): B \n\t" \ + "vshll.u16 q8, d5, #13 @Cr(q7,q8): R \n\t" \ + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" \ + "vshll.u16 q10, d9, #13 @Cb(q9,q10): B \n\t" \ + \ + "vmlsl.s16 q7, d6, d28 @Cr(q7,q8): RG \n\t" \ + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" \ + "vmlsl.s16 q9, d4, d26 @Cb(q9,q10): BR \n\t" \ + "vmlsl.s16 q8, d7, d28 @Cr(q7,q8): RG \n\t" \ + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" \ + "vmlsl.s16 q10, d5, d26 @Cb(q9,q10): BR \n\t" \ + \ + "vmlsl.s16 q7, d8, d27 @Cr(q7,q8): RGB \n\t" \ + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" \ + "vmlsl.s16 q9, d6, d25 @Cb(q9,q10): BRG \n\t" \ + "vmlsl.s16 q8, d9, d27 @Cr(q7,q8): RGB \n\t" \ + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" \ + "vmlsl.s16 q10, d7, d25 @Cb(q9,q10): BRG \n\t" \ + \ + "vrshrn.s32 d4, q7, #14 @Cr -> q2 \n\t" \ + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" \ + "vrshrn.s32 d6, q9, #14 @Cb -> q3 \n\t" \ + "vrshrn.s32 d5, q8, #14 @Cr -> q2 \n\t" \ + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" \ + "vrshrn.s32 d7, q10, #14 @Cb -> q3 \n\t" \ + \ + "vmov.s16 q5, #128 \n\t" \ + "vmov.s16 q6, #128 \n\t" \ + "vadd.i16 q5, q2 @Cr -> q5 \n\t" \ + "vadd.i16 q6, q3 @Cb -> q6 \n\t" \ + \ + "vqmovn.u16 d4, q4 \n\t" \ + "vqmovun.s16 d5, q5 \n\t" \ + "vqmovun.s16 d6, q6 \n\t" \ + \ + "vst3.8 {d4-d6}, [%[out]] \n\t" \ + : /*no output*/ \ + : [out] "r" (dst + dj), [in] "r" (src + sj), \ + "w" (vcYR), "w" (vcYG), "w" (vcYB), \ + "w" (vcCrB), "w" (vcCrG), "w" (vcCbG), "w" (vcCbR) \ + : "d0","d1","d2","d3","d4","d5","d6","d7", \ + "d8","d9","d10","d11","d12","d13","d14","d15", \ + "d16","d17","d18","d19","d20","d21" \ + ); + + +inline uint8x8x3_t convertToYCrCb( const int16x8_t& vR, 
const int16x8_t& vG, const int16x8_t& vB, + const int16x8_t& vcYRG, const int16x4_t& vcYB, + const int16x8_t& vcCrGB, const int16x8_t& vcCbRG ) +{ + int32x4_t vCrL = vshll_n_s16(vget_low_s16(vR), 13); // R + int32x4_t vCrH = vshll_n_s16(vget_high_s16(vR), 13); // R + int32x4_t vYL = vmull_s16(vget_low_s16(vG), vget_high_s16(vcYRG)); // G + int32x4_t vYH = vmull_s16(vget_high_s16(vG), vget_high_s16(vcYRG)); // G + int32x4_t vCbL = vshll_n_s16(vget_low_s16(vB), 13); // B + int32x4_t vCbH = vshll_n_s16(vget_high_s16(vB), 13); // B + + vCrL = vmlsl_s16(vCrL, vget_low_s16(vG), vget_low_s16(vcCrGB)); // RG + vCrH = vmlsl_s16(vCrH, vget_high_s16(vG), vget_low_s16(vcCrGB)); // RG + vYL = vmlal_s16(vYL, vget_low_s16(vB), vcYB); // GB + vYH = vmlal_s16(vYH, vget_high_s16(vB), vcYB); // GB + vCbL = vmlsl_s16(vCbL, vget_low_s16(vR), vget_low_s16(vcCbRG)); // BR + vCbH = vmlsl_s16(vCbH, vget_high_s16(vR), vget_low_s16(vcCbRG)); // BR + + vCrL = vmlsl_s16(vCrL, vget_low_s16(vB), vget_high_s16(vcCrGB)); // RGB + vCrH = vmlsl_s16(vCrH, vget_high_s16(vB), vget_high_s16(vcCrGB)); // RGB + vYL = vmlal_s16(vYL, vget_low_s16(vR), vget_low_s16(vcYRG)); // GBR + vYH = vmlal_s16(vYH, vget_high_s16(vR), vget_low_s16(vcYRG)); // GBR + vCbL = vmlsl_s16(vCbL, vget_low_s16(vG), vget_high_s16(vcCbRG)); // BRG + vCbH = vmlsl_s16(vCbH, vget_high_s16(vG), vget_high_s16(vcCbRG)); // BRG + + int16x4_t vCrL_ = vrshrn_n_s32(vCrL, 14); + int16x4_t vCrH_ = vrshrn_n_s32(vCrH, 14); + int16x4_t vYL_ = vrshrn_n_s32(vYL, 14); + int16x4_t vYH_ = vrshrn_n_s32(vYH, 14); + int16x4_t vCbL_ = vrshrn_n_s32(vCbL, 14); + int16x4_t vCbH_ = vrshrn_n_s32(vCbH, 14); + + int16x8_t vCr = vmovq_n_s16(128); + int16x8_t vCb = vmovq_n_s16(128); + + vCr = vaddq_s16(vCr, vcombine_s16(vCrL_, vCrH_)); + vCb = vaddq_s16(vCb, vcombine_s16(vCbL_, vCbH_)); + + uint8x8x3_t vYCrCb; + vYCrCb.val[0] = vqmovn_u16(vreinterpretq_u16_s16(vcombine_s16(vYL_, vYH_))); + vYCrCb.val[1] = vqmovun_s16(vCr); + vYCrCb.val[2] = vqmovun_s16(vCb); + + return vYCrCb; +} + +#define S_CONVERTTOYCRCB(R, G, B) \ + s32 Y = (R * 4899 + G * 9617 + B * 1868 + (1 << 13)) >> 14; \ + s32 Cr = 128 + ((R * 8192 - G * 6860 - B * 1332 + (1 << 13)) >> 14); \ + s32 Cb = 128 + ((R * (-2765) - G * 5427 + B * 8192 + (1 << 13)) >> 14); \ + dst[dj + 0] = internal::saturate_cast(Y); \ + dst[dj + 1] = internal::saturate_cast(Cr); \ + dst[dj + 2] = internal::saturate_cast(Cb); + +#define COEFF_Y ( 149) +#define COEFF_BU ( 129) +#define COEFF_RV ( 102) +#define COEFF_GU ( 25) +#define COEFF_GV ( 52) +#define COEFF_R (-14248) +#define COEFF_G ( 8663) +#define COEFF_B (-17705) + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 +#define YUV420ALPHA3_CONST +#define YUV420ALPHA4_CONST register uint8x16_t c255 asm ("q13") = vmovq_n_u8(255); +#define YUV420ALPHA3_CONVERT +#define YUV420ALPHA4_CONVERT , "w" (c255) +#define YUV420STORE1CMD3 "vst3.8 {d20, d22, d24}" +#define YUV420STORE2CMD3 "vst3.8 {d21, d23, d25}" +#define YUV420STORE1CMD4 "vst4.8 {d20, d22, d24, d26}" +#define YUV420STORE2CMD4 "vst4.8 {d21, d23, d25, d27}" + +#define YUV420_CONSTS(cn, bIdx, vIdx) \ + register const s32 cR = s16(COEFF_R); \ + register const s32 cG = s16(COEFF_G); \ + register const s32 cB = s16(COEFF_B); \ + \ + register uint8x16_t vc16 asm ("q15") = vmovq_n_u8(16); \ + register uint8x8_t cGU asm ("d14") = vmov_n_u8(COEFF_GU); \ + register uint8x8_t cGV asm ("d15") = vmov_n_u8(COEFF_GV); \ + register uint8x8_t cRV asm ("d16") = vmov_n_u8(COEFF_RV); \ + register uint8x8_t cBU asm ("d17") = vmov_n_u8(COEFF_BU); \ + register uint8x16_t 
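+ /* The COEFF_* values are fixed-point forms of the usual BT.601 video-  */ \
+ /* range YUV->RGB weights: 149/128 ~= 1.164, 102/64 ~= 1.596,           */ \
+ /* 129/64 ~= 2.018, 25/64 ~= 0.391, 52/64 ~= 0.813, with COEFF_R/G/B    */ \
+ /* absorbing the -16 (luma) and -128 (chroma) offsets up front.         */ \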
cRGBY asm ("q3") = vmovq_n_u8(COEFF_Y); \ + YUV420ALPHA##cn##_CONST + +#define CONVERTYUV420TORGB(cn, ureg, vreg, rreg, breg) \ + __asm__ ( \ + "vld2.8 {d0-d1}, [%[inUV]] @UV \n\t" \ + "vdup.16 q4, %[cG] @cG \n\t" \ + "vld2.8 {d2-d3}, [%[inY1]] @YY \n\t" \ + "vdup.16 "#rreg", %[cR] @cR \n\t" \ + "vld2.8 {d4-d5}, [%[inY2]] @YY \n\t" \ + "vdup.16 "#breg", %[cB] @cB \n\t" \ + "vmlsl.u8 q4, "#ureg", d14 @cG-25u \n\t" \ + "vmax.u8 q1, q15 @max(Y,16) \n\t" \ + "vmlal.u8 "#rreg", "#vreg", d16 @cR+102*v \n\t" \ + "vmlal.u8 "#breg", "#ureg", d17 @cB+129*u \n\t" \ + "vmax.u8 q2, q15 @max(Y,16) \n\t" \ + "vmlsl.u8 q4, "#vreg", d15 @cG-25u-52v \n\t" \ + /*q10,q11,q12,q13 - for output*/ \ + "vmull.u8 q9, d3, d6 @h 149*y \n\t" \ + "vmull.u8 q10, d2, d7 @l 149*y \n\t" \ + "vshr.u16 q9, #1 @h (149*y)/2 \n\t" \ + "vshr.u16 q10, #1 @l (149*y)/2 \n\t" \ + \ + "vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + "vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + \ + "vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + "vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + \ + "vzip.8 d22, d23 @G \n\t" \ + "vzip.8 d20, d21 @R \n\t" \ + "vzip.8 d24, d25 @B \n\t" \ + \ + YUV420STORE1CMD##cn", [%[out1]] \n\t" \ + YUV420STORE2CMD##cn", [%[out1x]] \n\t" \ + \ + "vmull.u8 q9, d5, d6 @h 149*y \n\t" \ + "vmull.u8 q10, d4, d7 @l 149*y \n\t" \ + "vshr.u16 q9, #1 @h (149*y)/2 \n\t" \ + "vshr.u16 q10, #1 @l (149*y)/2 \n\t" \ + \ + "vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + "vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + \ + "vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + "vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + \ + "vzip.8 d22, d23 @G \n\t" \ + "vzip.8 d20, d21 @R \n\t" \ + "vzip.8 d24, d25 @B \n\t" \ + \ + YUV420STORE1CMD##cn", [%[out2]] \n\t" \ + YUV420STORE2CMD##cn", [%[out2x]] \n\t" \ + \ + : /*no output*/ \ + : [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \ + [out1x] "r" (dst1 + dj+cn*8), [out2x] "r" (dst2 + dj+cn*8), \ + [inUV] "r" (uv+j), [inY1] "r" (y1+j), [inY2] "r" (y2+j), \ + [cR] "r" (cR), [cG] "r" (cG), [cB] "r" (cB), \ + "w" (vc16), "w" (cGU), "w" (cGV), "w" (cBU), "w" (cRV), "w" (cRGBY) YUV420ALPHA##cn##_CONVERT \ + : "d0","d1","d2","d3","d4","d5","d8","d9","d10","d11","d12", \ + "d13","d18","d19","d20","d21","d22","d23","d24","d25" \ + ); + +#else + +template +struct 
_convertYUV420Internals +{ + uint16x8_t vc14216; + uint16x8_t vc17672; + uint16x8_t vc8696; + uint8x8_t vc102; + uint8x8_t vc25; + uint8x8_t vc129; + uint8x8_t vc52; + uint16x8_t vc_1; + uint8x8_t vc149; + uint8x8_t vc16; + _convertYUV420Internals() + { + vc14216 = vdupq_n_u16(-COEFF_R); + vc17672 = vdupq_n_u16(-COEFF_B); + vc8696 = vdupq_n_u16(COEFF_G); + vc102 = vdup_n_u8(COEFF_RV); + vc25 = vdup_n_u8(COEFF_GU); + vc129 = vdup_n_u8(COEFF_BU); + vc52 = vdup_n_u8(COEFF_GV); + vc_1 = vdupq_n_u16((uint16_t)-1); + vc149 = vdup_n_u8(COEFF_Y); + vc16 = vdup_n_u8(16); + } + + inline void UVrgbToRGB( const int16x8_t &ruv, const int16x8_t &guv, const int16x8_t &buv, + const u8 *y, uint8x16x3_t &rgbl ) + { + //y get line + uint8x8x2_t yl = vld2_u8(y); + yl.val[0] = vmax_u8(yl.val[0], vc16); + yl.val[1] = vmax_u8(yl.val[1], vc16); + + //y part line + uint16x8_t yodd1 = vmlal_u8(vc_1, yl.val[0], vc149); //(-1+149*y) + uint16x8_t yevn1 = vmlal_u8(vc_1, yl.val[1], vc149); //(-1+149*y) + int16x8_t yodd1h = (int16x8_t)vshrq_n_u16(yodd1, 1); //(-1+149*y)/2 + int16x8_t yevn1h = (int16x8_t)vshrq_n_u16(yevn1, 1); //(-1+149*y)/2 + + //y line calc rgb + int16x8_t rodd1w = vhsubq_s16(yodd1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2 + int16x8_t gevn1w = vhaddq_s16(yevn1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2 + int16x8_t bodd1w = vhsubq_s16(yodd1h, buv); //((-1+149*y)/2 - (17672-129*u))/2 + int16x8_t revn1w = vhsubq_s16(yevn1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2 + int16x8_t godd1w = vhaddq_s16(yodd1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2 + int16x8_t bevn1w = vhsubq_s16(yevn1h, buv); //((-1+149*y)/2 - (17672-129*u))/2 + + //y line clamp + narrow + uint8x8_t rodd1n = vqshrun_n_s16(rodd1w, 5); + uint8x8_t revn1n = vqshrun_n_s16(revn1w, 5); + uint8x8_t godd1n = vqshrun_n_s16(godd1w, 5); + uint8x8x2_t r1 = vzip_u8 (rodd1n, revn1n); + uint8x8_t gevn1n = vqshrun_n_s16(gevn1w, 5); + uint8x8_t bodd1n = vqshrun_n_s16(bodd1w, 5); + uint8x8x2_t g1 = vzip_u8 (godd1n, gevn1n); + uint8x8_t bevn1n = vqshrun_n_s16(bevn1w, 5); + uint8x8x2_t b1 = vzip_u8 (bodd1n, bevn1n); + rgbl.val[2 - bIdx] = vcombine_u8(r1.val[0], r1.val[1]); + rgbl.val[1] = vcombine_u8(g1.val[0], g1.val[1]); + rgbl.val[0 + bIdx] = vcombine_u8(b1.val[0], b1.val[1]); + } +}; + +template +struct _convertYUV420 +{ + _convertYUV420Internals convertYUV420Internals; + + inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv, + u8 *dst1, u8 *dst2 ) + { + uint8x8x2_t raw_uv = vld2_u8(uv); + uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); //(8696-25*u) + int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v) + + int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u) + int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v)) + + uint8x16x3_t rgbl; + //y line1 + convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl); + vst3q_u8(dst1, rgbl); + //y line2 + convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl); + vst3q_u8(dst2, rgbl); + } +}; + +template +struct _convertYUV420<4, bIdx, vIdx> +{ + _convertYUV420Internals convertYUV420Internals; + + inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv, + u8 *dst1, u8 *dst2 ) + { + uint8x8x2_t raw_uv = vld2_u8(uv); + uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); 
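+ /* UVrgbToRGB above de-interleaves 16 Y samples into two 8-lane halves
+  * with vld2_u8 so one set of chroma terms (ruv/guv/buv) is shared by
+  * horizontally adjacent pixels, as 4:2:0 subsampling implies, then
+  * vzip_u8 restores pixel order before the 16-wide store. */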
//(8696-25*u) + int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v) + + int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u) + int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v)) + + union { uint8x16x4_t v4; uint8x16x3_t v3; } rgbl; + rgbl.v4.val[3] = vdupq_n_u8(0xff); + //y line1 + convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl.v3); + vst4q_u8(dst1, rgbl.v4); + //y line2 + convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl.v3); + vst4q_u8(dst2, rgbl.v4); + } +}; + +#define YUV420_CONSTS(cn, bIdx, vIdx) _convertYUV420 convertYUV420; + +#endif + +template inline void fillAlpha(u8 *, u8 *){} +template <> inline void fillAlpha<4>(u8 *dst1, u8 *dst2) +{ + dst1[3] = 255; + dst1[7] = 255; + dst2[3] = 255; + dst2[7] = 255; +} +template +inline void convertYUV420ToRGB(const u8 *y1, const u8 *y2, const u8 *uv, u8* dst1, u8 *dst2) +{ + int Y11 = y1[0]; + int Y12 = y1[1]; + int Y21 = y2[0]; + int Y22 = y2[1]; + + int U = uv[1 - vIdx]; + int V = uv[vIdx]; + + int y11 = (COEFF_Y * std::max(16, Y11)) >> 1; + int y12 = (COEFF_Y * std::max(16, Y12)) >> 1; + int y21 = (COEFF_Y * std::max(16, Y21)) >> 1; + int y22 = (COEFF_Y * std::max(16, Y22)) >> 1; + + int uvR = COEFF_R + COEFF_RV * V; + int uvG = COEFF_G - COEFF_GU * U - COEFF_GV * V; + int uvB = COEFF_B + COEFF_BU * U; + + dst1[2-bIdx] = internal::saturate_cast((((y11 + uvR) >> 1) + (1 << 4)) >> 5); + dst1[1] = internal::saturate_cast((((y11 + uvG) >> 1) + (1 << 4)) >> 5); + dst1[bIdx] = internal::saturate_cast((((y11 + uvB) >> 1) + (1 << 4)) >> 5); + + dst1[cn+2-bIdx] = internal::saturate_cast((((y12 + uvR) >> 1) + (1 << 4)) >> 5); + dst1[cn+1] = internal::saturate_cast((((y12 + uvG) >> 1) + (1 << 4)) >> 5); + dst1[cn+bIdx] = internal::saturate_cast((((y12 + uvB) >> 1) + (1 << 4)) >> 5); + + dst2[2-bIdx] = internal::saturate_cast((((y21 + uvR) >> 1) + (1 << 4)) >> 5); + dst2[1] = internal::saturate_cast((((y21 + uvG) >> 1) + (1 << 4)) >> 5); + dst2[bIdx] = internal::saturate_cast((((y21 + uvB) >> 1) + (1 << 4)) >> 5); + + dst2[cn+2-bIdx] = internal::saturate_cast((((y22 + uvR) >> 1) + (1 << 4)) >> 5); + dst2[cn+1] = internal::saturate_cast((((y22 + uvG) >> 1) + (1 << 4)) >> 5); + dst2[cn+bIdx] = internal::saturate_cast((((y22 + uvB) >> 1) + (1 << 4)) >> 5); + + fillAlpha(dst1, dst2); +} + +// converts R, G, B (B, G, R) pixels to RGB(BGR)565 format respectively +inline uint8x16x2_t convertTo565( const uint8x16_t& vR, const uint8x16_t& vG, const uint8x16_t& vB ) +{ + uint8x16x2_t vRgb565; // rrrrRRRR ggggGGGG bbbbBBBB + + vRgb565.val[1] = vsriq_n_u8(vB, vG, 5); // xxxxxxxx bbbbBggg + vRgb565.val[0] = vshlq_n_u8(vG, 3); // gGGGG000 bbbbBggg + vRgb565.val[0] = vsriq_n_u8(vRgb565.val[0], vR, 3); // gGGrrrrR bbbbBggg + + return vRgb565; +} +inline void convertTo565( const u16 R, const u16 G, const u16 B, u8 * dst ) +{ + *((u16*)dst) = (R >> 3)|((G&~3) << 3)|((B&~7) << 8); +} +#endif + +} //namespace + +void rgb2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? 
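+ /* Main-loop bound: for width >= 8 this gives roiw8 = width - 7, so the
+  * 8-pixel NEON loop runs while j < width - 7 and never reads past the
+  * row; e.g. width = 20 yields two vector iterations (16 pixels) and a
+  * 4-pixel scalar tail. */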
size.width - 7 : 0; + const s32 hsv_shift = 12; +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + register const f32 bias = 0.5f; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d0, d2) +#else + uint8x8x3_t vRgb = vld3_u8(src + sj); + uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange); + vst3_u8(dst + dj, vHsv); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)hrange; +#endif +} + +void rgbx2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + const s32 hsv_shift = 12; +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + register const f32 bias = 0.5f; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d0, d2) +#else + uint8x8x4_t vRgb = vld4_u8(src + sj); + uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange); + vst3_u8(dst + dj, vHsv); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)hrange; +#endif +} + +void bgr2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + const s32 hsv_shift = 12; +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + register const f32 bias = 0.5f; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d2, d0) +#else + uint8x8x3_t vRgb = vld3_u8(src + sj); + uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange); + vst3_u8(dst + dj, vHsv); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)hrange; +#endif +} + +void bgrx2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + const s32 hsv_shift = 12; +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + register const f32 bias = 0.5f; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d2, d0) +#else + uint8x8x4_t vRgb = vld4_u8(src + sj); + uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange); + vst3_u8(dst + dj, vHsv); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)hrange; +#endif +} + +void rgbx2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 64, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld4.8 {d2, d4, d6, d8}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t" + "vld4.8 {d3, d5, d7, d9}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vst2.8 {d0, d2}, [%[out0]] \n\t" + "vst2.8 {d1, d3}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + uint8x16x4_t vRgba = vld4q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 2) + { + convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 48, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld3.8 {d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t" + "vld3.8 {d3, d5, d7}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vst2.8 {d0, d2}, [%[out0]] \n\t" + "vst2.8 {d1, d3}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 24) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + uint8x16x3_t vRgba = vld3q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 2) + { + convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? 
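+ /* Layout produced by convertTo565(): bits 0-4 hold the first channel
+  * >> 3, bits 5-10 green >> 2, bits 11-15 the last channel >> 3, stored
+  * little-endian.  Worked example: R = G = B = 255 packs to 0xFFFF, since
+  * (255>>3) | ((255&~3)<<3) | ((255&~7)<<8) = 0x001F | 0x07E0 | 0xF800. */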
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 64, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld4.8 {d0, d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 \n\t" + "vld4.8 {d1, d3, d5, d7}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB aaaaAAAA \n\t" + "vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg aaaaAAAA \n\t" + "vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg aaaaAAAA \n\t" + "vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg aaaaAAAA \n\t" + "vst2.8 {d2, d4}, [%[out0]] \n\t" + "vst2.8 {d3, d5}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 32) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + uint8x16x4_t vRgba = vld4q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 2) + { + convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 48, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld3.8 {d0, d2, d4}, [%[in0]] @ q0 q1 q2 q3 \n\t" + "vld3.8 {d1, d3, d5}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg xxxxxxxx \n\t" + "vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg xxxxxxxx \n\t" + "vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg xxxxxxxx \n\t" + "vst2.8 {d2, d4}, [%[out0]] \n\t" + "vst2.8 {d3, d5}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 24) + : "d0","d1","d2","d3","d4","d5" + ); +#else + uint8x16x3_t vRgba = vld3q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 2) + { + convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld3.8 {d0-d2}, d0, d1, d2) +#else + uint8x8x3_t vRgb = vld3_u8(src + sj); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[1])); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld4.8 {d0-d3}, d0, d1, d2) +#else + uint8x8x4_t vRgba = vld4_u8(src + sj); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[1])); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgr2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld3.8 {d0-d2}, d2, d1, d0) +#else + uint8x8x3_t vBgr = vld3_u8(src + sj); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[1])); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgrx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld4.8 {d0-d3}, d2, d1, d0) +#else + uint8x8x4_t vBgra = vld4_u8(src + sj); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[1])); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + // input data: + ////////////// Y matrix: + // {y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16} + // {Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y16} + ////////////// UV matrix: + // {v12, u12, v34, u34, v56, u56, v78, u78, v90 u90, V12, U12, V34, U34, V56, U56} + + // fp version + // R = 1.164(Y - 16) + 1.596(V - 128) + // G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) + // B = 1.164(Y - 16) + 2.018(U - 128) + + // integer version + // R = [((149*y)/2 + (-14248+102*v) )/2]/32 + // G = [((149*y)/2 + ((8663- 25*u)-52*v))/2]/32 + // B = [((149*y)/2 + (-17705+129*u) )/2]/32 + + // error estimation: + //Rerr = 0.0000625 * y − 0.00225 * v − 0.287 + //Gerr = 0.0000625 * y + 0.0005 * v + 0.000375 * u + 0.128625 + //Berr = 0.0000625 * y − 0.002375 * u - 0.287375 + + //real error test: + //================= + //R: 1 less: 520960 == 3.11% of full space + //G: 1 less: 251425 == 1.50% of full space + //B: 1 less: 455424 == 2.71% of full space + //================= + //R: 1 more: 642048 == 3.83% of full space + //G: 1 more: 192458 == 1.15% of full space + //B: 1 more: 445184 == 2.65% of full space + + 
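+ // Scaling check for the integer forms: the net divisor is 2*2*32 = 128,
+ // so 149/128 ~= 1.164 (Y), 102/64 ~= 1.596 (V term in R), 129/64 ~= 2.018
+ // (U term in B) and (25*u + 52*v)/64 ~= 0.391u + 0.813v (G), with the
+ // constants -14248, 8663, -17705 absorbing the -16/-128 offsets.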
internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 2, 0) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d1, d0, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 2, 0) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d1, d0, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 2, 1) + size_t roiw16 = size.width >= 15 ? 
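+ /* The yuv420sp* functions above read V first in each chroma pair
+  * (vIdx = 0, matching NV21-style data); the yuv420i* variants here use
+  * vIdx = 1 for U-first (NV12-style) input, sharing everything else
+  * through the same templates and macros. */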
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d0, d1, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 2, 1) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d0, d1, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 0, 0) + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d1, d0, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 0, 0) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d1, d0, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 0, 1) + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d0, d1, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 0, 1) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d0, d1, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/common.cpp b/3rdparty/carotene/src/common.cpp new file mode 100644 index 0000000000..c85b0123b6 --- /dev/null +++ b/3rdparty/carotene/src/common.cpp @@ -0,0 +1,108 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include +#include + +#include "common.hpp" + +namespace CAROTENE_NS { + +bool isSupportedConfiguration() +{ +#ifdef CAROTENE_NEON + return true; +#else + return false; +#endif +} + +namespace internal { + +void assertSupportedConfiguration(bool parametersSupported) +{ + if (!isSupportedConfiguration()) { + std::cerr << "internal error: attempted to use an unavailable function" << std::endl; + std::abort(); + } + + if (!parametersSupported) { + std::cerr << "internal error: attempted to use a function with unsupported parameters" << std::endl; + std::abort(); + } +} + +ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin, size_t endMargin) +{ + ptrdiff_t p = _p + (ptrdiff_t)startMargin; + size_t len = _len + startMargin + endMargin; + if( (size_t)p < len ) + return _p; + else if( borderType == BORDER_MODE_REPLICATE ) + p = p < 0 ? 0 : (ptrdiff_t)len - 1; + else if( borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REFLECT101 ) + { + s32 delta = borderType == BORDER_MODE_REFLECT101; + if( len == 1 ) + return 0; + do + { + if( p < 0 ) + p = -p - 1 + delta; + else + p = (ptrdiff_t)len - 1 - (p - (ptrdiff_t)len) - delta; + } + while( (size_t)p >= len ); + } + else if( borderType == BORDER_MODE_WRAP ) + { + if( p < 0 ) + p -= ((p-(ptrdiff_t)len+1)/(ptrdiff_t)len)*(ptrdiff_t)len; + if( p >= (ptrdiff_t)len ) + p %= (ptrdiff_t)len; + } + else if( borderType == BORDER_MODE_CONSTANT ) + p = -1; + else + internal::assertSupportedConfiguration(false); + return p - (ptrdiff_t)startMargin; +} + +} // namespace internal +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/common.hpp b/3rdparty/carotene/src/common.hpp new file mode 100644 index 0000000000..e46231a58a --- /dev/null +++ b/3rdparty/carotene/src/common.hpp @@ -0,0 +1,96 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the names of the copyright holders nor the names of the contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_SRC_COMMON_HPP
+#define CAROTENE_SRC_COMMON_HPP
+
+#include <cstddef>
+#include <cstdlib>
+
+#if defined WITH_NEON && (defined __ARM_NEON__ || defined __ARM_NEON)
+#define CAROTENE_NEON
+#endif
+
+#ifdef CAROTENE_NEON
+#include <arm_neon.h>
+#include "intrinsics.hpp"
+#endif
+
+#include <carotene/functions.hpp>
+#include "saturate_cast.hpp"
+
+namespace CAROTENE_NS { namespace internal {
+
+inline void prefetch(const void *ptr, size_t offset = 32*10)
+{
+#if defined __GNUC__
+    __builtin_prefetch(reinterpret_cast<const char *>(ptr) + offset);
+#elif defined _MSC_VER && defined CAROTENE_NEON
+    __prefetch(reinterpret_cast<const char *>(ptr) + offset);
+#else
+    (void)ptr;
+    (void)offset;
+#endif
+}
+
+template <typename T>
+inline T *getRowPtr(T *base, ptrdiff_t stride, size_t row)
+{
+    char *baseRaw = const_cast<char *>(reinterpret_cast<const char *>(base));
+    return reinterpret_cast<T *>(baseRaw + ptrdiff_t(row) * stride);
+}
+
+void assertSupportedConfiguration(bool parametersSupported = true);
+
+ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin = 0, size_t endMargin = 0);
+
+/*!
+ * Aligns a pointer to the given number of bytes
+ *
+ * This small inline function shifts the pointer forward by zero or a positive
+ * offset so that the result is a multiple of n (which must be a power of two).
+ */
+template <typename T> inline T* alignPtr(T* ptr, size_t n=sizeof(T))
+{
+    return (T*)(((size_t)ptr + n-1) & -n);
+}
+
+}}
+
+#endif
diff --git a/3rdparty/carotene/src/convert.cpp b/3rdparty/carotene/src/convert.cpp
new file mode 100644
index 0000000000..2f95e29cb3
--- /dev/null
+++ b/3rdparty/carotene/src/convert.cpp
@@ -0,0 +1,1331 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
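The alignPtr() helper in common.hpp above relies on the classic power-of-two round-up: adding n-1 and masking with -n (in two's complement, all ones except the low log2(n) bits) snaps an address forward to the next multiple of n. A self-contained check of the same arithmetic, assuming n is a power of two:

```
#include <cassert>
#include <cstddef>

int main()
{
    unsigned char buf[64];
    // Same arithmetic as alignPtr(buf, 16): round the address up to 16 bytes.
    size_t a = (size_t)buf;
    size_t aligned = (a + 15) & (size_t)-16;
    assert(aligned % 16 == 0);
    assert(aligned >= a && aligned - a < 16);
    return 0;
}
```

getRowPtr() next to it is why strides are byte-valued ptrdiff_t throughout the library: row addresses are computed on a char pointer and only then cast back to the element type, so both padded rows and negative strides (bottom-up images) work.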
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convert(const Size2D &_size, \ + const T1 * srcBase, ptrdiff_t srcStride, \ + T2 * dstBase, ptrdiff_t dstStride) \ + { \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dstStride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + const ptrdiff_t sstep = srcStride / sizeof(T1); \ + const ptrdiff_t dstep = dstStride / sizeof(T2); \ + const size_t w = size.width & ~(SIMD_SIZE-1); \ + if (size.width >= SIMD_SIZE) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + CVTINIT \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + CVTROW \ + } \ + if(w < size.width) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + for(size_t i = w; i < size.width; i++ ) \ + _dst[i] = internal::saturate_cast(_src[i]); \ + } \ + } + +#else + +#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convert(const Size2D &, \ + const T1 *, ptrdiff_t, \ + T2 *, ptrdiff_t) \ + { \ + internal::assertSupportedConfiguration(); \ + } + +#endif + +CVT_FUNC(u8, s8, 16, + uint8x16_t v127 = vdupq_n_u8(127);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vu8 = vld1q_u8(_src + i); + int8x16_t vu1 = vreinterpretq_s8_u8(vminq_u8(vu8, v127)); + vst1q_s8(_dst + i, vu1); + } +}) + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u8, u16, 16, + register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vst2.8 {d0,d2}, [%[dst1]] \n\t" + "vst2.8 {d1,d3}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (zero0) + : "d0","d1" + ); + } +}) +#else +CVT_FUNC(u8, u16, 
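/* Illustrative note, not part of the original patch: every CVT_FUNC()
 * instantiation in this file expands to a convert() overload with the same
 * three-part shape: (1) when both images are contiguous (stride equals the
 * row width) the whole image is collapsed into one long row, so the vector
 * loop runs with no per-row overhead; (2) the CVTROW block converts
 * w = width & ~(SIMD_SIZE-1) elements per row with NEON loads/stores;
 * (3) a scalar tail converts the remaining width - w elements through
 * internal::saturate_cast. A stripped-down sketch of the generated code:
 *
 *     void convert(Size2D size, const u8 *src, ptrdiff_t sstep,
 *                  u16 *dst, ptrdiff_t dstep)
 *     {
 *         if (contiguous) { size.width *= size.height; size.height = 1; }
 *         for (each row) {
 *             size_t i = 0;
 *             for (; i < w; i += SIMD_SIZE)  vector_convert(src + i, dst + i);
 *             for (; i < size.width; ++i)    dst[i] = saturate_cast<u16>(src[i]);
 *         }
 *     }
 */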
16, + uint8x16x2_t vline; + vline.val[1] = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + vline.val[0] = vld1q_u8(_src + i); + vst2q_u8((uint8_t*)(_dst + i), vline); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u8, s32, 16, + register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0); + register uint8x16_t zero1 asm ("q2") = vmovq_n_u8(0); + register uint8x16_t zero2 asm ("q3") = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vst4.8 {d0,d2,d4,d6}, [%[dst1]] \n\t" + "vst4.8 {d1,d3,d5,d7}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (zero0), "w" (zero1), "w" (zero2) + : "d0","d1" + ); + } +}) +#else +CVT_FUNC(u8, s32, 16, + uint8x16x4_t vline; + vline.val[1] = vmovq_n_u8(0); + vline.val[2] = vmovq_n_u8(0); + vline.val[3] = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + vline.val[0] = vld1q_u8(_src + i); + vst4q_u8((uint8_t*)(_dst + i), vline); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.u8 q1, d0 \n\t" + "vmovl.u8 q2, d1 \n\t" + "vmovl.u16 q3, d2 \n\t" + "vmovl.u16 q4, d3 \n\t" + "vmovl.u16 q5, d4 \n\t" + "vmovl.u16 q6, d5 \n\t" + "vcvt.f32.u32 q7, q3 \n\t" + "vcvt.f32.u32 q8, q4 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVT_FUNC(u8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline_u8 = vld1q_u8(_src + i); + + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline_u8)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline_u8)); + + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + + vst1q_f32(_dst + i, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +CVT_FUNC(s8, u8, 16, + int8x16_t vZero = vdupq_n_s8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vu8 = vld1q_s8(_src + i); + uint8x16_t vu1 = vreinterpretq_u8_s8(vmaxq_s8(vu8, vZero)); + vst1q_u8(_dst + i, vu1); + } +}) + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(s8, u16, 16, + register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmax.s8 q0, q1 \n\t" + "vst2.8 {d0,d2}, [%[dst1]] \n\t" + "vst2.8 {d1,d3}, [%[dst2]] \n\t" + : 
/*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (zero0) + : "d0","d1" + ); + } +}) +#else +CVT_FUNC(s8, u16, 16, + int8x16x2_t vline_s8; + vline_s8.val[1] = vmovq_n_s8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + vline_s8.val[0] = vld1q_s8(_src + i); + vline_s8.val[0] = vmaxq_s8(vline_s8.val[0], vline_s8.val[1]); + vst2q_s8((int8_t*)(_dst + i), vline_s8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s8, s16, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.s8 q1, d0 \n\t" + "vmovl.s8 q2, d1 \n\t" + "vst1.16 {d2-d3}, [%[dst1]] \n\t" + "vst1.16 {d4-d5}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s8, s16, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline_s8 = vld1q_s8(_src + i); + + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8)); + + vst1q_s16(_dst + i, vline1_s16); + vst1q_s16(_dst + i + 8, vline2_s16); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(s8, s32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.s8 q1, d0 \n\t" + "vmovl.s8 q2, d1 \n\t" + "vmovl.s16 q3, d2 \n\t" + "vmovl.s16 q4, d3 \n\t" + "vmovl.s16 q5, d4 \n\t" + "vmovl.s16 q6, d5 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + "vst1.32 {d10-d11}, [%[dst3]] \n\t" + "vst1.32 {d12-d13}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +}) +#else +CVT_FUNC(s8, s32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline_s8 = vld1q_s8(_src + i); + + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8)); + + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + + vst1q_s32(_dst + i, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + vst1q_s32(_dst + i + 8, vline3_s32); + vst1q_s32(_dst + i + 12, vline4_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.s8 q1, d0 \n\t" + "vmovl.s8 q2, d1 \n\t" + "vmovl.s16 q3, d2 \n\t" + "vmovl.s16 q4, d3 \n\t" + "vmovl.s16 q5, d4 \n\t" + "vmovl.s16 q6, d5 \n\t" + "vcvt.f32.s32 q7, q3 \n\t" + "vcvt.f32.s32 q8, q4 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12) + : 
"d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVT_FUNC(s8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline_s8 = vld1q_s8(_src + i); + + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8)); + + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + + vst1q_f32(_dst + i, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vqmovn.u16 d4, q0 \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovn.u16 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(u16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint16x8_t vline1_u16 = vld1q_u16(_src + i); + uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8); + + uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16); + uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16); + + vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u16, s8, 16, + register uint8x16_t v127 asm ("q4") = vmovq_n_u8(127);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vqmovn.u16 d4, q0 \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovn.u16 d5, q1 \n\t" + "vmin.u8 q3, q2, q4 \n\t" + "vst1.8 {d6-d7}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0), + "w" (v127) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); + } +}) +#else +CVT_FUNC(u16, s8, 16, + uint8x8_t v127 = vmov_n_u8(127);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint16x8_t vline1_u16 = vld1q_u16(_src + i); + uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8); + + uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16); + uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16); + vline1_u8 = vmin_u8(vline1_u8, v127); + vline2_u8 = vmin_u8(vline2_u8, v127); + + vst1q_s8(_dst + i, vcombine_s8(vreinterpret_s8_u8(vline1_u8), vreinterpret_s8_u8(vline2_u8))); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u16, s16, 8, + register uint16x8_t v32767 asm ("q4") = vmovq_n_u16(0x7FFF);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmin.u16 q1, q0, q4 \n\t" + "vst1.16 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (v32767) + : "d0","d1","d2","d3" + ); + } +}) +#else +CVT_FUNC(u16, s16, 8, + uint16x8_t v32767 = vmovq_n_u16(0x7FFF);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline_u16 
= vld1q_u16(_src + i); + vline_u16 = vminq_u16(vline_u16, v32767); + vst1q_s16((_dst + i), vreinterpretq_s16_u16(vline_u16)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u16, s32, 8, + register uint16x8_t zero0 asm ("q1") = vmovq_n_u16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vst2.16 {d0,d2}, [%[dst1]] \n\t" + "vst2.16 {d1,d3}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (zero0) + : "d0","d1"//,"d2","d3"//,"d4","d5","d6","d7" + ); + } +}) +#else +CVT_FUNC(u16, s32, 8, + uint16x8x2_t vline; + vline.val[1] = vmovq_n_u16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + vline.val[0] = vld1q_u16(_src + i); + vst2q_u16((uint16_t*)(_dst + i), vline); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmovl.u16 q1, d0 \n\t" + "vmovl.u16 q2, d1 \n\t" + "vcvt.f32.u32 q3, q1 \n\t" + "vcvt.f32.u32 q4, q2 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); + } +}) +#else +CVT_FUNC(u16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline_u16 = vld1q_u16(_src + i); + + uint32x4_t vline_u32_lo = vmovl_u16(vget_low_u16(vline_u16)); + uint32x4_t vline_u32_hi = vmovl_u16(vget_high_u16(vline_u16)); + + float32x4_t vline_f32_lo = vcvtq_f32_u32(vline_u32_lo); + float32x4_t vline_f32_hi = vcvtq_f32_u32(vline_u32_hi); + + vst1q_f32(_dst + i, vline_f32_lo); + vst1q_f32(_dst + i + 4, vline_f32_hi); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovun.s16 d4, q0 \n\t" + "vqmovun.s16 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int16x8_t vline1_s16 = vld1q_s16(_src + i); + int16x8_t vline2_s16 = vld1q_s16(_src + i + 8); + + uint8x8_t vline1_u8 = vqmovun_s16(vline1_s16); + uint8x8_t vline2_u8 = vqmovun_s16(vline2_s16); + + vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, s8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovn.s16 d4, q0 \n\t" + "vqmovn.s16 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s16, s8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int16x8_t vline1_s16 = vld1q_s16(_src + i); + int16x8_t vline2_s16 = vld1q_s16(_src + i + 8); + + int8x8_t vline1_s8 = vqmovn_s16(vline1_s16); + int8x8_t vline2_s8 = vqmovn_s16(vline2_s16); + + vst1q_s8(_dst + i, vcombine_s8(vline1_s8, vline2_s8)); + } +}) +#endif + +#if 
__GNUC_MINOR__ < 7 +CVT_FUNC(s16, u16, 8, + register int16x8_t vZero asm ("q4") = vmovq_n_s16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmax.s16 q1, q0, q4 \n\t" + "vst1.16 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vZero) + : "d0","d1","d2","d3" + ); + } +}) +#else +CVT_FUNC(s16, u16, 8, + int16x4_t vZero = vmov_n_s16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline_s16 = vld1q_s16(_src + i); + + int16x4_t vline_s16_lo = vmax_s16(vget_low_s16(vline_s16), vZero); + int16x4_t vline_s16_hi = vmax_s16(vget_high_s16(vline_s16), vZero); + + vst1q_u16(_dst + i, vcombine_u16(vreinterpret_u16_s16(vline_s16_lo), vreinterpret_u16_s16(vline_s16_hi))); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, s32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmovl.s16 q1, d0 \n\t" + "vmovl.s16 q2, d1 \n\t" + "vst1.32 {d2-d3}, [%[dst1]] \n\t" + "vst1.32 {d4-d5}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s16, s32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline_s16 = vld1q_s16(_src + i); + + int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16)); + int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16)); + + vst1q_s32(_dst + i, vline_s32_lo); + vst1q_s32(_dst + i + 4, vline_s32_hi); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmovl.s16 q1, d0 \n\t" + "vmovl.s16 q2, d1 \n\t" + "vcvt.f32.s32 q3, q1 \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); + } +}) +#else +CVT_FUNC(s16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline_s16 = vld1q_s16(_src + i); + + int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16)); + int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16)); + float32x4_t vline_f32_lo = vcvtq_f32_s32(vline_s32_lo); + float32x4_t vline_f32_hi = vcvtq_f32_s32(vline_s32_hi); + + vst1q_f32(_dst + i, vline_f32_lo); + vst1q_f32(_dst + i + 4, vline_f32_hi); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, u8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovun.s32 d4, q0 \n\t" + "vqmovun.s32 d5, q1 \n\t" + "vqmovn.u16 d6, q2 \n\t" + "vst1.8 {d6}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5","d6" + ); + } +}) +#else +CVT_FUNC(s32, u8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32); + uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32); + uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, 
vline2_u16)); + + vst1_u8(_dst + i, vline_u8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, s8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovn.s32 d4, q0 \n\t" + "vqmovn.s32 d5, q1 \n\t" + "vqmovn.s16 d6, q2 \n\t" + "vst1.8 {d6}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5","d6" + ); + } +}) +#else +CVT_FUNC(s32, s8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + int16x4_t vline1_s16 = vqmovn_s32(vline1_s32); + int16x4_t vline2_s16 = vqmovn_s32(vline2_s32); + int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, vline2_s16)); + + vst1_s8(_dst + i, vline_s8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, u16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovun.s32 d4, q0 \n\t" + "vqmovun.s32 d5, q1 \n\t" + "vst1.16 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s32, u16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32); + uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32); + + vst1q_u16(_dst + i, vcombine_u16(vline1_u16, vline2_u16)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, s16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovn.s32 d4, q0 \n\t" + "vqmovn.s32 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s32, s16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + int16x4_t vline1_s16 = vqmovn_s32(vline1_s32); + int16x4_t vline2_s16 = vqmovn_s32(vline2_s32); + + vst1q_s16(_dst + i, vcombine_s16(vline1_s16, vline2_s16)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src]] \n\t" + "vcvt.f32.s32 q1, q0 \n\t" + "vst1.32 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3"//,"d4","d5" + ); + __asm__ ( + "vld1.32 {d0-d1}, [%[src]] \n\t" + "vcvt.f32.s32 q1, q0 \n\t" + "vst1.32 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i + 4), + [dst] "r" (_dst + i + 4) + : "d0","d1","d2","d3"//,"d4","d5" + ); + } +}) +#else +CVT_FUNC(s32, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline_s32 = vld1q_s32(_src + i); + float32x4_t vline_f32 = vcvtq_f32_s32(vline_s32); + vst1q_f32(_dst + i, vline_f32); + + vline_s32 = vld1q_s32(_src + i + 4); + vline_f32 = vcvtq_f32_s32(vline_s32); + vst1q_f32(_dst + i + 
4, vline_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, u8, 8, + register float32x4_t vmult asm ("q0") = vdupq_n_f32((float)(1 << 16)); + register uint32x4_t vmask asm ("q1") = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vcvt.u32.f32 q6, q4 \n\t" + "vcvt.u32.f32 q7, q5 \n\t" + "vbic q8, q1, q6 \n\t" + "vbic q9, q1, q7 \n\t" + "vshr.u32 q10, q8, #16 \n\t" + "vshr.u32 q11, q9, #16 \n\t" + "vqsub.u32 q12, q6, q10 \n\t" + "vqsub.u32 q13, q7, q11 \n\t" + "vqrshrn.u32 d28, q12, #16 \n\t" + "vqrshrn.u32 d29, q13, #16 \n\t" + "vqmovn.u16 d30, q14 \n\t" + "vst1.8 {d30}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vmult), "w" (vmask) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30" + ); + } +}) +#else +CVT_FUNC(f32, u8, 8, + float32x4_t vmult = vdupq_n_f32((float)(1 << 16)); + uint32x4_t vmask = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + + float32x4_t vline1w_f32 = vmulq_f32(vline1_f32, vmult); + float32x4_t vline2w_f32 = vmulq_f32(vline2_f32, vmult); + + uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1w_f32); + uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2w_f32); + + uint32x4_t vl1_masked = vbicq_u32(vmask, vline1_u32); + uint32x4_t vl2_masked = vbicq_u32(vmask, vline2_u32); + uint32x4_t vl1_masked2 = vshrq_n_u32(vl1_masked, 16); + uint32x4_t vl2_masked2 = vshrq_n_u32(vl2_masked, 16); + uint32x4_t vline1r_u32 = vqsubq_u32(vline1_u32, vl1_masked2); + uint32x4_t vline2r_u32 = vqsubq_u32(vline2_u32, vl2_masked2); + + uint16x4_t vline1_u16 = vqrshrn_n_u32(vline1r_u32, 16); + uint16x4_t vline2_u16 = vqrshrn_n_u32(vline2r_u32, 16); + + uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, vline2_u16)); + vst1_u8(_dst + i, vline_u8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, s8, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src1]] \n\t" + "vld1.32 {d4-d5}, [%[src2]] \n\t" + "vadd.f32 q3, q1, q0 \n\t" + "vadd.f32 q4, q2, q0 \n\t" + "vcvt.s32.f32 q5, q3 \n\t" + "vcvt.s32.f32 q6, q4 \n\t" + "vqmovn.s32 d14, q5 \n\t" + "vqmovn.s32 d15, q6 \n\t" + "vqmovn.s16 d16, q7 \n\t" + "vst1.8 {d16}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17" + ); + } +}) +#else +CVT_FUNC(f32, s8, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + + vline1_f32 = vaddq_f32(vline1_f32, vhalf); + vline2_f32 = vaddq_f32(vline2_f32, vhalf); + + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vline1_s16 = vqmovn_s32(vline1_s32); + int16x4_t vline2_s16 = vqmovn_s32(vline2_s32); + + int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, 
vline2_s16)); + + vst1_s8(_dst + i, vline_s8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, u16, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.u32.f32 q3, q2 \n\t" + "vqmovn.u32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.u32.f32 q3, q2 \n\t" + "vqmovn.u32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i + 4), + [dst] "r" (_dst + i + 4), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + } +}) +#else +CVT_FUNC(f32, u16, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline_f32 = vld1q_f32(_src + i); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + uint32x4_t vline_u32 = vcvtq_u32_f32(vline_f32); + uint16x4_t vline_u16 = vqmovn_u32(vline_u32); + + vst1_u16(_dst + i, vline_u16); + + vline_f32 = vld1q_f32(_src + i + 4); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + vline_u32 = vcvtq_u32_f32(vline_f32); + vline_u16 = vqmovn_u32(vline_u32); + + vst1_u16(_dst + i + 4, vline_u16); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, s16, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.s32.f32 q3, q2 \n\t" + "vqmovn.s32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.s32.f32 q3, q2 \n\t" + "vqmovn.s32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i + 4), + [dst] "r" (_dst + i + 4), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + } +}) +#else +CVT_FUNC(f32, s16, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline_f32 = vld1q_f32(_src + i); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32); + int16x4_t vline_s16 = vqmovn_s32(vline_s32); + + vst1_s16(_dst + i, vline_s16); + + vline_f32 = vld1q_f32(_src + i + 4); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + vline_s32 = vcvtq_s32_f32(vline_f32); + vline_s16 = vqmovn_s32(vline_s32); + + vst1_s16(_dst + i + 4, vline_s16); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, s32, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src1]] \n\t" + "vld1.32 {d4-d5}, [%[src2]] \n\t" + "vadd.f32 q3, q1, q0 \n\t" + "vadd.f32 q4, q2, q0 \n\t" + "vcvt.s32.f32 q5, q3 \n\t" + "vcvt.s32.f32 q6, q4 \n\t" + "vst1.32 {q5}, [%[dst1]] \n\t" + "vst1.32 {q6}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +}) +#else +CVT_FUNC(f32, s32, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, 
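/* Illustrative note, not part of the original patch: the f32 -> integer
 * kernels above and below all implement rounding the same way. The value is
 * primed with +0.5 (the vhalf register) and then converted with the
 * truncating vcvt instruction, so e.g. 3.7f + 0.5f = 4.2f truncates to 4,
 * while 3.2f + 0.5f = 3.7f truncates to 3. Because vcvt truncates toward
 * zero, this is exact round-half-up only for non-negative inputs; the
 * saturating narrows (vqmovn/vqmovun) used by the u8/s8/u16/s16 variants
 * then clamp the result to the destination range.
 */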
+{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline_f32 = vld1q_f32(_src + i); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32); + + vst1q_s32(_dst + i, vline_s32); + + vline_f32 = vld1q_f32(_src + i + 4); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + vline_s32 = vcvtq_s32_f32(vline_f32); + + vst1q_s32(_dst + i + 4, vline_s32); + } +}) +#endif + +void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride) +{ + convert(_size, srcBase, srcStride, (u16*)dstBase, dstStride); +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/convert_depth.cpp b/3rdparty/carotene/src/convert_depth.cpp new file mode 100644 index 0000000000..21b0c18a69 --- /dev/null +++ b/3rdparty/carotene/src/convert_depth.cpp @@ -0,0 +1,399 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template +void lshiftConst(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + vst1q_s16(dst + j, vshlq_n_s16(v_dst0, shift)); + vst1q_s16(dst + j + 8, vshlq_n_s16(v_dst1, shift)); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); + vst1q_s16(dst + j, vshlq_n_s16(v_dst, shift)); + } + + for (; j < size.width; j++) + { + dst[j] = ((s16)src[j] << shift); + } + } +} + +template <> +void lshiftConst<0>(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + dst[j] = (s16)src[j]; + } + } +} + +template +void rshiftConst(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
+        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (cpolicy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src + j);
+                int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
+                          v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
+                uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0),
+                                               vqmovun_s16(v_src1));
+                vst1q_u8(dst + j, v_dst);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
+                vst1_u8(dst + j, vqmovun_s16(v_src));
+            }
+
+            for (; j < size.width; j++)
+            {
+                dst[j] = internal::saturate_cast<u8>((src[j] >> shift));
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src + j);
+                int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
+                          v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
+                int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0),
+                                              vmovn_s16(v_src1));
+                vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
+                vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
+            }
+
+            for (; j < size.width; j++)
+            {
+                dst[j] = (u8)((src[j] >> shift));
+            }
+        }
+    }
+}
+
+template <>
+void rshiftConst<0>(const Size2D &size,
+                    const s16 * srcBase, ptrdiff_t srcStride,
+                    u8 * dstBase, ptrdiff_t dstStride,
+                    CONVERT_POLICY cpolicy)
+{
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
+        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (cpolicy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src + j);
+                int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
+                uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0), vqmovun_s16(v_src1));
+                vst1q_u8(dst + j, v_dst);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src = vld1q_s16(src + j);
+                vst1_u8(dst + j, vqmovun_s16(v_src));
+            }
+
+            for (; j < size.width; j++)
+            {
+                dst[j] = internal::saturate_cast<u8>(src[j]);
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src + j);
+                int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
+                int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0), vmovn_s16(v_src1));
+                vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src = vld1q_s16(src + j);
+                vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
+            }
+
+            for (; j < size.width; j++)
+            {
+                dst[j] = (u8)src[j];
+            }
+        }
+    }
+}
+
+typedef void (* lshiftConstFunc)(const Size2D &size,
+                                 const u8 * srcBase, ptrdiff_t srcStride,
+                                 s16 * dstBase, ptrdiff_t dstStride);
+
+typedef void (* rshiftConstFunc)(const Size2D &size,
+                                 const s16 * srcBase, ptrdiff_t srcStride,
+                                 u8 * dstBase, ptrdiff_t dstStride,
+                                 CONVERT_POLICY cpolicy);
+
+} // namespace
+
+#endif
+
+void lshift(const Size2D &size,
+            const u8 * srcBase, ptrdiff_t srcStride,
+            s16 * dstBase, ptrdiff_t dstStride,
+            u32 shift)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    if (shift >= 16u)
+    {
+        for (size_t i = 0; i < size.height; ++i)
+        {
+            s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
+            std::memset(dst, 0, sizeof(s16) * size.width);
+        }
+        return;
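    // Illustrative note, not part of the original patch: the funcs[16]
    // tables below exist because the vshlq_n_s16 / vshrq_n_s16 intrinsics
    // require their shift count to be a compile-time constant, so a runtime
    // shift cannot be passed straight through. The generic pattern is:
    //
    //     template <int N> void kernel(...);       // N becomes an immediate
    //     typedef void (*Fn)(...);
    //     static const Fn table[16] = { kernel<0>, /* ... */ kernel<15> };
    //     table[runtimeShift](...);                // one indirect call
    //
    // trading a single indirect call for sixteen instantiations of the loop.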
+    }
+
+    // this ugly construction is needed to avoid:
+    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
+    // return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
+
+    lshiftConstFunc funcs[16] =
+    {
+        lshiftConst<0>,
+        lshiftConst<1>,
+        lshiftConst<2>,
+        lshiftConst<3>,
+        lshiftConst<4>,
+        lshiftConst<5>,
+        lshiftConst<6>,
+        lshiftConst<7>,
+        lshiftConst<8>,
+        lshiftConst<9>,
+        lshiftConst<10>,
+        lshiftConst<11>,
+        lshiftConst<12>,
+        lshiftConst<13>,
+        lshiftConst<14>,
+        lshiftConst<15>
+    }, func = funcs[shift];
+
+    func(size, srcBase, srcStride, dstBase, dstStride);
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)shift;
+#endif
+}
+
+void rshift(const Size2D &size,
+            const s16 * srcBase, ptrdiff_t srcStride,
+            u8 * dstBase, ptrdiff_t dstStride,
+            u32 shift, CONVERT_POLICY cpolicy)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    if (shift >= 16)
+    {
+        if (cpolicy == CONVERT_POLICY_WRAP)
+        {
+            size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+            size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+            int16x8_t v_zero = vdupq_n_s16(0);
+
+            for (size_t i = 0; i < size.height; ++i)
+            {
+                const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
+                u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+                size_t j = 0;
+
+                for (; j < roiw16; j += 16)
+                {
+                    internal::prefetch(src + j);
+                    int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
+                    uint8x16_t v_dst = vcombine_u8(vmovn_u16(vcltq_s16(v_src0, v_zero)),
+                                                   vmovn_u16(vcltq_s16(v_src1, v_zero)));
+                    vst1q_u8(dst + j, v_dst);
+                }
+                for (; j < roiw8; j += 8)
+                {
+                    int16x8_t v_src = vld1q_s16(src + j);
+                    vst1_u8(dst + j, vmovn_u16(vcltq_s16(v_src, v_zero)));
+                }
+
+                for (; j < size.width; j++)
+                {
+                    dst[j] = src[j] >= 0 ? 0 : 255;
+                }
+            }
+        }
+        else
+        {
+            for (size_t i = 0; i < size.height; ++i)
+            {
+                u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+                std::memset(dst, 0, sizeof(u8) * size.width);
+            }
+        }
+        return;
+    }
+
+    // this ugly construction is needed to avoid:
+    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
+    // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
+
+    rshiftConstFunc funcs[16] =
+    {
+        rshiftConst<0>,
+        rshiftConst<1>,
+        rshiftConst<2>,
+        rshiftConst<3>,
+        rshiftConst<4>,
+        rshiftConst<5>,
+        rshiftConst<6>,
+        rshiftConst<7>,
+        rshiftConst<8>,
+        rshiftConst<9>,
+        rshiftConst<10>,
+        rshiftConst<11>,
+        rshiftConst<12>,
+        rshiftConst<13>,
+        rshiftConst<14>,
+        rshiftConst<15>
+    }, func = funcs[shift];
+
+    func(size, srcBase, srcStride, dstBase, dstStride, cpolicy);
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)shift;
+    (void)cpolicy;
+#endif
+}
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/convert_scale.cpp b/3rdparty/carotene/src/convert_scale.cpp
new file mode 100644
index 0000000000..50c110b3ee
--- /dev/null
+++ b/3rdparty/carotene/src/convert_scale.cpp
@@ -0,0 +1,2498 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
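convert_scale.cpp below generates the convertScale() family: every element is mapped through dst = saturate_cast<T2>(src * alpha + beta), with the round-to-nearest bias folded into the vshift term ((f32)beta + 0.5f) so the truncating NEON float-to-int convert rounds correctly. A scalar reference of the intended mapping, assuming a u8 destination (the clamp absorbs the floor-versus-truncation difference for negative intermediates):

```
#include <algorithm>
#include <cmath>
#include <cstdint>

static uint8_t convertScaleRef(uint8_t src, double alpha, double beta)
{
    double v = src * alpha + beta;
    int r = (int)std::floor(v + 0.5);               // round half up, like "+ 0.5f" then vcvt
    return (uint8_t)std::min(255, std::max(0, r));  // like the vqmovun/vqmovn saturation
}
```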
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convertScale(const Size2D &_size, \ + const T1 * srcBase, ptrdiff_t srcStride, \ + T2 * dstBase, ptrdiff_t dstStride, \ + f64 alpha, f64 beta) \ + { \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dstStride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + const ptrdiff_t sstep = srcStride / sizeof(T1); \ + const ptrdiff_t dstep = dstStride / sizeof(T2); \ + const size_t w = size.width & ~(SIMD_SIZE-1); \ + if (size.width >= SIMD_SIZE) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + CVTINIT \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + CVTROW \ + } \ + if(w < size.width) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + for(size_t i = w; i < size.width; i++ ) \ + _dst[i] = internal::saturate_cast(_src[i]*alpha + beta); \ + } \ + } + +#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \ + void convertScale(const Size2D &_size, \ + const T1 * srcBase, ptrdiff_t srcStride, \ + T1 * dstBase, ptrdiff_t dstStride, \ + f64 alpha, f64 beta) \ + { \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dstStride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + const ptrdiff_t sstep = srcStride / sizeof(T1); \ + const ptrdiff_t dstep = dstStride / sizeof(T1); \ + const size_t w = size.width & ~(SIMD_SIZE-1); \ + if (size.width >= SIMD_SIZE) \ + { \ + const T1* _src = srcBase; \ + T1* _dst = dstBase; \ + CVTSINIT \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + CVTSROW \ + } \ + if(w < size.width) \ + { \ + const T1* _src = srcBase; \ + T1* _dst = dstBase; \ + for (ptrdiff_t h = size.height; h--; _src += 
sstep, _dst += dstep ) \ + for(size_t i = w; i < size.width; i++ ) \ + _dst[i] = internal::saturate_cast(_src[i]*alpha + beta); \ + } \ + } + +#else + +#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convertScale(const Size2D &, \ + const T1 *, ptrdiff_t, \ + T2 *, ptrdiff_t, \ + f64, f64) \ + { \ + internal::assertSupportedConfiguration(); \ + } + +#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \ + void convertScale(const Size2D &, \ + const T1 *, ptrdiff_t, \ + T1 *, ptrdiff_t, \ + f64, f64) \ + { \ + internal::assertSupportedConfiguration(); \ + } + +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC1(u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vqmovn.u16 d26, q11 \n\t" + "vqmovn.u16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC1(u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); + uint16x8_t 
vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); + vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(u8, s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vqmovn.s16 d26, q11 \n\t" + "vqmovn.s16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : //no output + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); + int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)); + vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(u8, u16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 
16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, u16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32))); + vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(u8, s16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 
\n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : //no output + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, s16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32))); + vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32))); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u8, s32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] 
"r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + "d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, s32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + vst1q_s32(_dst + i + 8, vline3_s32); + vst1q_s32(_dst + i + 12, vline4_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u8, f32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + "vst1.32 {d10-d11}, [%[dst3]] \n\t" + "vst1.32 {d12-d13}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + "d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, f32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = 
vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(s8, u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vqmovn.u16 d26, q11 \n\t" + "vqmovn.u16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, 
vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); + uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); + vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC1(s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vqmovn.s16 d26, q11 \n\t" + "vqmovn.s16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC1(s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); + int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), 
vqmovn_s32(vline4_s32)); + vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(s8, u16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, u16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); + uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); + vst1q_u16(_dst + i + 0, vRes1_u16); + vst1q_u16(_dst + i + 8, vRes2_u16); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(s8, s16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] 
\n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, s16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); + int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)); + vst1q_s16(_dst + i + 0, vRes1_s16); + vst1q_s16(_dst + i + 8, vRes2_s16); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s8, s32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, 
q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + "d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, s32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + vst1q_s32(_dst + i + 8, vline3_s32); + vst1q_s32(_dst + i + 12, vline4_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s8, f32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + "vst1.32 {d10-d11}, [%[dst3]] \n\t" + "vst1.32 {d12-d13}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + 
"d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, f32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovun.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(u16, u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2)); + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, 
+{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovn.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(u16, s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(u16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovun.s32 d26, q11 \n\t" + "vqmovun.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vshift), "w" (vscale) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC1(u16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + 
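+        // vqmovun_s32 saturates each signed 32-bit lane back to u16; the two halves
+        // are recombined below so the result goes out in a single 128-bit store.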
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, s16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vshift), "w" (vscale) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC(u16, s16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vst1.32 {d22-d23}, [%[dst1]] \n\t" + "vst1.32 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vshift), "w" (vscale) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(u16, s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t 
vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vst1.32 {d18-d19}, [%[dst1]] \n\t" + "vst1.32 {d20-d21}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVTS_FUNC(u16, f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovun.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(s16, u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = 
vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2)); + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovn.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(s16, s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, u16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovun.s32 d26, q11 \n\t" + "vqmovun.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC(s16, u16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, 
vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(s16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vshift), "w" (vscale) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC1(s16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vst1.32 {d22-d23}, [%[dst1]] \n\t" + "vst1.32 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(s16, s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + 
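+        // Both s16 halves are now sign-extended to s32; the scale/shift runs in f32
+        // and the lanes are converted back (with truncation, hence the +0.5f in vshift).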
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vst1.32 {d18-d19}, [%[dst1]] \n\t" + "vst1.32 {d20-d21}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVTS_FUNC(s16, f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, u8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovun.s32 d24, q10 \n\t" + "vqmovun.s32 d25, q11 \n\t" + "vqmovn.u16 d26, q12 \n\t" + "vst1.8 {d26}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26" + ); + } +}) +#else +CVTS_FUNC(s32, u8, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = 
vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2)); + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, s8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovn.s32 d24, q10 \n\t" + "vqmovn.s32 d25, q11 \n\t" + "vqmovn.s16 d26, q12 \n\t" + "vst1.8 {d26}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26" + ); + } +}) +#else +CVTS_FUNC(s32, s8, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, u16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovun.s32 d24, q10 \n\t" + "vqmovun.s32 d25, q11 \n\t" + "vst1.16 {d24-d25}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(s32, u16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = 
vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, s16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovn.s32 d24, q10 \n\t" + "vqmovn.s32 d25, q11 \n\t" + "vst1.8 {d24-d25}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(s32, s16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vst1.32 {d20-d21}, [%[dst1]] \n\t" + "vst1.32 {d22-d23}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); + } +}) +#else +CVTS_FUNC1(s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i 
+ 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vst1.32 {d16-d17}, [%[dst1]] \n\t" + "vst1.32 {d18-d19}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(s32, f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, u8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)((1 << 16)*alpha)); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)((1 << 16)*beta)); + register uint32x4_t vmask asm ("q2") = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d6-d7}, [%[src1]] \n\t" + "vld1.32 {d8-d9}, [%[src2]] \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vcvt.u32.f32 q9, q7 \n\t" + "vcvt.u32.f32 q10, q8 \n\t" + "vbic q11, q2, q6 \n\t" + "vbic q12, q2, q7 \n\t" + "vshr.u32 q13, q11, #16 \n\t" + "vshr.u32 q14, q12, #16 \n\t" + "vqsub.u32 q7, q9, q13 \n\t" + "vqsub.u32 q8, q10, q14 \n\t" + "vqrshrn.u32 d22, q7, #16 \n\t" + "vqrshrn.u32 d23, q8, #16 \n\t" + "vqmovn.u16 d30, q11 \n\t" + "vst1.8 {d30}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift), "w" (vmask) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30" + ); + } +}) +#else +CVTS_FUNC(f32, u8, 8, + float32x4_t vscale = vdupq_n_f32((f32)((1 << 16)*alpha)); + float32x4_t vshift = vdupq_n_f32((f32)((1 << 16)*beta)); + uint32x4_t vmask = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i 
+ 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + float32x4_t vline1Shifted_f32 = vaddq_f32(vline1_f32, vshift); + float32x4_t vline2Shifted_f32 = vaddq_f32(vline2_f32, vshift); + uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1Shifted_f32); + uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2Shifted_f32); + uint32x4_t vline1Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline2_f32)); + uint32x4_t vline2Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline1Shifted_f32)); + vline1Mask = vshrq_n_u32(vline1Mask, 16); + vline2Mask = vshrq_n_u32(vline2Mask, 16); + vline1_u32 = vqsubq_u32(vline1_u32, vline1Mask); + vline2_u32 = vqsubq_u32(vline2_u32, vline2Mask); + uint16x4_t vRes1 = vqrshrn_n_u32(vline1_u32, 16); + uint16x4_t vRes2 = vqrshrn_n_u32(vline2_u32, 16); + uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2)); + + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, s8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.s32.f32 q8, q6 \n\t" + "vcvt.s32.f32 q9, q7 \n\t" + "vqmovn.s32 d14, q8 \n\t" + "vqmovn.s32 d15, q9 \n\t" + "vqmovn.s16 d16, q7 \n\t" + "vst1.8 {d16}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(f32, s8, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, u16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.u32.f32 q8, q6 \n\t" + "vcvt.u32.f32 q9, q7 \n\t" + "vqmovn.u32 d8, q8 \n\t" + "vqmovn.u32 d9, q9 \n\t" + "vst1.16 {d8-d9}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(f32, u16, 8, + float32x4_t vscale = 
vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32); + uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovn_u32(vline1_u32); + uint16x4_t vRes2 = vqmovn_u32(vline2_u32); + vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, s16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.s32.f32 q8, q6 \n\t" + "vcvt.s32.f32 q9, q7 \n\t" + "vqmovn.s32 d8, q8 \n\t" + "vqmovn.s32 d9, q9 \n\t" + "vst1.16 {d8-d9}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(f32, s16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.s32.f32 q4, q6 \n\t" + "vcvt.s32.f32 q5, q7 \n\t" + "vst1.32 {d8-d9}, [%[dst1]] \n\t" + "vst1.32 {d10-d11}, [%[dst2]] \n\t" + : //no output + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15" + ); + } +}) +#else +CVTS_FUNC(f32, s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, 
vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vst1.32 {d12-d13}, [%[dst1]] \n\t" + "vst1.32 {d14-d15}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC1(f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/convolution.cpp b/3rdparty/carotene/src/convolution.cpp new file mode 100644 index 0000000000..498d7ad883 --- /dev/null +++ b/3rdparty/carotene/src/convolution.cpp @@ -0,0 +1,340 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "saturate_cast.hpp" + +namespace CAROTENE_NS { + +bool isConvolutionSupported(const Size2D &size, const Size2D &ksize, + BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 8 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE) && + (ksize.width == 3) && (ksize.height == 3); +} + +#ifdef CAROTENE_NEON + +namespace { + +template <int shift> +int32x4_t vshrq_s32(int32x4_t value) +{ + return vshrq_n_s32(value, shift); +} + +template <> +int32x4_t vshrq_s32<0>(int32x4_t value) +{ + return value; +} + +} // namespace + +typedef int32x4_t (* vshrq_s32_func)(int32x4_t value); + +#endif + +void convolution(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue, + const Size2D & ksize, s16 * kernelBase, u32 scale) +{ + internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border)); +#ifdef CAROTENE_NEON + const uint8x8_t v_zero_u8 = vdup_n_u8(0); + const uint8x8_t v_border = vdup_n_u8(borderValue); + const int32x4_t v_zero_s32 = vdupq_n_s32(0); + + uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 }, + tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 }, + tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 }; + uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8; + + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + static const vshrq_s32_func vshrq_s32_a[33] = + { + vshrq_s32<0>, + vshrq_s32<1>, + vshrq_s32<2>, + vshrq_s32<3>, + vshrq_s32<4>, + vshrq_s32<5>, + vshrq_s32<6>, + vshrq_s32<7>, + vshrq_s32<8>, + vshrq_s32<9>, + vshrq_s32<10>, + vshrq_s32<11>, + vshrq_s32<12>, + vshrq_s32<13>, + vshrq_s32<14>, + vshrq_s32<15>, + vshrq_s32<16>, + vshrq_s32<17>, + vshrq_s32<18>, + vshrq_s32<19>, + vshrq_s32<20>, + vshrq_s32<21>, + vshrq_s32<22>, + vshrq_s32<23>, + vshrq_s32<24>, + vshrq_s32<25>, + vshrq_s32<26>, + vshrq_s32<27>, + vshrq_s32<28>, + vshrq_s32<29>, + vshrq_s32<30>, + vshrq_s32<31>, + vshrq_s32<32> + }; + vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale]; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + u8 prevx[3] = { 0, 0, 0 }, + currx[3] = { 0, 0, 0 }, + nextx[3] = { 0, 0, 0 }; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8); + + // perform vertical convolution + for ( ; x <= bwidth; x += 8) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = !srow2 ?
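+ /*
+  * Why the vshrq_s32_a table above exists: vshrq_n_s32 requires its shift count
+  * to be a compile-time immediate, while `scale` is a runtime argument.
+  * Instantiating vshrq_s32<0>..vshrq_s32<32> and indexing the function-pointer
+  * table converts the runtime value into a constant-immediate shift. A
+  * hypothetical standalone use (illustration only):
+  *
+  *     int32x4_t shifted = vshrq_s32_a[scale](value);  // scale == 2 behaves as vshrq_n_s32(value, 2)
+  */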
v_border : vld1_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 8 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx[0] = prevx[1] = prevx[2] = borderValue; + else + { + prevx[0] = srow0 ? srow0[x4] : borderValue; + prevx[1] = srow1[x4] ; + prevx[2] = srow2 ? srow2[x4] : borderValue; + } + + currx[0] = srow0 ? srow0[x3] : borderValue; + currx[1] = srow1[x3] ; + currx[2] = srow2 ? srow2[x3] : borderValue; + } + + // make shift + if (x) + { + tprev[0] = tcurr[0]; + tcurr[0] = tnext[0]; + + tprev[1] = tcurr[1]; + tcurr[1] = tnext[1]; + + tprev[2] = tcurr[2]; + tcurr[2] = tnext[2]; + } + + tnext[0] = x0; + tnext[1] = x1; + tnext[2] = x2; + + // make extrapolation for the first elements + if (!x) + { + // make border + if (border == BORDER_MODE_CONSTANT) + tcurr[0] = tcurr[1] = tcurr[2] = v_border; + else if (border == BORDER_MODE_REPLICATE) + { + tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0)); + tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0)); + tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0)); + } + + continue; + } + + int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32; + + { + // combine 3 "shifted" vectors + t0 = vext_u8(tprev[0], tcurr[0], 7); + t1 = tcurr[0]; + t2 = vext_u8(tcurr[0], tnext[0], 1); + + int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2)); + + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]); + + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]); + } + + { + // combine 3 "shifted" vectors + t0 = vext_u8(tprev[1], tcurr[1], 7); + t1 = tcurr[1]; + t2 = vext_u8(tcurr[1], tnext[1], 1); + + int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2)); + + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]); + + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]); + } + + { + // combine 3 "shifted" vectors + t0 = vext_u8(tprev[2], tcurr[2], 7); + t1 = tcurr[2]; + t2 = vext_u8(tcurr[2], tnext[2], 1); + + int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2)); + + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]); + + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]); + } + + + // make scale + v_dst0 = vshrq_s32_p(v_dst0); + v_dst1 = 
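+ /*
+  * Indexing note (inferred from the accumulation above and the scalar tail
+  * below): kernelBase is read back to front, i.e. kernelBase[8] multiplies the
+  * top-left neighbour, so the routine computes a true (flipped-kernel)
+  * convolution:
+  *
+  *     dst(x, y) = ( sum over dy, dx in {0, 1, 2} of
+  *                   src(x + dx - 1, y + dy - 1) * kernelBase[(2 - dy) * 3 + (2 - dx)] ) >> scale
+  */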
vshrq_s32_p(v_dst1); + + // and add them + vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0), + vqmovun_s32(v_dst1)))); + } + + x -= 8; + if (x == width) + --x; + + for ( ; x < width; ++x) + { + // make extrapolation for the last elements + if (x + 1 >= width) + { + if (border == BORDER_MODE_CONSTANT) + { + nextx[0] = borderValue; + nextx[1] = borderValue; + nextx[2] = borderValue; + } + else if (border == BORDER_MODE_REPLICATE) + { + nextx[0] = srow0[x]; + nextx[1] = srow1[x]; + nextx[2] = srow2[x]; + } + } + else + { + nextx[0] = srow0 ? srow0[x + 1] : borderValue; + nextx[1] = srow1[x + 1] ; + nextx[2] = srow2 ? srow2[x + 1] : borderValue; + } + + s32 val = 0; + for (s32 _y = 0; _y < 3; ++_y) + val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] + + currx[_y] * kernelBase[(2 - _y) * 3 + 1] + + nextx[_y] * kernelBase[(2 - _y) * 3 + 0]; + + drow[x] = internal::saturate_cast<u8>(val >> scale); + + // make shift + prevx[0] = currx[0]; + currx[0] = nextx[0]; + + prevx[1] = currx[1]; + currx[1] = nextx[1]; + + prevx[2] = currx[2]; + currx[2] = nextx[2]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; + (void)ksize; + (void)kernelBase; + (void)scale; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/count_nonzero.cpp b/3rdparty/carotene/src/count_nonzero.cpp new file mode 100644 index 0000000000..be87767cbd --- /dev/null +++ b/3rdparty/carotene/src/count_nonzero.cpp @@ -0,0 +1,430 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ */ + +#include "common.hpp" + +#include <limits> + +namespace CAROTENE_NS { + +s32 countNonZero(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw16 = size.width & ~15u; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + #define COUNTNONZERO8U_BLOCK_SIZE (16*255) + uint8x16_t vc1 = vmovq_n_u8(1); + for (; i < roiw16;) + { + size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16; + uint8x16_t vs = vmovq_n_u8(0); + + for (; i <= lim; i+= 16) + { + internal::prefetch(src + i); + uint8x16_t vln = vld1q_u8(src + i); + uint8x16_t vnz = vminq_u8(vln, vc1); + vs = vaddq_u8(vs, vnz); + } + + uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs)); + uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4)); + + s32 s[2]; + vst1_u32((u32*)s, vs2); + + if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 2GB of non-zeros... + { + return 0x7fFFffFF; + } + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0) + { + return 0x7fFFffFF; + } + } + for (; i < size.width; i++) + result += (src[i] != 0)?1:0; + if (result < 0)//saturate in case of overflow ~ 2GB of non-zeros... + { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 countNonZero(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width & ~7u; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + #define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1)) + uint16x8_t vc1 = vmovq_n_u16(1); + for (; i < roiw8;) + { + size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8; + uint16x8_t vs = vmovq_n_u16(0); + + for (; i <= lim; i+= 8) + { + internal::prefetch(src + i); + uint16x8_t vln = vld1q_u16(src + i); + uint16x8_t vnz = vminq_u16(vln, vc1); + vs = vaddq_u16(vs, vnz); + } + + uint32x4_t vs4 = vpaddlq_u16(vs); + uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4)); + + s32 s[2]; + vst1_u32((u32*)s, vs2); + + if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros... + { + return 0x7fFFffFF; + } + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0) + { + return 0x7fFFffFF; + } + } + for (; i < size.width; i++) + result += (src[i] != 0)?1:0; + if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...
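+ // Worked bound for the block sizes used by these kernels: each u8 lane of `vs`
+ // can count at most 255 ones before wrapping, so a block may span
+ // 16 lanes * 255 iterations = 16*255 elements between widening reductions
+ // (COUNTNONZERO8U_BLOCK_SIZE). A u16 lane can count 256*256 - 1 = 65535 ones,
+ // giving 8*(256*256-1) for COUNTNONZERO16U_BLOCK_SIZE.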
+ { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 countNonZero(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width & ~3u; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u32* src = (const u32*)internal::getRowPtr( srcBase, srcStride, k); + u32 i = 0; + + uint32x4_t vc1 = vmovq_n_u32(1); + uint32x4_t vs = vmovq_n_u32(0); + + for (; i < roiw4; i += 4 ) + { + internal::prefetch(src + i); + uint32x4_t vln = vld1q_u32(src + i); + uint32x4_t vnz = vminq_u32(vln, vc1); + vs = vqaddq_u32(vs, vnz); + } + + uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + + s32 s[2]; + vst1_u32((u32*)s, vs2); + + if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 8GB of non-zeros... + { + return 0x7fFFffFF; + } + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0) + { + return 0x7fFFffFF; + } + + for (; i < size.width; i++) + result += (src[i] != 0)?1:0; + if (result < 0)//saturate in case of overflow ~ 8GB of non-zeros... + { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 countNonZero(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width & ~3u; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + float32x4_t vc0 = vmovq_n_f32(0); + int32x4_t vs = vmovq_n_s32(0); + + for (; i < roiw4; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t vln = vld1q_f32(src + i); + int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0))); + vs = vqaddq_s32(vs, vnz); + } + + int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs))); + + int s[2]; + vst1_s32(s, vs2); + + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros... 
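+ // How the f32 lane test above counts: vceqq_f32 yields all-ones for lanes equal
+ // to 0.0f and vmvnq_u32 inverts it, so every non-zero lane contributes the s32
+ // value -1 to `vs`; vqneg_s32 then flips the saturated negative totals back to
+ // positive counts. For a hypothetical row { 0.0f, 1.5f, -2.0f, 0.0f } the
+ // accumulator holds { 0, -1, -1, 0 } and the final count is 2.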
+ { + return 0x7fFFffFF; + } + + for (; i < size.width; i++) + result += (src[i] < std::numeric_limits<f32>::min() && src[i] > -std::numeric_limits<f32>::min())?0:1; + + if (result < 0) + { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 countNonZero(const Size2D &_size, + const f64 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width & ~7u; + size_t roiw4 = size.width & ~3u; + size_t roiw2 = size.width & ~1u; + uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero + uint32x4_t vc0 = vmovq_n_u32(0); + + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f64* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + int32x2_t vs1 = vmov_n_s32(0); + int32x2_t vs2 = vmov_n_s32(0); + int32x2_t vs3 = vmov_n_s32(0); + int32x2_t vs4 = vmov_n_s32(0); + + for (; i < roiw8; i += 8 ) + { + internal::prefetch(src + i + 6); + uint64x2_t vln1 = vld1q_u64((const u64*)(src + i)); + uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2)); + uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4)); + uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6)); + + uint64x2_t vm1 = vandq_u64(vln1, vmask1); + uint64x2_t vm2 = vandq_u64(vln2, vmask1); + uint64x2_t vm3 = vandq_u64(vln3, vmask1); + uint64x2_t vm4 = vandq_u64(vln4, vmask1); + + uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0); + uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0); + uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0); + uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0); + + uint32x4_t vlx1 = vmvnq_u32(vequ1); + uint32x4_t vlx2 = vmvnq_u32(vequ2); + uint32x4_t vlx3 = vmvnq_u32(vequ3); + uint32x4_t vlx4 = vmvnq_u32(vequ4); + + int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1))); + int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2))); + int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3))); + int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4))); + + vs1 = vqadd_s32(vs1, vnz1); + vs2 = vqadd_s32(vs2, vnz2); + vs3 = vqadd_s32(vs3, vnz3); + vs4 = vqadd_s32(vs4, vnz4); + } + + if (i < roiw4) + { + internal::prefetch(src + i + 2); + uint64x2_t vln1 = vld1q_u64((const u64*)(src + i)); + uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2)); + + uint64x2_t vm1 = vandq_u64(vln1, vmask1); + uint64x2_t vm2 = vandq_u64(vln2, vmask1); + + uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0); + uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0); + + uint32x4_t vlx1 = vmvnq_u32(vequ1); + uint32x4_t vlx2 = vmvnq_u32(vequ2); + + int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1))); + int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2))); + + vs1 = vqadd_s32(vs1, vnz1); + vs2 = vqadd_s32(vs2, vnz2); + i += 4; + } + + if (i < roiw2) + { + internal::prefetch(src + i); + uint64x2_t vln1 = vld1q_u64((const u64*)(src + i)); + + uint64x2_t vm1 = vandq_u64(vln1, vmask1); + + uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0); + + uint32x4_t vlx1 = vmvnq_u32(vequ1); + + int32x2_t vnz1 =
vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1))); + + vs1 = vqadd_s32(vs1, vnz1); + i += 2; + } + + vs1 = vqadd_s32(vs1, vs2); + vs3 = vqadd_s32(vs3, vs4); + vs1 = vqadd_s32(vs1, vs3); + int32x2_t vsneg = vqneg_s32(vs1); + + s32 s[2]; + vst1_s32(s, vsneg); + + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros... + { + return 0x7fFFffFF; + } + + for (; i < size.width; i++) + result += (src[i] < std::numeric_limits<f64>::min() && src[i] > -std::numeric_limits<f64>::min())?0:1; + if (result < 0) + { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/div.cpp b/3rdparty/carotene/src/div.cpp new file mode 100644 index 0000000000..9c03202a83 --- /dev/null +++ b/3rdparty/carotene/src/div.cpp @@ -0,0 +1,694 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2016, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage.
+ */ + +#include "common.hpp" +#include "vtransform.hpp" + +#include <cmath> +#include <cstring> +#include <cfloat> +#include <limits> + +namespace CAROTENE_NS { + +namespace { + +#ifdef CAROTENE_NEON + +template <typename T> +inline T divSaturateQ(const T &v1, const T &v2, const float scale) +{ + return internal::vcombine(internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)), + internal::vmovl(internal::vget_low(v2)), scale)), + internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)), + internal::vmovl(internal::vget_high(v2)), scale)) + ); +} +template <> +inline int32x4_t divSaturateQ(const int32x4_t &v1, const int32x4_t &v2, const float scale) +{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); } +template <> +inline uint32x4_t divSaturateQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale) +{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); } + +template <typename T> +inline T divSaturate(const T &v1, const T &v2, const float scale) +{ + return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale)); +} +template <> +inline int32x2_t divSaturate(const int32x2_t &v1, const int32x2_t &v2, const float scale) +{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); } +template <> +inline uint32x2_t divSaturate(const uint32x2_t &v1, const uint32x2_t &v2, const float scale) +{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); } + + +template <typename T> +inline T divWrapQ(const T &v1, const T &v2, const float scale) +{ + return internal::vcombine(internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)), + internal::vmovl(internal::vget_low(v2)), scale)), + internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)), + internal::vmovl(internal::vget_high(v2)), scale)) + ); +} +template <> +inline int32x4_t divWrapQ(const int32x4_t &v1, const int32x4_t &v2, const float scale) +{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); } +template <> +inline uint32x4_t divWrapQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale) +{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); } + +template <typename T> +inline T divWrap(const T &v1, const T &v2, const float scale) +{ + return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale)); +} +template <> +inline int32x2_t divWrap(const int32x2_t &v1, const int32x2_t &v2, const float scale) +{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); } +template <> +inline uint32x2_t divWrap(const uint32x2_t &v1, const uint32x2_t &v2, const float scale) +{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
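+ /*
+  * Shape of the helpers above (scalar model, an illustration rather than code
+  * used by the library): 8- and 16-bit vectors are widened recursively until the
+  * quotient can be formed as a float reciprocal multiply, then narrowed back
+  * with saturation (divSaturate*) or modulo wrapping (divWrap*). Per element,
+  * roughly:
+  *
+  *     template <typename T>
+  *     T div_one(T a, T b, f32 scale)  // hypothetical scalar equivalent
+  *     {
+  *         return internal::saturate_cast<T>(scale * (f32)a * (1.0f / (f32)b));
+  *     }
+  *
+  * internal::vrecpq_f32 / internal::vrecp_f32 are reciprocal approximations
+  * defined elsewhere in the library (not shown here), so the last bit of the
+  * quotient may differ from an exact IEEE division.
+  */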
+inline uint8x16_t vtstq(const uint8x16_t & v0, const uint8x16_t & v1) { return vtstq_u8 (v0, v1); } +inline uint16x8_t vtstq(const uint16x8_t & v0, const uint16x8_t & v1) { return vtstq_u16(v0, v1); } +inline uint32x4_t vtstq(const uint32x4_t & v0, const uint32x4_t & v1) { return vtstq_u32(v0, v1); } +inline int8x16_t vtstq(const int8x16_t & v0, const int8x16_t & v1) { return vreinterpretq_s8_u8 (vtstq_s8 (v0, v1)); } +inline int16x8_t vtstq(const int16x8_t & v0, const int16x8_t & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); } +inline int32x4_t vtstq(const int32x4_t & v0, const int32x4_t & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); } + +inline uint8x8_t vtst(const uint8x8_t & v0, const uint8x8_t & v1) { return vtst_u8 (v0, v1); } +inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); } +inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); } +inline int8x8_t vtst(const int8x8_t & v0, const int8x8_t & v1) { return vreinterpret_s8_u8 (vtst_s8 (v0, v1)); } +inline int16x4_t vtst(const int16x4_t & v0, const int16x4_t & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); } +inline int32x2_t vtst(const int32x2_t & v0, const int32x2_t & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); } +#endif + +template <typename T> +void div(const Size2D &size, + const T * src0Base, ptrdiff_t src0Stride, + const T * src1Base, ptrdiff_t src1Stride, + T * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + typedef typename internal::VecTraits<T>::vec128 vec128; + typedef typename internal::VecTraits<T>::vec64 vec64; + + if (scale == 0.0f || + (std::numeric_limits<T>::is_integer && + (scale * std::numeric_limits<T>::max()) < 1.0f && + (scale * std::numeric_limits<T>::max()) > -1.0f)) + { + for (size_t y = 0; y < size.height; ++y) + { + T * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(T) * size.width); + } + return; + } + + const size_t step128 = 16 / sizeof(T); + size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0; + const size_t step64 = 8 / sizeof(T); + size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const T * src1 = internal::getRowPtr(src1Base, src1Stride, i); + T * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + vec128 v_src0 = internal::vld1q(src0 + j); + vec128 v_src1 = internal::vld1q(src1 + j); + + vec128 v_mask = vtstq(v_src1,v_src1); + internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale))); + } + for (; j < roiw64; j += step64) + { + vec64 v_src0 = internal::vld1(src0 + j); + vec64 v_src1 = internal::vld1(src1 + j); + + vec64 v_mask = vtst(v_src1,v_src1); + internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale))); + } + for (; j < size.width; j++) + { + dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0; + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + vec128 v_src0 = internal::vld1q(src0 + j); + vec128 v_src1 = internal::vld1q(src1 + j); + + vec128 v_mask = vtstq(v_src1,v_src1); + internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale))); + } + for (; j < roiw64; j += step64) + { + vec64 v_src0 = internal::vld1(src0 + j); + vec64 v_src1 = internal::vld1(src1 + j); + + vec64 v_mask = vtst(v_src1,v_src1); + internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale))); + } + for (; j < size.width; j++) + { + dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
+ } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +#ifdef CAROTENE_NEON + +template <typename T> +inline T recipSaturateQ(const T &v2, const float scale) +{ + return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)), + internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale)) + ); +} +template <> +inline int32x4_t recipSaturateQ(const int32x4_t &v2, const float scale) +{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); } +template <> +inline uint32x4_t recipSaturateQ(const uint32x4_t &v2, const float scale) +{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); } + +template <typename T> +inline T recipSaturate(const T &v2, const float scale) +{ + return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale)); +} +template <> +inline int32x2_t recipSaturate(const int32x2_t &v2, const float scale) +{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); } +template <> +inline uint32x2_t recipSaturate(const uint32x2_t &v2, const float scale) +{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); } + + +template <typename T> +inline T recipWrapQ(const T &v2, const float scale) +{ + return internal::vcombine(internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)), + internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale)) + ); +} +template <> +inline int32x4_t recipWrapQ(const int32x4_t &v2, const float scale) +{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); } +template <> +inline uint32x4_t recipWrapQ(const uint32x4_t &v2, const float scale) +{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); } + +template <typename T> +inline T recipWrap(const T &v2, const float scale) +{ + return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale)); +} +template <> +inline int32x2_t recipWrap(const int32x2_t &v2, const float scale) +{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); } +template <> +inline uint32x2_t recipWrap(const uint32x2_t &v2, const float scale) +{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); } +#endif
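+ /*
+  * CONVERT_POLICY note (worked example, values chosen for illustration): with
+  * CONVERT_POLICY_SATURATE an out-of-range result clamps through the vqmovn
+  * narrowing chain, e.g. a quotient of 300 stored as s8 becomes 127; with
+  * CONVERT_POLICY_WRAP it is narrowed modulo 2^N through vmovn, so 300 becomes
+  * (s8)300 == 44. In both policies a zero divisor produces 0, because
+  * vtst(v_src1, v_src1) is all-zero exactly in the zero lanes and the result is
+  * ANDed with that mask.
+  */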
+ +template <typename T> +void recip(const Size2D &size, + const T * src1Base, ptrdiff_t src1Stride, + T * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + typedef typename internal::VecTraits<T>::vec128 vec128; + typedef typename internal::VecTraits<T>::vec64 vec64; + + if (scale == 0.0f || + (std::numeric_limits<T>::is_integer && + scale < 1.0f && + scale > -1.0f)) + { + for (size_t y = 0; y < size.height; ++y) + { + T * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(T) * size.width); + } + return; + } + + const size_t step128 = 16 / sizeof(T); + size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0; + const size_t step64 = 8 / sizeof(T); + size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src1 = internal::getRowPtr(src1Base, src1Stride, i); + T * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src1 + j); + + vec128 v_src1 = internal::vld1q(src1 + j); + + vec128 v_mask = vtstq(v_src1,v_src1); + internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale))); + } + for (; j < roiw64; j += step64) + { + vec64 v_src1 = internal::vld1(src1 + j); + + vec64 v_mask = vtst(v_src1,v_src1); + internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale))); + } + for (; j < size.width; j++) + { + dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0; + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src1 + j); + + vec128 v_src1 = internal::vld1q(src1 + j); + + vec128 v_mask = vtstq(v_src1,v_src1); + internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale))); + } + for (; j < roiw64; j += step64) + { + vec64 v_src1 = internal::vld1(src1 + j); + + vec64 v_mask = vtst(v_src1,v_src1); + internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale))); + } + for (; j < size.width; j++) + { + dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0; + } + } + } +#else + (void)size; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +} + +void div(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void div(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void div(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void div(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void div(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void div(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (scale == 0.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + f32 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(f32) * size.width); + } + return; + } + + float32x4_t v_zero = vdupq_n_f32(0.0f); + + size_t roiw128
= size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ? size.width - 1 : 0; + + if (std::fabs(scale - 1.0f) < FLT_EPSILON) + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + float32x4_t v_src0 = vld1q_f32(src0 + j); + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))), v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src0 = vld1_f32(src0 + j); + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(vmul_f32(v_src0, internal::vrecp_f32(v_src1))), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ? src0[j] / src1[j] : 0.0f; + } + } + } + else + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + float32x4_t v_src0 = vld1q_f32(src0 + j); + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(vmulq_f32(vmulq_n_f32(v_src0, scale), + internal::vrecpq_f32(v_src1))), v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src0 = vld1_f32(src0 + j); + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(vmul_f32(vmul_n_f32(v_src0, scale), + internal::vrecp_f32(v_src1))), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ? 
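+ // Note on this f32 path: the quotient is formed as src0 * vrecp(src1) rather
+ // than an IEEE divide (internal::vrecpq_f32 is a reciprocal approximation
+ // defined elsewhere in the library), and the vceq/vbic pair zeroes lanes whose
+ // divisor is 0.0f instead of returning +/-inf, matching the scalar tail's
+ // `src1[j] ? ... : 0.0f`.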
src0[j] * scale / src1[j] : 0.0f; + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +void reciprocal(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (scale == 0.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + f32 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(f32) * size.width); + } + return; + } + + float32x4_t v_zero = vdupq_n_f32(0.0f); + + size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ? size.width - 1 : 0; + + if (std::fabs(scale - 1.0f) < FLT_EPSILON) + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src1 + j); + + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(internal::vrecpq_f32(v_src1)), v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(internal::vrecp_f32(v_src1)), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ?
1.0f / src1[j] : 0; + } + } + } + else + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src1 + j); + + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(v_src1), + scale)),v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(vmul_n_f32(internal::vrecp_f32(v_src1), + scale)), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ? scale / src1[j] : 0; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/dot_product.cpp b/3rdparty/carotene/src/dot_product.cpp new file mode 100644 index 0000000000..1759ea7cd5 --- /dev/null +++ b/3rdparty/carotene/src/dot_product.cpp @@ -0,0 +1,260 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +f64 dotProduct(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + +// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow +// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements +#define DOT_UINT_BLOCKSIZE 66050*8 + f64 result = 0.0; + for (size_t row = 0; row < size.height; ++row) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row); + + size_t i = 0; + uint64x2_t ws = vmovq_n_u64(0); + + while(i + 16 <= size.width) + { + size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16; + + uint32x4_t s1 = vmovq_n_u32(0); + uint32x4_t s2 = vmovq_n_u32(0); + + for (; i <= lim; i += 16) + { + internal::prefetch(src0 + i); + internal::prefetch(src1 + i); + + uint8x16_t vs1 = vld1q_u8(src0 + i); + uint8x16_t vs2 = vld1q_u8(src1 + i); + + uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2)); + uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2)); + + s1 = vpadalq_u16(s1, vdot1); + s2 = vpadalq_u16(s2, vdot2); + } + + ws = vpadalq_u32(ws, s1); + ws = vpadalq_u32(ws, s2); + } + + if(i + 8 <= size.width) + { + uint8x8_t vs1 = vld1_u8(src0 + i); + uint8x8_t vs2 = vld1_u8(src1 + i); + + ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2))); + i += 8; + } + + result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0); + + for (; i < size.width; ++i) + result += s32(src0[i]) * s32(src1[i]); + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 dotProduct(const Size2D &_size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + +// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow +// We process 16 elements and accumulate two new elements per step. 
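/* each product has magnitude at most 128*128 = 16384, and 2^31 / 16384 = 131072, hence the 131071 bound */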
So we could handle 131071/2*16 elements +#define DOT_INT_BLOCKSIZE 131070*8 + f64 result = 0.0; + for (size_t row = 0; row < size.height; ++row) + { + const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row); + const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row); + + size_t i = 0; + int64x2_t ws = vmovq_n_s64(0); + + while(i + 16 <= size.width) + { + size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16; + + int32x4_t s1 = vmovq_n_s32(0); + int32x4_t s2 = vmovq_n_s32(0); + + for (; i <= lim; i += 16) + { + internal::prefetch(src0 + i); + internal::prefetch(src1 + i); + + int8x16_t vs1 = vld1q_s8(src0 + i); + int8x16_t vs2 = vld1q_s8(src1 + i); + + int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2)); + int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2)); + + s1 = vpadalq_s16(s1, vdot1); + s2 = vpadalq_s16(s2, vdot2); + } + + ws = vpadalq_s32(ws, s1); + ws = vpadalq_s32(ws, s2); + } + + if(i + 8 <= size.width) + { + int8x8_t vs1 = vld1_s8(src0 + i); + int8x8_t vs2 = vld1_s8(src1 + i); + + ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2))); + i += 8; + } + + result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0); + + for (; i < size.width; ++i) + result += s32(src0[i]) * s32(src1[i]); + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 dotProduct(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width * sizeof(f32))) + { + size.width *= size.height; + size.height = 1; + } + +#define DOT_FLOAT_BLOCKSIZE (1 << 13) + f64 result = 0.0; + for (size_t row = 0; row < size.height; ++row) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row); + + size_t i = 0; + while(i + 4 <= size.width) + { + size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4; + float32x4_t v_sum = vdupq_n_f32(0.0f); + + for( ; i <= lim; i += 4 ) + { + internal::prefetch(src0 + i); + internal::prefetch(src1 + i); + v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i)); + } + + float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum)); + result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); + } + + if(i + 2 <= size.width) + { + float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i)); + result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); + i += 2; + } + + for (; i < size.width; ++i) + result += src0[i] * src1[i]; + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/fast.cpp b/3rdparty/carotene/src/fast.cpp new file mode 100644 index 0000000000..9506c1b6be --- /dev/null +++ b/3rdparty/carotene/src/fast.cpp @@ -0,0 +1,428 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + + +/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten. + Below is the original copyright and the references */ + +/* +Copyright (c) 2006, 2008 Edward Rosten +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + *Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + *Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + *Neither the name of the University of Cambridge nor the names of + its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* +The references are: + * Machine learning for high-speed corner detection, + E. Rosten and T. Drummond, ECCV 2006 + * Faster and better: A machine learning approach to corner detection + E. Rosten, R. Porter and T. 
Drummond, PAMI, 2009 +*/ + +#include "common.hpp" + +#include +#include + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON +namespace +{ + +void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride) +{ + pixel[0] = 0 + row_stride * 3; + pixel[1] = 1 + row_stride * 3; + pixel[2] = 2 + row_stride * 2; + pixel[3] = 3 + row_stride * 1; + pixel[4] = 3 + row_stride * 0; + pixel[5] = 3 + row_stride * -1; + pixel[6] = 2 + row_stride * -2; + pixel[7] = 1 + row_stride * -3; + pixel[8] = 0 + row_stride * -3; + pixel[9] = -1 + row_stride * -3; + pixel[10] = -2 + row_stride * -2; + pixel[11] = -3 + row_stride * -1; + pixel[12] = -3 + row_stride * 0; + pixel[13] = -3 + row_stride * 1; + pixel[14] = -2 + row_stride * 2; + pixel[15] = -1 + row_stride * 3; +} + +u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[]) +{ + const s32 K = 8, N = 16 + K + 1; + s32 k, v = ptr[0]; + s16 d[(N + 7) & ~7]; + for( k = 0; k < N; k++ ) + d[k] = (s16)(v - ptr[pixel[k]]); + + int16x8_t q0 = vdupq_n_s16((s16)(-1000)); + int16x8_t q1 = vdupq_n_s16((s16)(1000)); + + int16x8_t d0_7 = vld1q_s16(d + 0); + int16x8_t d8_15 = vld1q_s16(d + 8); + int16x8_t d16_23 = vld1q_s16(d + 16); + int16x8_t d24 = vld1q_s16(d + 24); + + //k == 0 + int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1); + int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2); + int16x8_t ak0 = vminq_s16(v0k0, v1k0); + int16x8_t bk0 = vmaxq_s16(v0k0, v1k0); + + v0k0 = vextq_s16(d0_7, d8_15, 3); + ak0 = vminq_s16(ak0, v0k0); + bk0 = vmaxq_s16(bk0, v0k0); + + v1k0 = vextq_s16(d0_7, d8_15, 4); + ak0 = vminq_s16(ak0, v1k0); + bk0 = vmaxq_s16(bk0, v1k0); + + v0k0 = vextq_s16(d0_7, d8_15, 5); + ak0 = vminq_s16(ak0, v0k0); + bk0 = vmaxq_s16(bk0, v0k0); + + v1k0 = vextq_s16(d0_7, d8_15, 6); + ak0 = vminq_s16(ak0, v1k0); + bk0 = vmaxq_s16(bk0, v1k0); + + v0k0 = vextq_s16(d0_7, d8_15, 7); + ak0 = vminq_s16(ak0, v0k0); + bk0 = vmaxq_s16(bk0, v0k0); + + ak0 = vminq_s16(ak0, d8_15); + bk0 = vmaxq_s16(bk0, d8_15); + + q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7)); + q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7)); + + v1k0 = vextq_s16(d8_15, d16_23, 1); + q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0)); + q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0)); + + //k == 8 + int16x8_t v0k8 = v1k0; + int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2); + int16x8_t ak8 = vminq_s16(v0k8, v1k8); + int16x8_t bk8 = vmaxq_s16(v0k8, v1k8); + + v0k8 = vextq_s16(d8_15, d16_23, 3); + ak8 = vminq_s16(ak8, v0k8); + bk8 = vmaxq_s16(bk8, v0k8); + + v1k8 = vextq_s16(d8_15, d16_23, 4); + ak8 = vminq_s16(ak8, v1k8); + bk8 = vmaxq_s16(bk8, v1k8); + + v0k8 = vextq_s16(d8_15, d16_23, 5); + ak8 = vminq_s16(ak8, v0k8); + bk8 = vmaxq_s16(bk8, v0k8); + + v1k8 = vextq_s16(d8_15, d16_23, 6); + ak8 = vminq_s16(ak8, v1k8); + bk8 = vmaxq_s16(bk8, v1k8); + + v0k8 = vextq_s16(d8_15, d16_23, 7); + ak8 = vminq_s16(ak8, v0k8); + bk8 = vmaxq_s16(bk8, v0k8); + + ak8 = vminq_s16(ak8, d16_23); + bk8 = vmaxq_s16(bk8, d16_23); + + q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15)); + q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15)); + + v1k8 = vextq_s16(d16_23, d24, 1); + q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8)); + q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8)); + + //fin + int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1)); + int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q)); + int32x4_t q2w = vmovl_s16(q2); + int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w)); + int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32))); + + return (u8)(vget_lane_s32(q8, 0) - 1); +} + +} //namespace +#endif + +void FAST(const Size2D &size, + u8 *srcBase, 
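/* 8-bit grayscale input */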
ptrdiff_t srcStride, + KeypointStore *keypoints, + u8 threshold, bool nonmax_suppression) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + //keypoints.clear(); + + const s32 K = 8, N = 16 + K + 1; + ptrdiff_t i, j, k, pixel[N]; + makeOffsets(pixel, srcStride); + for(k = 16; k < N; k++) + pixel[k] = pixel[k - 16]; + + uint8x16_t delta = vdupq_n_u8(128); + uint8x16_t t = vdupq_n_u8(threshold); + uint8x16_t K16 = vdupq_n_u8((u8)K); + + u8 threshold_tab[512]; + for( i = -255; i <= 255; i++ ) + threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0); + + std::vector _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128); + u8* buf[3]; + buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width; + ptrdiff_t* cpbuf[3]; + cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1; + cpbuf[1] = cpbuf[0] + size.width + 1; + cpbuf[2] = cpbuf[1] + size.width + 1; + memset(buf[0], 0, size.width*3); + + for(i = 3; i < (ptrdiff_t)size.height-2; i++) + { + const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3; + u8* curr = buf[(i - 3)%3]; + ptrdiff_t* cornerpos = cpbuf[(i - 3)%3]; + memset(curr, 0, size.width); + ptrdiff_t ncorners = 0; + + if( i < (ptrdiff_t)size.height - 3 ) + { + j = 3; + + for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16) + { + internal::prefetch(ptr); + internal::prefetch(ptr + pixel[0]); + internal::prefetch(ptr + pixel[2]); + + uint8x16_t v0 = vld1q_u8(ptr); + int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta)); + int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta)); + + int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta)); + int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta)); + int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta)); + int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta)); + + uint8x16_t m0 = vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2)); + uint8x16_t m1 = vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1)); + m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2))); + m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2))); + m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2))); + m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3))); + m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2))); + m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0))); + m0 = vorrq_u8(m0, m1); + + u64 mask[2]; + vst1q_u64(mask, vreinterpretq_u64_u8(m0)); + + if( mask[0] == 0 ) + { + if (mask[1] != 0) + { + j -= 8; + ptr -= 8; + } + continue; + } + + uint8x16_t c0 = vmovq_n_u8(0); + uint8x16_t c1 = vmovq_n_u8(0); + uint8x16_t max0 = vmovq_n_u8(0); + uint8x16_t max1 = vmovq_n_u8(0); + for( k = 0; k < N; k++ ) + { + int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta)); + m0 = vcgtq_s8(x, v2); + m1 = vcgtq_s8(v1, x); + + c0 = vandq_u8(vsubq_u8(c0, m0), m0); + c1 = vandq_u8(vsubq_u8(c1, m1), m1); + + max0 = vmaxq_u8(max0, c0); + max1 = vmaxq_u8(max1, c1); + } + + max0 = vmaxq_u8(max0, max1); + u8 m[16]; + vst1q_u8(m, vcgtq_u8(max0, K16)); + + for( k = 0; k < 16; ++k ) + if(m[k]) + { + cornerpos[ncorners++] = j+k; + if(nonmax_suppression) + curr[j+k] = cornerScore(ptr+k, pixel); + } + } + + for( ; j < (s32)size.width - 3; j++, ptr++ ) + { + s32 v = ptr[0]; + const u8* tab = &threshold_tab[0] - v + 255; + s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]]; + + if( d == 0 ) + 
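/* a run of 9 contiguous circle pixels must include pixel 0 or 8, so if both match the centre no corner is possible */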
continue; + + d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]]; + d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]]; + d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]]; + + if( d == 0 ) + continue; + + d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]]; + d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]]; + d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]]; + d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]]; + + if( d & 1 ) + { + s32 vt = v - threshold, count = 0; + + for( k = 0; k < N; k++ ) + { + s32 x = ptr[pixel[k]]; + if(x < vt) + { + if( ++count > K ) + { + cornerpos[ncorners++] = j; + if(nonmax_suppression) + curr[j] = cornerScore(ptr, pixel); + break; + } + } + else + count = 0; + } + } + + if( d & 2 ) + { + s32 vt = v + threshold, count = 0; + + for( k = 0; k < N; k++ ) + { + s32 x = ptr[pixel[k]]; + if(x > vt) + { + if( ++count > K ) + { + cornerpos[ncorners++] = j; + if(nonmax_suppression) + curr[j] = cornerScore(ptr, pixel); + break; + } + } + else + count = 0; + } + } + } + } + + cornerpos[-1] = ncorners; + + if( i == 3 ) + continue; + + const u8* prev = buf[(i - 4 + 3)%3]; + const u8* pprev = buf[(i - 5 + 3)%3]; + cornerpos = cpbuf[(i - 4 + 3)%3]; + ncorners = cornerpos[-1]; + + for( k = 0; k < ncorners; k++ ) + { + j = cornerpos[k]; + s32 score = prev[j]; + if( !nonmax_suppression || + (score > prev[j+1] && score > prev[j-1] && + score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] && + score > curr[j-1] && score > curr[j] && score > curr[j+1]) ) + { + keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score); + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)keypoints; + (void)threshold; + (void)nonmax_suppression; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/fill_minmaxloc.cpp b/3rdparty/carotene/src/fill_minmaxloc.cpp new file mode 100644 index 0000000000..fdf0e35d03 --- /dev/null +++ b/3rdparty/carotene/src/fill_minmaxloc.cpp @@ -0,0 +1,442 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template +void process(const T * src, size_t j0, size_t j1, size_t i, + T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + for (size_t j = j0; j < j1; ++j) + { + T val = src[j]; + + if (val == maxVal) + { + if (maxLocCount < maxLocCapacity) + { + maxLocPtr[maxLocCount] = j; + maxLocPtr[maxLocCount + 1] = i; + } + maxLocCount += 2; + } + + if (val == minVal) + { + if (minLocCount < minLocCapacity) + { + minLocPtr[minLocCount] = j; + minLocPtr[minLocCount + 1] = i; + } + minLocCount += 2; + } + } +} + +} // namespace + +#endif + +void fillMinMaxLocs(const Size2D & size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal); + uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal); + + u64 mask[2] = { 0ul }; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint8x16_t v_src = vld1q_u8(src + j); + + uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16); + uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16); + uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask); + + vst1q_u8((u8 *)&mask[0], v_mask); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + if (mask[1]) + process(src, j + 8, j + 16, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + for ( ; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + + uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8); + uint8x8_t v_minmask = vceq_u8(v_src, v_minval8); + uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask); + + vst1_u8((u8 *)&mask[0], v_mask); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +void fillMinMaxLocs(const Size2D & size, + const u16 * srcBase, ptrdiff_t 
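/* row stride in bytes */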
srcStride, + u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint16x8_t v_maxval8 = vdupq_n_u16(maxVal), + v_minval8 = vdupq_n_u16(minVal); + u64 mask[2] = { 0ul }; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const u16 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8); + + uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8)); + uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8)); + + vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1))); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + if (mask[1]) + process(src, j + 8, j + 16, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + uint16x8_t v_src = vld1q_u16(src + j); + + uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8); + uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8); + uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask); + + vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask)); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +void fillMinMaxLocs(const Size2D & size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
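/* largest start index that still leaves a full 8-lane load in range */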
size.width - 7 : 0; + + int16x8_t v_maxval8 = vdupq_n_s16(maxVal), + v_minval8 = vdupq_n_s16(minVal); + u64 mask[2] = { 0ul }; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const s16 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8); + + uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8)); + uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8)); + + vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1))); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + if (mask[1]) + process(src, j + 8, j + 16, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int16x8_t v_src = vld1q_s16(src + j); + + uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8); + uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8); + uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask); + + vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask)); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +void fillMinMaxLocs(const Size2D & size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? 
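/* the guard keeps size.width - 7 from wrapping when the row is narrow */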
size.width - 7 : 0; + + int32x4_t v_maxval4 = vdupq_n_s32(maxVal), + v_minval4 = vdupq_n_s32(minVal); + u64 mask = 0ul; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const s32 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4); + + uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4)); + uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4)); + + vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1)))); + + if (mask) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +void fillMinMaxLocs(const Size2D & size, + const u32 * srcBase, ptrdiff_t srcStride, + u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint32x4_t v_maxval4 = vdupq_n_u32(maxVal), + v_minval4 = vdupq_n_u32(minVal); + u64 mask = 0ul; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const u32 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4); + + uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4)); + uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4)); + + vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1)))); + + if (mask) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/flip.cpp b/3rdparty/carotene/src/flip.cpp new file mode 100644 index 0000000000..339398dd92 --- /dev/null +++ b/3rdparty/carotene/src/flip.cpp @@ -0,0 +1,222 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +#include + +namespace CAROTENE_NS { + +bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize) +{ + bool supportedElemSize = (elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4); + return isSupportedConfiguration() && + ((supportedElemSize && ((flipMode == FLIP_BOTH_MODE) || (flipMode == FLIP_HORIZONTAL_MODE))) || + (flipMode == FLIP_VERTICAL_MODE)); +} + +#ifdef CAROTENE_NEON + +namespace { + +template +void flip(const Size2D & size, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode) +{ + using namespace internal; + + typedef typename VecTraits::vec128 vec128; + typedef typename VecTraits::vec64 vec64; + + u32 step_base = 16 / sizeof(T), step_tail = 8 / sizeof(T); + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; + size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src = getRowPtr((const T *)srcBase, srcStride, i); + T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? 
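/* FLIP_VERTICAL selects the mirrored destination row */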
size.height - i - 1 : i); + size_t js = 0, jd = size.width; + + for (; js < roiw_base; js += step_base, jd -= step_base) + { + prefetch(src + js); + + vec128 v_src = vld1q(src + js); + vec128 v_dst = vrev64q(v_src); + v_dst = vcombine(vget_high(v_dst), vget_low(v_dst)); + vst1q(dst + jd - step_base, v_dst); + } + for (; js < roiw_tail; js += step_tail, jd -= step_tail) + { + vec64 v_src = vld1(src + js); + vst1(dst + jd - step_tail, vrev64(v_src)); + } + + for (--jd; js < size.width; ++js, --jd) + dst[jd] = src[js]; + } +} + +template +void flip3(const Size2D & size, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode) +{ + using namespace internal; + +#ifndef ANDROID + typedef typename VecTraits::vec128 vec128; +#endif + typedef typename VecTraits::vec64 vec64; + +#ifndef ANDROID + u32 step_base = 16 / sizeof(T), step_base3 = step_base * 3; + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; +#endif + u32 step_tail = 8 / sizeof(T), step_tail3 = step_tail * 3; + size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src = getRowPtr((const T *)srcBase, srcStride, i); + T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i); + size_t j = 0, js = 0, jd = size.width * 3; + +#ifndef ANDROID + for (; j < roiw_base; j += step_base, js += step_base3, jd -= step_base3) + { + prefetch(src + js); + + vec128 v_src = vld3q(src + js), v_dst; + v_src.val[0] = vrev64q(v_src.val[0]); + v_src.val[1] = vrev64q(v_src.val[1]); + v_src.val[2] = vrev64q(v_src.val[2]); + + v_dst.val[0] = vcombine(vget_high(v_src.val[0]), vget_low(v_src.val[0])); + v_dst.val[1] = vcombine(vget_high(v_src.val[1]), vget_low(v_src.val[1])); + v_dst.val[2] = vcombine(vget_high(v_src.val[2]), vget_low(v_src.val[2])); + + vst3q(dst + jd - step_base3, v_dst); + } +#endif // ANDROID + + for (; j < roiw_tail; j += step_tail, js += step_tail3, jd -= step_tail3) + { + vec64 v_src = vld3(src + js), v_dst; + v_dst.val[0] = vrev64(v_src.val[0]); + v_dst.val[1] = vrev64(v_src.val[1]); + v_dst.val[2] = vrev64(v_src.val[2]); + + vst3(dst + jd - step_tail3, v_dst); + } + + for (jd -= 3; j < size.width; ++j, js += 3, jd -= 3) + { + dst[jd] = src[js]; + dst[jd + 1] = src[js + 1]; + dst[jd + 2] = src[js + 2]; + } + } +} + +typedef void (* flipFunc)(const Size2D &size, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode); + +} // namespace + +#endif + +void flip(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode, u32 elemSize) +{ + internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize)); +#ifdef CAROTENE_NEON + + if (flipMode == FLIP_VERTICAL_MODE) + { + for (size_t y = 0; y < size.height; ++y) + { + const u8 * src_row = internal::getRowPtr(srcBase, srcStride, y); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, size.height - y - 1); + + std::memcpy(dst_row, src_row, elemSize * size.width); + } + return; + } + + flipFunc func = NULL; + + if (elemSize == (u32)sizeof(u8)) + func = &flip; + if (elemSize == (u32)sizeof(u16)) + func = &flip; + if (elemSize == (u32)sizeof(u32)) + func = &flip; + if (elemSize == (u32)sizeof(u8) * 3) + func = &flip3; + + if (func == NULL) + return; + + func(size, + srcBase, srcStride, + dstBase, dstStride, + flipMode); + +#else + (void)size; + 
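/* the void casts silence unused-argument warnings when NEON is compiled out */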
(void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)flipMode; + (void)elemSize; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/gaussian_blur.cpp b/3rdparty/carotene/src/gaussian_blur.cpp new file mode 100644 index 0000000000..069373e419 --- /dev/null +++ b/3rdparty/carotene/src/gaussian_blur.cpp @@ -0,0 +1,1059 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "saturate_cast.hpp" +#include "separable_filter.hpp" + +namespace CAROTENE_NS { + +bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 8 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE); +} + +void gaussianBlur3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isGaussianBlur3x3Supported(size, border)); +#ifdef CAROTENE_NEON + const uint16x8_t v_border_x4 = vdupq_n_u16(borderValue << 2); + const uint16x8_t v_zero = vdupq_n_u16(0); + const uint8x8_t v_border = vdup_n_u8(borderValue); + + uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero; + uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero; + + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? 
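/* a NULL row pointer stands for the constant border */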
NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + s16 prevx = 0, currx = 0, nextx = 0; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8); + + // perform vertical convolution + for ( ; x <= bwidth; x += 8) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 8 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx = borderValue; + else + prevx = (srow2 ? srow2[x4] : borderValue) + (srow1[x4] << 1) + (srow0 ? srow0[x4] : borderValue); + + currx = (srow2 ? srow2[x3] : borderValue) + (srow1[x3] << 1) + (srow0 ? srow0[x3] : borderValue); + } + + // make shift + if (x) + { + tprev = tcurr; + tcurr = tnext; + } + + // and calculate next value + tnext = vaddq_u16(vaddl_u8(x0, x2), vshll_n_u8(x1, 1)); + + // make extrapolation for the first elements + if (!x) + { + // make border + if (border == BORDER_MODE_CONSTANT) + tcurr = v_border_x4; + else if (border == BORDER_MODE_REPLICATE) + tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0)); + + continue; + } + + // combine 3 "shifted" vectors + t0 = vextq_u16(tprev, tcurr, 7); + t1 = tcurr; + t2 = vextq_u16(tcurr, tnext, 1); + + // and add them + t0 = vqaddq_u16(vshlq_n_u16(t1, 1), vqaddq_u16(t0, t2)); + vst1_u8(drow + x - 8, vshrn_n_u16(t0, 4)); + } + + x -= 8; + if (x == width) + --x; + + for ( ; x < width; ++x) + { + // make extrapolation for the last elements + if (x + 1 >= width) + { + if (border == BORDER_MODE_CONSTANT) + nextx = borderValue << 2; + else if (border == BORDER_MODE_REPLICATE) + nextx = srow2[x] + (srow1[x] << 1) + srow0[x]; + } + else + nextx = (srow2 ? srow2[x + 1] : borderValue) + + (srow1[x + 1] << 1) + + (srow0 ? 
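/* absent rows contribute borderValue */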
srow0[x + 1] : borderValue); + + f32 val = (prevx + (currx << 1) + nextx) >> 4; + drow[x] = internal::saturate_cast((s32)val); + + // make shift + prevx = currx; + currx = nextx; + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin) +{ + return isSeparableFilter3x3Supported(size, border, 0, 0, borderMargin); +} + +void gaussianBlur3x3Margin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur3x3MarginSupported(size, border, borderMargin)); +#ifdef CAROTENE_NEON + internal::sepFilter3x3::process( + size, srcBase, srcStride, dstBase, dstStride, + 0, 0, border, borderValue, borderMargin); +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border) +{ + return isSupportedConfiguration() && + cn > 0 && cn <= 4 && + size.width >= 8 && size.height >= 2 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REPLICATE || + border == BORDER_MODE_WRAP); +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector _tmp; + u8 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector _buf(cn * (size.width + 4) + 32 / sizeof(u16)); + u16* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + uint16x8_t vc4u16 = vmovq_n_u16(4); + + for (size_t i = 0; i < size.height; ++i) + { + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const u8* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top 
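/* rows interpolated outside the margin fall back to the borderValue scratch line tmp */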
? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const u8* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const u8* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const u8* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v1 = vld1_u8(ln1+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v3 = vld1_u8(ln3+x); + uint8x8_t v4 = vld1_u8(ln4+x); + + uint16x8_t v = vaddl_u8(v0, v4); + uint16x8_t v13 = vaddl_u8(v1, v3); + + v = vmlal_u8(v, v2, vc6u8); + v = vmlaq_u16(v, v13, vc4u16); + + vst1q_u16(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + u16(4) * (ln1[x] + ln3[x]) + u16(6) * ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x <= colsn - 8; x += 8) + { + internal::prefetch(lane + x); + + uint16x8_t lane0 = vld1q_u16(lane + x - 2); + uint16x8_t lane4 = vld1q_u16(lane + x + 2); + uint16x8_t lane1 = vld1q_u16(lane + x - 1); + uint16x8_t lane3 = vld1q_u16(lane + x + 1); + uint16x8_t lane2 = vld1q_u16(lane + x + 0); + + uint16x8_t ln04 = vaddq_u16(lane0, lane4); + uint16x8_t ln13 = vaddq_u16(lane1, lane3); + + uint16x8_t ln042 = vmlaq_u16(ln04, lane2, vc6u16); + uint16x8_t lsw = vmlaq_u16(ln042, ln13, vc4u16); + + uint8x8_t ls = vrshrn_n_u16(lsw, 8); + + vst1_u8(dst + x, ls); + } + break; + case 2: + for (; x <= colsn - 8*2; x += 8*2) + { + internal::prefetch(lane + x); + + u16* lidx0 = lane + x - 2*2; + u16* lidx1 = lane + x - 1*2; + u16* lidx3 = lane + x + 1*2; + u16* lidx4 = lane + x + 2*2; +#if __GNUC_MINOR__ < 7 + __asm__ __volatile__ ( + "vld2.16 {d0, d2}, [%[in0]]! \n\t" + "vld2.16 {d1, d3}, [%[in0]] \n\t" + "vld2.16 {d8, d10}, [%[in4]]! \n\t" + "vld2.16 {d9, d11}, [%[in4]] \n\t" + "vadd.i16 q0, q4 \n\t" + "vadd.i16 q1, q5 \n\t" + "vld2.16 {d16, d18}, [%[in1]]! \n\t" + "vld2.16 {d17, d19}, [%[in1]] \n\t" + "vld2.16 {d8, d10}, [%[in3]]! 
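 @ +1 tap: channel 0 into q4, channel 1 into q5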
\n\t" + "vld2.16 {d9, d11}, [%[in3]] \n\t" + "vadd.i16 q4, q8 \n\t" + "vadd.i16 q5, q9 \n\t" + "vld2.16 {d16, d18}, [%[in2]] \n\t" + "vld2.16 {d17, d19}, [%[in22]] \n\t" + "vmla.i16 q0, q4, %q[c4] \n\t" + "vmla.i16 q1, q5, %q[c4] \n\t" + "vmla.i16 q0, q8, %q[c6] \n\t" + "vmla.i16 q1, q9, %q[c6] \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vrshrn.u16 d9, q1, #8 \n\t" + "vst2.8 {d8-d9}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*2), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8x2_t vLane0 = vld2q_u16(lidx0); + uint16x8x2_t vLane1 = vld2q_u16(lidx1); + uint16x8x2_t vLane2 = vld2q_u16(lane + x); + uint16x8x2_t vLane3 = vld2q_u16(lidx3); + uint16x8x2_t vLane4 = vld2q_u16(lidx4); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]); + + uint16x8_t vSum_4_8 = vaddq_u16(vLane1.val[0], vLane3.val[0]); + uint16x8_t vSum_5_9 = vaddq_u16(vLane1.val[1], vLane3.val[1]); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16); + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16); + + uint8x8x2_t vRes; + vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8); + vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8); + vst2_u8(dst + x, vRes); +#endif + } + break; + case 3: + for (; x <= colsn - 8*3; x += 8*3) + { + internal::prefetch(lane + x); + + u16* lidx0 = lane + x - 2*3; + u16* lidx1 = lane + x - 1*3; + u16* lidx3 = lane + x + 1*3; + u16* lidx4 = lane + x + 2*3; +#if defined(__GNUC__) && defined(__arm__) + __asm__ __volatile__ ( + "vld3.16 {d0, d2, d4}, [%[in0]]! \n\t" + "vld3.16 {d1, d3, d5}, [%[in0]] \n\t" + "vld3.16 {d8, d10, d12}, [%[in4]]! \n\t" + "vld3.16 {d9, d11, d13}, [%[in4]] \n\t" + "vadd.i16 q0, q4 \n\t" + "vadd.i16 q1, q5 \n\t" + "vadd.i16 q2, q6 \n\t" + "vld3.16 {d16, d18, d20}, [%[in1]]! \n\t" + "vld3.16 {d17, d19, d21}, [%[in1]] \n\t" + "vld3.16 {d8, d10, d12}, [%[in3]]! 
\n\t" + "vld3.16 {d9, d11, d13}, [%[in3]] \n\t" + "vadd.i16 q4, q8 \n\t" + "vadd.i16 q5, q9 \n\t" + "vadd.i16 q6, q10 \n\t" + "vld3.16 {d16, d18, d20}, [%[in2]] \n\t" + "vld3.16 {d17, d19, d21}, [%[in22]] \n\t" + "vmla.i16 q0, q4, %q[c4] \n\t" + "vmla.i16 q1, q5, %q[c4] \n\t" + "vmla.i16 q2, q6, %q[c4] \n\t" + "vmla.i16 q0, q8, %q[c6] \n\t" + "vmla.i16 q1, q9, %q[c6] \n\t" + "vmla.i16 q2, q10, %q[c6] \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vrshrn.u16 d9, q1, #8 \n\t" + "vrshrn.u16 d10, q2, #8 \n\t" + "vst3.8 {d8-d10}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*3), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8x3_t vLane0 = vld3q_u16(lidx0); + uint16x8x3_t vLane1 = vld3q_u16(lidx1); + uint16x8x3_t vLane2 = vld3q_u16(lane + x); + uint16x8x3_t vLane3 = vld3q_u16(lidx3); + uint16x8x3_t vLane4 = vld3q_u16(lidx4); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]); + uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane4.val[2]); + + uint16x8_t vSum_3_1 = vaddq_u16(vLane3.val[0], vLane1.val[0]); + uint16x8_t vSum_4_2 = vaddq_u16(vLane3.val[1], vLane1.val[1]); + uint16x8_t vSum_5_6 = vaddq_u16(vLane3.val[2], vLane1.val[2]); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_3_1, vc4u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_4_2, vc4u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_5_6, vc4u16); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vLane2.val[2], vc6u16); + + uint8x8x3_t vRes; + vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8); + vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8); + vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8); + + vst3_u8(dst + x, vRes); +#endif + } + break; + case 4: + for (; x <= colsn - 8*4; x += 8*4) + { + internal::prefetch(lane + x); + internal::prefetch(lane + x + 16); + + u16* lidx0 = lane + x - 2*4; + u16* lidx1 = lane + x - 1*4; + u16* lidx3 = lane + x + 1*4; + u16* lidx4 = lane + x + 2*4; +#if defined(__GNUC__) && defined(__arm__) + __asm__ __volatile__ ( + "vld4.16 {d0, d2, d4, d6}, [%[in0]]! \n\t" + "vld4.16 {d1, d3, d5, d7}, [%[in0]] \n\t" + "vld4.16 {d8, d10, d12, d14}, [%[in4]]! \n\t" + "vld4.16 {d9, d11, d13, d15}, [%[in4]] \n\t" + "vadd.i16 q0, q4 \n\t" + "vadd.i16 q1, q5 \n\t" + "vadd.i16 q2, q6 \n\t" + "vadd.i16 q3, q7 \n\t" + "vld4.16 {d16, d18, d20, d22}, [%[in1]]! \n\t" + "vld4.16 {d17, d19, d21, d23}, [%[in1]] \n\t" + "vld4.16 {d8, d10, d12, d14}, [%[in3]]! 
\n\t" + "vld4.16 {d9, d11, d13, d15}, [%[in3]] \n\t" + "vadd.i16 q4, q8 \n\t" + "vadd.i16 q5, q9 \n\t" + "vadd.i16 q6, q10 \n\t" + "vadd.i16 q7, q11 \n\t" + "vld4.16 {d16, d18, d20, d22}, [%[in2],:256] \n\t" + "vld4.16 {d17, d19, d21, d23}, [%[in22],:256] \n\t" + "vmla.i16 q0, q4, %q[c4] \n\t" + "vmla.i16 q1, q5, %q[c4] \n\t" + "vmla.i16 q2, q6, %q[c4] \n\t" + "vmla.i16 q3, q7, %q[c4] \n\t" + "vmla.i16 q0, q8, %q[c6] \n\t" + "vmla.i16 q1, q9, %q[c6] \n\t" + "vmla.i16 q2, q10, %q[c6] \n\t" + "vmla.i16 q3, q11, %q[c6] \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vrshrn.u16 d9, q1, #8 \n\t" + "vrshrn.u16 d10, q2, #8 \n\t" + "vrshrn.u16 d11, q3, #8 \n\t" + "vst4.8 {d8-d11}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*4), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8x4_t vLane0 = vld4q_u16(lidx0); + uint16x8x4_t vLane2 = vld4q_u16(lidx4); + uint16x8x4_t vLane4 = vld4q_u16(lidx1); + uint16x8x4_t vLane6 = vld4q_u16(lidx3); + uint16x8x4_t vLane8 = vld4q_u16(lane + x); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane2.val[0]); + uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane2.val[1]); + uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane2.val[2]); + uint16x8_t vSum_3_7 = vaddq_u16(vLane0.val[3], vLane2.val[3]); + + uint16x8_t vSum_4_8 = vaddq_u16(vLane4.val[0], vLane6.val[0]); + uint16x8_t vSum_5_9 = vaddq_u16(vLane4.val[1], vLane6.val[1]); + uint16x8_t vSum_6_10 = vaddq_u16(vLane4.val[2], vLane6.val[2]); + uint16x8_t vSum_7_11 = vaddq_u16(vLane4.val[3], vLane6.val[3]); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_6_10, vc4u16); + vSum_3_7 = vmlaq_u16(vSum_3_7, vSum_7_11, vc4u16); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane8.val[0], vc6u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vLane8.val[1], vc6u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vLane8.val[2], vc6u16); + vSum_3_7 = vmlaq_u16(vSum_3_7, vLane8.val[3], vc6u16); + + uint8x8x4_t vRes; + vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8); + vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8); + vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8); + vRes.val[3] = vrshrn_n_u16(vSum_3_7, 8); + + vst4_u8(dst + x, vRes); +#endif + } + break; + } + for (s32 h = 0; h < cn; ++h) + { + u16* ln = lane + h; + u8* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = (u8)((ln[k-2*cn] + ln[k+2*cn] + + u16(4) * (ln[k-cn] + ln[k+cn]) + + u16(6) * ln[k] + (1 << 7)) >> 8); + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u16 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector _tmp; + u16 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t 
idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector _buf(cn * (size.width + 4) + 32 / sizeof(u32)); + u32* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + uint16x4_t vc6u16 = vmov_n_u16(6); + uint32x4_t vc6u32 = vmovq_n_u32(6); + uint32x4_t vc4u32 = vmovq_n_u32(4); + + for (size_t i = 0; i < size.height; ++i) + { + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const u16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const u16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const u16* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const u16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const u16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? 
internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + uint16x4_t v0 = vld1_u16(ln0+x); + uint16x4_t v1 = vld1_u16(ln1+x); + uint16x4_t v2 = vld1_u16(ln2+x); + uint16x4_t v3 = vld1_u16(ln3+x); + uint16x4_t v4 = vld1_u16(ln4+x); + + uint32x4_t v = vaddl_u16(v0, v4); + uint32x4_t v13 = vaddl_u16(v1, v3); + + v = vmlal_u16(v, v2, vc6u16); + v = vmlaq_u32(v, v13, vc4u32); + + vst1q_u32(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + uint32x4_t lane0 = vld1q_u32(lane + x - 2); + uint32x4_t lane4 = vld1q_u32(lane + x + 2); + uint32x4_t lane1 = vld1q_u32(lane + x - 1); + uint32x4_t lane3 = vld1q_u32(lane + x + 1); + uint32x4_t lane2 = vld1q_u32(lane + x + 0); + + uint32x4_t ln04 = vaddq_u32(lane0, lane4); + uint32x4_t ln13 = vaddq_u32(lane1, lane3); + + uint32x4_t ln042 = vmlaq_u32(ln04, lane2, vc6u32); + uint32x4_t lsw = vmlaq_u32(ln042, ln13, vc4u32); + + uint16x4_t ls = vrshrn_n_u32(lsw, 8); + + vst1_u16(dst + x, ls); + } + for (s32 h = 0; h < cn; ++h) + { + u32* ln = lane + h; + u16* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = (u16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8); + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s16 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector _tmp; + s16 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector _buf(cn * (size.width + 4) + 32 / sizeof(s32)); + s32* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + int16x4_t vc6s16 = vmov_n_s16(6); + int32x4_t vc6s32 = vmovq_n_s32(6); + int32x4_t vc4s32 = vmovq_n_s32(4); + + for (size_t i = 0; i < size.height; ++i) + { + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, 
borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const s16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const s16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const s16* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const s16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const s16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + int16x4_t v0 = vld1_s16(ln0+x); + int16x4_t v1 = vld1_s16(ln1+x); + int16x4_t v2 = vld1_s16(ln2+x); + int16x4_t v3 = vld1_s16(ln3+x); + int16x4_t v4 = vld1_s16(ln4+x); + + int32x4_t v = vaddl_s16(v0, v4); + int32x4_t v13 = vaddl_s16(v1, v3); + + v = vmlal_s16(v, v2, vc6s16); + v = vmlaq_s32(v, v13, vc4s32); + + vst1q_s32(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + case 2: + case 3: + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + int32x4_t lane0 = vld1q_s32(lane + x - 2); + int32x4_t lane4 = vld1q_s32(lane + x + 2); + int32x4_t lane1 = vld1q_s32(lane + x - 1); + int32x4_t lane3 = vld1q_s32(lane + x + 1); + int32x4_t lane2 = vld1q_s32(lane + x + 0); + + int32x4_t ln04 = vaddq_s32(lane0, lane4); + int32x4_t ln13 = vaddq_s32(lane1, lane3); + + int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32); + int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32); + + int16x4_t ls = vrshrn_n_s32(lsw, 8); + + vst1_s16(dst + x, ls); + } + break; + case 4: +/* for (; x <= colsn - 4*4; x += 4*4) + { + internal::prefetch(lane + x); + internal::prefetch(lane + x + 16); + + ptrdiff_t* lidx0 = lane + x - 2*4; + ptrdiff_t* lidx1 = lane + x - 1*4; + ptrdiff_t* lidx3 = lane + x + 1*4; + ptrdiff_t* lidx4 = lane + x + 2*4; + + __asm__ __volatile__ ( + "vld4.32 {d0, d2, d4, d6}, [%[in0]]! \n\t" + "vld4.32 {d1, d3, d5, d7}, [%[in0]] \n\t" + "vld4.32 {d8, d10, d12, d14}, [%[in4]]! \n\t" + "vld4.32 {d9, d11, d13, d15}, [%[in4]] \n\t" + "vadd.i32 q0, q4 \n\t" + "vadd.i32 q1, q5 \n\t" + "vadd.i32 q2, q6 \n\t" + "vadd.i32 q3, q7 \n\t" + "vld4.32 {d16, d18, d20, d22}, [%[in1]]! \n\t" + "vld4.32 {d17, d19, d21, d23}, [%[in1]] \n\t" + "vld4.32 {d8, d10, d12, d14}, [%[in3]]! 
\n\t" + "vld4.32 {d9, d11, d13, d15}, [%[in3]] \n\t" + "vadd.i32 q4, q8 \n\t" + "vadd.i32 q5, q9 \n\t" + "vadd.i32 q6, q10 \n\t" + "vadd.i32 q7, q11 \n\t" + "vld4.32 {d16, d18, d20, d22}, [%[in2],:256] \n\t" + "vld4.32 {d17, d19, d21, d23}, [%[in22],:256] \n\t" + "vmla.i32 q0, q4, %q[c4] \n\t" + "vmla.i32 q1, q5, %q[c4] \n\t" + "vmla.i32 q2, q6, %q[c4] \n\t" + "vmla.i32 q3, q7, %q[c4] \n\t" + "vmla.i32 q0, q8, %q[c6] \n\t" + "vmla.i32 q1, q9, %q[c6] \n\t" + "vmla.i32 q2, q10, %q[c6] \n\t" + "vmla.i32 q3, q11, %q[c6] \n\t" + "vrshrn.i32 d8, q0, #8 \n\t" + "vrshrn.i32 d9, q1, #8 \n\t" + "vrshrn.i32 d10, q2, #8 \n\t" + "vrshrn.i32 d11, q3, #8 \n\t" + "vst4.16 {d8-d11}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*2), + [c4] "w" (vc4s32), [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +*/ + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + int32x4_t lane0 = vld1q_s32(lane + x - 2); + int32x4_t lane4 = vld1q_s32(lane + x + 2); + int32x4_t lane1 = vld1q_s32(lane + x - 1); + int32x4_t lane3 = vld1q_s32(lane + x + 1); + int32x4_t lane2 = vld1q_s32(lane + x + 0); + + int32x4_t ln04 = vaddq_s32(lane0, lane4); + int32x4_t ln13 = vaddq_s32(lane1, lane3); + + int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32); + int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32); + + int16x4_t ls = vrshrn_n_s32(lsw, 8); + + vst1_s16(dst + x, ls); + } + break; + } + for (s32 h = 0; h < cn; ++h) + { + s32* ln = lane + h; + s16* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = (s16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8); + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector _tmp; + s32 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector _buf(cn * (size.width + 4) + 32 / sizeof(s32)); + s32* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + int32x4_t vc6s32 = vmovq_n_s32(6); + int32x4_t vc4s32 = vmovq_n_s32(4); + + for (size_t i = 0; i < size.height; ++i) + { + s32* dst = 
internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const s32* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const s32* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const s32* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const s32* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + int32x4_t v0 = vld1q_s32(ln0+x); + int32x4_t v1 = vld1q_s32(ln1+x); + int32x4_t v2 = vld1q_s32(ln2+x); + int32x4_t v3 = vld1q_s32(ln3+x); + int32x4_t v4 = vld1q_s32(ln4+x); + + int32x4_t v = vaddq_s32(v0, v4); + int32x4_t v13 = vaddq_s32(v1, v3); + + v = vmlaq_s32(v, v2, vc6s32); + v = vmlaq_s32(v, v13, vc4s32); + + vst1q_s32(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + int32x4_t lane0 = vld1q_s32(lane + x - 2); + int32x4_t lane4 = vld1q_s32(lane + x + 2); + int32x4_t lane1 = vld1q_s32(lane + x - 1); + int32x4_t lane3 = vld1q_s32(lane + x + 1); + int32x4_t lane2 = vld1q_s32(lane + x + 0); + + int32x4_t ln04 = vaddq_s32(lane0, lane4); + int32x4_t ln13 = vaddq_s32(lane1, lane3); + + int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32); + int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32); + + vst1q_s32(dst + x, lsw); + } + for (s32 h = 0; h < cn; ++h) + { + s32* ln = lane + h; + s32* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k]; + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/in_range.cpp b/3rdparty/carotene/src/in_range.cpp new file mode 100644 index 0000000000..b79a237e39 --- /dev/null +++ b/3rdparty/carotene/src/in_range.cpp @@ -0,0 +1,195 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
+inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
+inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
+
+template <typename T, size_t elsize> struct vtail
+{
+    static inline void inRange(const T *, const T *, const T *,
+                               u8 *, size_t &, size_t)
+    {
+        //do nothing since there couldn't be enough data
+    }
+};
+template <typename T> struct vtail<T, 2>
+{
+    static inline void inRange(const T * src, const T * rng1, const T * rng2,
+                               u8 * dst, size_t &x, size_t width)
+    {
+        typedef typename internal::VecTraits<T>::vec128 vec128;
+        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
+        //There are no more than 15 elements in the tail, so we can handle an 8-element vector at most once
+        if( x + 8 < width)
+        {
+            vec128 vs = internal::vld1q( src + x);
+            vec128 vr1 = internal::vld1q(rng1 + x);
+            vec128 vr2 = internal::vld1q(rng2 + x);
+            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
+            internal::vst1(dst + x, internal::vmovn(vd));
+            x+=8;
+        }
+    }
+};
+template <typename T> struct vtail<T, 1>
+{
+    static inline void inRange(const T * src, const T * rng1, const T * rng2,
+                               u8 * dst, size_t &x, size_t width)
+    {
+        typedef typename internal::VecTraits<T>::vec128 vec128;
+        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
+        typedef typename internal::VecTraits<T>::vec64 vec64;
+        typedef typename internal::VecTraits<T>::unsign::vec64 uvec64;
+        //There are no more than 31 elements in the tail, so we can handle 16+8, 16, or 8 elements at once
+        if( x + 16 < width)
+        {
+            vec128 vs = internal::vld1q( src + x);
+            vec128 vr1 = internal::vld1q(rng1 + x);
+            vec128 vr2 = internal::vld1q(rng2 + x);
+            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
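+            // for 8-bit element types the comparison mask is already the final u8 result, so it can be stored without narrowing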
+            internal::vst1q(dst + x, vd);
+            x+=16;
+        }
+        if( x + 8 < width)
+        {
+            vec64 vs = internal::vld1( src + x);
+            vec64 vr1 = internal::vld1(rng1 + x);
+            vec64 vr2 = internal::vld1(rng2 + x);
+            uvec64 vd = internal::vand(internal::vcge(vs, vr1), internal::vcge(vr2, vs));
+            internal::vst1(dst + x, vd);
+            x+=8;
+        }
+    }
+};
+
+template <typename T>
+inline void inRangeCheck(const Size2D &_size,
+                         const T * srcBase, ptrdiff_t srcStride,
+                         const T * rng1Base, ptrdiff_t rng1Stride,
+                         const T * rng2Base, ptrdiff_t rng2Stride,
+                         u8 * dstBase, ptrdiff_t dstStride)
+{
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+    typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
+
+    Size2D size(_size);
+    if (srcStride == dstStride &&
+        srcStride == rng1Stride &&
+        srcStride == rng2Stride &&
+        srcStride == (ptrdiff_t)(size.width))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    const size_t width = size.width & ~( 32/sizeof(T) - 1 );
+
+    for(size_t j = 0; j < size.height; ++j)
+    {
+        const T * src = internal::getRowPtr( srcBase, srcStride, j);
+        const T * rng1 = internal::getRowPtr(rng1Base, rng1Stride, j);
+        const T * rng2 = internal::getRowPtr(rng2Base, rng2Stride, j);
+        u8 * dst = internal::getRowPtr( dstBase, dstStride, j);
+        size_t i = 0;
+        for( ; i < width; i += 32/sizeof(T) )
+        {
+            internal::prefetch(src + i);
+            internal::prefetch(rng1 + i);
+            internal::prefetch(rng2 + i);
+
+            vec128 vs = internal::vld1q( src + i);
+            vec128 vr1 = internal::vld1q(rng1 + i);
+            vec128 vr2 = internal::vld1q(rng2 + i);
+            uvec128 vd1 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
+            vs = internal::vld1q( src + i + 16/sizeof(T));
+            vr1 = internal::vld1q(rng1 + i + 16/sizeof(T));
+            vr2 = internal::vld1q(rng2 + i + 16/sizeof(T));
+            uvec128 vd2 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
+            vnst(dst + i, vd1, vd2);
+        }
+        vtail<T, sizeof(T)>::inRange(src, rng1, rng2, dst, i, size.width);
+        for( ; i < size.width; i++ )
+            dst[i] = (u8)(-(rng1[i] <= src[i] && src[i] <= rng2[i]));
+    }
+}
+
+}
+
+#define INRANGEFUNC(T) \
+void inRange(const Size2D &_size, \
+             const T * srcBase, ptrdiff_t srcStride, \
+             const T * rng1Base, ptrdiff_t rng1Stride, \
+             const T * rng2Base, ptrdiff_t rng2Stride, \
+             u8 * dstBase, ptrdiff_t dstStride) \
+{ \
+    internal::assertSupportedConfiguration(); \
+    inRangeCheck(_size, srcBase, srcStride, \
+                 rng1Base, rng1Stride, rng2Base, rng2Stride, \
+                 dstBase, dstStride); \
+}
+#else
+#define INRANGEFUNC(T) \
+void inRange(const Size2D &, \
+             const T *, ptrdiff_t, \
+             const T *, ptrdiff_t, \
+             const T *, ptrdiff_t, \
+             u8 *, ptrdiff_t) \
+{ \
+    internal::assertSupportedConfiguration(); \
+}
+#endif
+
+INRANGEFUNC(u8)
+INRANGEFUNC(s8)
+INRANGEFUNC(u16)
+INRANGEFUNC(s16)
+INRANGEFUNC(s32)
+INRANGEFUNC(f32)
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/integral.cpp b/3rdparty/carotene/src/integral.cpp
new file mode 100644
index 0000000000..56c919500e
--- /dev/null
+++ b/3rdparty/carotene/src/integral.cpp
@@ -0,0 +1,238 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +void integral(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumBase, ptrdiff_t sumStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint32x4_t v_zero = vmovq_n_u32(0u); + + // the first iteration + const u8 * src = internal::getRowPtr(srcBase, srcStride, 0); + u32 * sum = internal::getRowPtr(sumBase, sumStride, 0); + + uint32x4_t prev = v_zero; + size_t j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sum + j); + internal::prefetch(src + j); + + uint8x8_t el8shr0 = vld1_u8(src + j); + uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8)); + uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16)); + uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24)); + + uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2); + uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3); + + uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03); + uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8)); + + uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8)); + uint32x4_t vsumh = vaddw_u16(prev, el4h); + + vst1q_u32(sum + j, vsuml); + vst1q_u32(sum + j + 4, vsumh); + + prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3)); + } + + for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j) + sum[j] = (v += src[j]); + + // the others + for (size_t i = 1; i < size.height ; ++i) + { + src = internal::getRowPtr(srcBase, srcStride, i); + u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1); + sum = internal::getRowPtr(sumBase, sumStride, i); + + prev = v_zero; + j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sum + j); + internal::prefetch(src + j); + + uint32x4_t vsuml = vld1q_u32(prevSum + j); + uint32x4_t vsumh = vld1q_u32(prevSum + j + 4); + + uint8x8_t el8shr0 = vld1_u8(src + j); + uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8)); + 
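// together with the 16- and 24-bit shifts just below, this one-lane shift lets every 16-bit lane accumulate x[i-3]+x[i-2]+x[i-1]+x[i], i.e. a running sum built entirely in registers +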
uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16)); + uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24)); + + vsuml = vaddq_u32(vsuml, prev); + vsumh = vaddq_u32(vsumh, prev); + + uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2); + uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3); + + uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03); + uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8)); + + vsuml = vaddw_u16(vsuml, vget_low_u16(el8)); + vsumh = vaddw_u16(vsumh, el4h); + + vst1q_u32(sum + j, vsuml); + vst1q_u32(sum + j + 4, vsumh); + + prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3)); + } + + for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j) + sum[j] = (v += src[j]) + prevSum[j]; + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)sumBase; + (void)sumStride; +#endif +} + +void sqrIntegral(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sqsumBase, ptrdiff_t sqsumStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t v_zero8 = vmovq_n_u16(0u); + + // the first iteration + const u8 * src = internal::getRowPtr(srcBase, srcStride, 0); + f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0); + + double prev = 0.; + size_t j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sqsum + j); + internal::prefetch(src + j); + + uint8x8_t vsrc = vld1_u8(src + j); + + uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc); + uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7); + + uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1)); + uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1)); + + uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h); + + uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l)); + uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l)); + uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h)); + + u32 buf[8]; + vst1_u32(buf, vget_low_u32(el8shr01l)); + vst1_u32(buf+2, el2l); + vst1_u32(buf+4, el2hl); + vst1_u32(buf+6, el2hh); + for(u32 k=0; k < 8; k++) + sqsum[j+k] = prev + buf[k]; + prev += buf[7]; + } + + for (; j < size.width; ++j) + sqsum[j] = (prev += src[j]*src[j]); + + // the others + for (size_t i = 1; i < size.height ; ++i) + { + src = internal::getRowPtr(srcBase, srcStride, i); + f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1); + sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i); + + prev = 0.; + j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sqsum + j); + internal::prefetch(src + j); + + uint8x8_t vsrc = vld1_u8(src + j); + + uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc); + uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7); + + uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1)); + uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1)); + + uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h); + + uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l)); + uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l)); + uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h)); + + u32 buf[8]; + vst1_u32(buf, vget_low_u32(el8shr01l)); + vst1_u32(buf+2, el2l); + vst1_u32(buf+4, el2hl); + vst1_u32(buf+6, el2hh); + for(u32 k=0; k < 8; k++) + sqsum[j+k] = prev + prevSqSum[j+k] + buf[k]; + prev += buf[7]; + } + + for (; j < size.width; ++j) + sqsum[j] = 
(prev += src[j]*src[j]) + prevSqSum[j]; + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)sqsumBase; + (void)sqsumStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/intrinsics.hpp b/3rdparty/carotene/src/intrinsics.hpp new file mode 100644 index 0000000000..062a3f897b --- /dev/null +++ b/3rdparty/carotene/src/intrinsics.hpp @@ -0,0 +1,112 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#ifndef CAROTENE_INTRINSICS_HPP
+#define CAROTENE_INTRINSICS_HPP
+
+#include <carotene/definitions.hpp>
+
+#include <arm_neon.h>
+
+namespace CAROTENE_NS { namespace internal {
+
+/////////////// Custom NEON intrinsics ///////////////////
+
+// calculate reciprocal value
+
+inline float32x4_t vrecpq_f32(float32x4_t val)
+{
+    float32x4_t reciprocal = vrecpeq_f32(val);
+    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
+    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
+    return reciprocal;
+}
+
+inline float32x2_t vrecp_f32(float32x2_t val)
+{
+    float32x2_t reciprocal = vrecpe_f32(val);
+    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
+    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
+    return reciprocal;
+}
+
+// calculate sqrt value
+
+inline float32x4_t vrsqrtq_f32(float32x4_t val)
+{
+    float32x4_t e = vrsqrteq_f32(val);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
+    return e;
+}
+
+inline float32x2_t vrsqrt_f32(float32x2_t val)
+{
+    float32x2_t e = vrsqrte_f32(val);
+    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
+    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
+    return e;
+}
+
+inline float32x4_t vsqrtq_f32(float32x4_t val)
+{
+    return vrecpq_f32(vrsqrtq_f32(val));
+}
+
+inline float32x2_t vsqrt_f32(float32x2_t val)
+{
+    return vrecp_f32(vrsqrt_f32(val));
+}
+
+// table lookup with the table in a 128-bit register
+
+inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
+{
+#ifdef __aarch64__
+    // AArch64 supports this natively
+    return ::vqtbl1_u8(a, b);
+#else
+    union { uint8x16_t v; uint8x8x2_t w; } u = { a };
+    return vtbl2_u8(u.w, b);
+#endif
+}
+
+} }
+
+#endif
diff --git a/3rdparty/carotene/src/laplacian.cpp b/3rdparty/carotene/src/laplacian.cpp
new file mode 100644
index 0000000000..b9148de1b4
--- /dev/null
+++ b/3rdparty/carotene/src/laplacian.cpp
@@ -0,0 +1,713 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "saturate_cast.hpp" + +#include + +namespace CAROTENE_NS { + +bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 8 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE); +} + +void Laplacian3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border)); +#ifdef CAROTENE_NEON + const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3); + const uint16x8_t v_zero = vdupq_n_u16(0); + const uint8x8_t v_border = vdup_n_u8(borderValue); + + uint8x8_t vsub; + uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero; + uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero; + + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + s16 prevx = 0, currx = 0, nextx = 0; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8); + + // perform vertical convolution + for ( ; x <= bwidth; x += 8) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 8 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx = borderValue; + else + prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue); + + currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? 
srow0[x3] : borderValue);
+            }
+
+            // make shift
+            if (x)
+            {
+                tprev = tcurr;
+                tcurr = tnext;
+            }
+
+            // and calculate next value
+            tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
+
+            // make extrapolation for the first elements
+            if (!x)
+            {
+                // make border
+                if (border == BORDER_MODE_CONSTANT)
+                    tcurr = v_border_x3;
+                else if (border == BORDER_MODE_REPLICATE)
+                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
+
+                vsub = x1;
+
+                continue;
+            }
+
+            // combine 3 "shifted" vectors
+            t0 = vextq_u16(tprev, tcurr, 7);
+            t1 = tcurr;
+            t2 = vextq_u16(tcurr, tnext, 1);
+
+            // and add them
+            t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
+
+            int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
+                                      vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
+            uint8x8_t it0 = vqmovun_s16(tt0);
+            vst1_u8(drow + x - 8, it0);
+
+            vsub = x1;
+        }
+
+        x -= 8;
+        if (x == width)
+            --x;
+
+        for ( ; x < width; ++x)
+        {
+            // make extrapolation for the last elements
+            if (x + 1 >= width)
+            {
+                if (border == BORDER_MODE_CONSTANT)
+                    nextx = borderValue * 3;
+                else if (border == BORDER_MODE_REPLICATE)
+                    nextx = srow2[x] + srow1[x] + srow0[x];
+            }
+            else
+            {
+                nextx = (srow2 ? srow2[x + 1] : borderValue) +
+                        srow1[x + 1] +
+                        (srow0 ? srow0[x + 1] : borderValue);
+            }
+
+            s32 val = (prevx + currx + nextx) - 9 * srow1[x];
+            drow[x] = internal::saturate_cast<u8>((s32)val);
+
+            // make shift
+            prevx = currx;
+            currx = nextx;
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)border;
+    (void)borderValue;
+#endif
+}
+
+bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
+{
+    return isSupportedConfiguration() &&
+           size.width >= 8 && size.height >= 1 &&
+           (border == BORDER_MODE_CONSTANT ||
+            border == BORDER_MODE_REFLECT ||
+            border == BORDER_MODE_REFLECT101 ||
+            border == BORDER_MODE_REPLICATE);
+}
+
+void Laplacian1OpenCV(const Size2D &size,
+                      const u8 * srcBase, ptrdiff_t srcStride,
+                      s16 * dstBase, ptrdiff_t dstStride,
+                      BORDER_MODE border, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
+#ifdef CAROTENE_NEON
+    ptrdiff_t rows = size.height, cols = size.width;
+
+    std::vector<u8> _tmp;
+    u8 *tmp = 0;
+    if (border == BORDER_MODE_CONSTANT)
+    {
+        _tmp.assign(cols + 4,borderValue);
+        tmp = &_tmp[2];
+    }
+
+    for( ptrdiff_t y = 0; y < rows; y++ )
+    {
+        const u8* v0 = 0;
+        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8* v2 = 0;
+        // make border
+        if (border == BORDER_MODE_REFLECT101) {
+            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
+            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
+        } else if (border == BORDER_MODE_CONSTANT) {
+            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+        } else {
+            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
+        }
+        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        int16x8_t tcurr = vmovq_n_s16(0x0);
+        int16x8_t tnext = vmovq_n_s16(0x0);
+        int16x8_t t0, t2;
+        uint8x8_t xx0 = vmov_n_u8(0x0);
+        uint8x8_t xx1 = vmov_n_u8(0x0);
+        uint8x8_t xx2 = vmov_n_u8(0x0);
+        ptrdiff_t x = 0;
+        const ptrdiff_t bcols = y + 2 < rows ? 
cols : (cols - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(v0 + x); + internal::prefetch(v1 + x); + internal::prefetch(v2 + x); + + uint8x8_t x0 = vld1_u8(v0 + x); + uint8x8_t x1 = vld1_u8(v1 + x); + uint8x8_t x2 = vld1_u8(v2 + x); + + if(x) { + xx0 = xx1; + xx1 = xx2; + } else { + xx1 = x1; + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7); + } + else if (border == BORDER_MODE_CONSTANT) + { + xx1 = vset_lane_u8(borderValue, x1, 7); + } + else if (border == BORDER_MODE_REFLECT101) + { + xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7); + } + } + xx2 = x1; + + if(x) { + tcurr = tnext; + } + tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)), + vreinterpretq_s16_u16(vshll_n_u8(x1, 2))); + + if(!x) { + tcurr = tnext; + continue; + } + t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7))); + t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1))); + t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr); + + vst1q_s16(drow + x - 8, t0); + } + + x -= 8; + if(x == cols){ + x--; + } + + for( ; x < cols; x++ ) + { + s16 nextx; + s16 prevx; + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + prevx = x == 0 ? v1[0] : v1[x-1]; + nextx = x == cols-1 ? v1[x] : v1[x+1]; + } + else if (border == BORDER_MODE_REFLECT101) + { + prevx = x == 0 ? v1[1] : v1[x-1]; + nextx = x == cols-1 ? v1[x-1] : v1[x+1]; + } + else //if (border == BORDER_MODE_CONSTANT) + { + prevx = x == 0 ? borderValue : v1[x-1]; + nextx = x == cols-1 ? borderValue : v1[x+1]; + } + *(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +void Laplacian3OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border)); +#ifdef CAROTENE_NEON + ptrdiff_t rows = size.height, cols = size.width; + + std::vector _tmp; + u8 *tmp = 0; + if (border == BORDER_MODE_CONSTANT) + { + _tmp.assign(cols + 4,borderValue); + tmp = &_tmp[2]; + } + + for( ptrdiff_t y = 0; y < rows; y++ ) + { + const u8* v0 = 0; + const u8* v1 = internal::getRowPtr(srcBase, srcStride, y); + const u8* v2 = 0; + // make border + if (border == BORDER_MODE_REFLECT101) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1); + v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0); + } else if (border == BORDER_MODE_CONSTANT) { + v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + } else { + v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0); + } + s16* drow = internal::getRowPtr(dstBase, dstStride, y); + + int16x8_t tprev = vmovq_n_s16(0x0); + int16x8_t tcurr = vmovq_n_s16(0x0); + int16x8_t tnext = vmovq_n_s16(0x0); + int16x8_t tc = vmovq_n_s16(0x0); + int16x8_t t0, t2, tcnext; + ptrdiff_t x = 0; + const ptrdiff_t bcols = y + 2 < rows ? 
cols : (cols - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(v0 + x); + internal::prefetch(v1 + x); + internal::prefetch(v2 + x); + + uint8x8_t x0 = vld1_u8(v0 + x); + uint8x8_t x1 = vld1_u8(v1 + x); + uint8x8_t x2 = vld1_u8(v2 + x); + tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2)); + + if(x) { + tprev = tcurr; + tcurr = tnext; + } + tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2)); + + if(!x) { + tcurr = tnext; + tc = tcnext; + + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7); + } + else if (border == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_s16(borderValue, tcurr, 7); + } + else if (border == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7); + } + continue; + } + + t0 = vextq_s16(tprev, tcurr, 7); + t2 = vextq_s16(tcurr, tnext, 1); + + t0 = vsubq_s16(vqaddq_s16(t0, t2), tc); + tc = tcnext; + + t0 = vshlq_n_s16(t0, 1); + vst1q_s16(drow + x - 8, t0); + } + x -= 8; + if(x == cols){ + x--; + } + + for( ; x < cols; x++ ) + { + s16 nextx, nextx2; + s16 prevx, prevx2; + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + prevx = x == 0 ? v0[0] : v0[x-1]; + prevx2 = x == 0 ? v2[0] : v2[x-1]; + nextx = x == cols-1 ? v0[x] : v0[x+1]; + nextx2 = x == cols-1 ? v2[x] : v2[x+1]; + } + else if (border == BORDER_MODE_REFLECT101) + { + prevx = x == 0 ? v0[1] : v0[x-1]; + prevx2 = x == 0 ? v2[1] : v2[x-1]; + nextx = x == cols-1 ? v0[x-1] : v0[x+1]; + nextx2 = x == cols-1 ? v2[x-1] : v2[x+1]; + } + else //if (border == BORDER_MODE_CONSTANT) + { + prevx = x == 0 ? borderValue : v0[x-1]; + prevx2 = x == 0 ? borderValue : v2[x-1]; + nextx = x == cols-1 ? borderValue : v0[x+1]; + nextx2 = x == cols-1 ? borderValue : v2[x+1]; + } + s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2; + *(drow+x) = 2*res; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +void Laplacian5OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border)); +#ifdef CAROTENE_NEON + ptrdiff_t rows = size.height, cols = size.width; + + std::vector _tmp; + u8 *tmp = 0; + if (border == BORDER_MODE_CONSTANT) + { + _tmp.assign(cols + 4,borderValue); + tmp = &_tmp[2]; + } + + for( ptrdiff_t y = 0; y < rows; y++ ) + { + const u8* v0 = 0; + const u8* v1 = 0; + const u8* v2 = internal::getRowPtr(srcBase, srcStride, y); + const u8* v3 = 0; + const u8* v4 = 0; + // make border + if (border == BORDER_MODE_REPLICATE) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0); + v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0); + v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0); + } else if (border == BORDER_MODE_REFLECT) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0); + v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0); + v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 
2*rows-(y+3) : 0); + } else if (border == BORDER_MODE_REFLECT101) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0); ///check + v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0); + v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0); + v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);///bad if rows=2 y=1 rows - 4 + (2,1) + } else if (border == BORDER_MODE_CONSTANT) { + v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp; + v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp; + } + s16* drow = internal::getRowPtr(dstBase, dstStride, y); + + int16x8_t tnext, tc, t0; + int16x8_t tnext2, tnext3; + int16x8_t tnext1Old, tnext2Old, tnext3Old; + int16x8_t tnext4OldOldOld, tnext5OldOldOld; + + int16x8_t tcurr1 = vmovq_n_s16(0x0); + int16x8_t tnext1 = vmovq_n_s16(0x0); + int16x8_t tprev1 = vmovq_n_s16(0x0); + int16x8_t tpprev1 = vmovq_n_s16(0x0); + int16x8_t tppprev1 = vmovq_n_s16(0x0); + + int16x8_t tnext4Old = vmovq_n_s16(0x0); + int16x8_t tnext5Old = vmovq_n_s16(0x0); + int16x8_t tnext1OldOld = vmovq_n_s16(0x0); + int16x8_t tnext2OldOld = vmovq_n_s16(0x0); + int16x8_t tnext3OldOld = vmovq_n_s16(0x0); + int16x8_t tnext4OldOld = vmovq_n_s16(0x0); + int16x8_t tnext5OldOld = vmovq_n_s16(0x0); + + // do vertical convolution + ptrdiff_t x = 0; + const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(v0 + x); + internal::prefetch(v1 + x); + internal::prefetch(v2 + x); + internal::prefetch(v3 + x); + internal::prefetch(v4 + x); + + uint8x8_t x0 = vld1_u8(v0 + x); + uint8x8_t x1 = vld1_u8(v1 + x); + uint8x8_t x2 = vld1_u8(v2 + x); + uint8x8_t x3 = vld1_u8(v3 + x); + uint8x8_t x4 = vld1_u8(v4 + x); + if(x) { + tcurr1 = tnext1; + } + + tnext4OldOldOld = tnext4Old; + tnext5OldOldOld = tnext5Old; + tnext1Old = tnext1OldOld; + tnext2Old = tnext2OldOld; + tnext3Old = tnext3OldOld; + tnext4Old = tnext4OldOld; + tnext5Old = tnext5OldOld; + + tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1))); + tnext3 = vshlq_n_s16(tnext3, 1); + + tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2)); + tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0)); + tnext2 = vsubq_s16(tc, tnext); + + tnext1 = vaddq_s16(tnext3, tnext2); + // tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4 + + tnext2 = vshlq_n_s16(tnext2, 1); + // tnext2 = 2*x4 - 4*x2 + 2*x0 + + tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1)); + // tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4 + + tnext1OldOld = tnext1; + tnext2OldOld = tnext2; + tnext3OldOld = tnext3; + tnext4OldOld = tnext2; + tnext5OldOld = tnext1; + + if(x) { + tnext1 = vextq_s16(tnext1Old, tnext1, 2); + tcurr1 = vextq_s16(tnext2Old, tnext2, 1); + tprev1 = tnext3Old; + + if(x!=8) { + tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7); + tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6); + } + } + + if(!x) { + // make border + if (border == BORDER_MODE_REPLICATE) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1); + } else if (border == BORDER_MODE_REFLECT) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 = 
vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1); + } else if (border == BORDER_MODE_REFLECT101) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0); + } else if (border == BORDER_MODE_CONSTANT) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(borderValue, tprev1, 0); + tprev1 = vsetq_lane_s16(borderValue, tprev1, 1); + } + tppprev1 = tprev1; + continue; + } + + t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1); + t0 = vaddq_s16(t0, t0); + vst1q_s16(drow + x - 8, t0); + } + x -= 8; + if(x >= cols - 1) + x = cols-2; + + s16 pprevx = 0; + s16 prevx = 0; + s16 nextx = 0; + s16 nnextx = 0; + + for( ; x < cols; x++ ) + { + if (x == 0) { + // make border + if (border == BORDER_MODE_REPLICATE) { + pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0]; + prevx = 2*v0[0] - 4*v2[0] + 2*v4[0]; + } else if (border == BORDER_MODE_REFLECT) { + pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1]; + prevx = 2*v0[0] - 4*v2[0] + 2*v4[0]; + } else if (border == BORDER_MODE_REFLECT101) { + pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2]; + prevx = 2*v0[1] - 4*v2[1] + 2*v4[1]; + } else if (border == BORDER_MODE_CONSTANT) { + pprevx = 8 * borderValue; + prevx = 0; + } + } else if (x == 1) { + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) { + pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0]; + } else if (border == BORDER_MODE_REFLECT101) { + pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1]; + } else if (border == BORDER_MODE_CONSTANT) { + pprevx = 8 * borderValue; + } + prevx = 2*v0[0] - 4*v2[0] + 2*v4[0]; + } else { + pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2]; + prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1]; + } + s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x]; + if (x == cols-1) { + // make border + if (border == BORDER_MODE_REPLICATE) { + nextx = 2*v0[x] - 4*v2[x] + 2*v4[x]; + nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x]; + } else if (border == BORDER_MODE_REFLECT) { + nextx = 2*v0[x] - 4*v2[x] + 2*v4[x]; + nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1]; + } else if (border == BORDER_MODE_REFLECT101) { + nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1]; + nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2]; + } else if (border == BORDER_MODE_CONSTANT) { + nextx = 0; + nnextx = 8 * borderValue; + } + } else if (x == cols-2) { + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) { + nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1]; + } else if (border == BORDER_MODE_REFLECT101) { + nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x]; + } else if (border == BORDER_MODE_CONSTANT) { + nnextx = 8 * borderValue; + } + nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1]; + } else { + nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1]; + nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2]; + } + s16 res = pprevx + prevx + currx + nextx + nnextx; + *(drow+x) = 2*res; + } + } +#else + (void)size; + (void)srcBase; 
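+    // the void-casts only mark the arguments as used when the NEON path is compiled out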
+ (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/magnitude.cpp b/3rdparty/carotene/src/magnitude.cpp new file mode 100644 index 0000000000..cd9d82bf6c --- /dev/null +++ b/3rdparty/carotene/src/magnitude.cpp @@ -0,0 +1,160 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include <cmath>
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+struct Magnitude
+{
+    typedef s16 type;
+
+    void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1,
+                     int16x8_t & v_dst) const
+    {
+        int16x4_t v_src0_p = vget_low_s16(v_src0), v_src1_p = vget_low_s16(v_src1);
+        float32x4_t v_sqr0 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
+                                       vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
+        v_src0_p = vget_high_s16(v_src0);
+        v_src1_p = vget_high_s16(v_src1);
+        float32x4_t v_sqr1 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
+                                       vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
+
+        int32x4_t v_sqrt0 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr0));
+        int32x4_t v_sqrt1 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr1));
+
+        v_dst = vcombine_s16(vqmovn_s32(v_sqrt0), vqmovn_s32(v_sqrt1));
+    }
+
+    void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1,
+                     int16x4_t & v_dst) const
+    {
+        float32x4_t v_tmp = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0, v_src0)),
+                                      vcvtq_f32_s32(vmull_s16(v_src1, v_src1)));
+        int32x4_t v_sqrt = vcvtq_s32_f32(internal::vsqrtq_f32(v_tmp));
+        v_dst = vqmovn_s32(v_sqrt);
+    }
+
+    void operator() (const short * src0, const short * src1, short * dst) const
+    {
+        f32 src0val = (f32)src0[0], src1val = (f32)src1[0];
+        dst[0] = internal::saturate_cast<s16>((s32)sqrtf(src0val * src0val + src1val * src1val));
+    }
+};
+
+struct MagnitudeF32
+{
+    typedef f32 type;
+
+    void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1,
+                     float32x4_t & v_dst) const
+    {
+        v_dst = internal::vsqrtq_f32(vaddq_f32(vmulq_f32(v_src0, v_src0), vmulq_f32(v_src1, v_src1)));
+    }
+
+    void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1,
+                     float32x2_t & v_dst) const
+    {
+        v_dst = internal::vsqrt_f32(vadd_f32(vmul_f32(v_src0, v_src0), vmul_f32(v_src1, v_src1)));
+    }
+
+    void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
+    {
+        dst[0] = sqrtf(src0[0] * src0[0] + src1[0] * src1[0]);
+    }
+};
+
+} // namespace
+
+#endif
+
+void magnitude(const Size2D &size,
+               const s16 * src0Base, ptrdiff_t src0Stride,
+               const s16 * src1Base, ptrdiff_t src1Stride,
+               s16 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         Magnitude());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+void magnitude(const Size2D &size,
+               const f32 * src0Base, ptrdiff_t src0Stride,
+               const f32 * src1Base, ptrdiff_t src1Stride,
+               f32 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         MagnitudeF32());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/meanstddev.cpp b/3rdparty/carotene/src/meanstddev.cpp
new file mode 100644
index 0000000000..a847493429
--- /dev/null
+++ b/3rdparty/carotene/src/meanstddev.cpp
@@ -0,0 +1,163 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include + +namespace CAROTENE_NS { + +void meanStdDev(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + f64 fsum = 0.0f, fsqsum = 0.0f; + sqsum(size, srcBase, srcStride, &fsum, &fsqsum, 1); + + // calc mean and stddev + f64 itotal = 1.0 / size.total(); + f64 mean = fsum * itotal; + f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0)); + + if (pMean) + *pMean = mean; + if (pStdDev) + *pStdDev = stddev; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMean; + (void)pStdDev; +#endif +} + +void meanStdDev(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3; + f64 fsum = 0.0f, fsqsum = 0.0f; + + f32 arsum[8]; + uint32x4_t v_zero = vdupq_n_u32(0u), v_sum; + float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum; + + for (size_t i = 0; i < size.height; ++i) + { + const u16 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0u; + + while (j < roiw4) + { + size_t blockSize = std::min(roiw4 - j, blockSize0) + j; + v_sum = v_zero; + v_sqsum = v_zero_f; + + for ( ; j + 16 < blockSize ; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8); + + // 0 + uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0)); + uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0)); + v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); + float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo); + float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi); + v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); + 
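+ // The unrolled body accumulates two quantities per iteration: v_sum
+ // collects the widened u32 lane sums, while v_sqsum collects the squared
+ // values in f32 via multiply-accumulate (vmlaq_f32). The low and high
+ // halves of each uint16x8_t are widened and handled identically.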
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); + + // 1 + v_srclo = vmovl_u16(vget_low_u16(v_src1)); + v_srchi = vmovl_u16(vget_high_u16(v_src1)); + v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); + v_srclo_f = vcvtq_f32_u32(v_srclo); + v_srchi_f = vcvtq_f32_u32(v_srchi); + v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); + v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); + } + + for ( ; j < blockSize; j += 4) + { + uint32x4_t v_src = vmovl_u16(vld1_u16(src + j)); + float32x4_t v_src_f = vcvtq_f32_u32(v_src); + v_sum = vaddq_u32(v_sum, v_src); + v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f); + } + + vst1q_f32(arsum, vcvtq_f32_u32(v_sum)); + vst1q_f32(arsum + 4, v_sqsum); + + fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3]; + fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7]; + } + + // collect a few last elements in the current row + for ( ; j < size.width; ++j) + { + f32 srcval = src[j]; + fsum += srcval; + fsqsum += srcval * srcval; + } + } + + // calc mean and stddev + f64 itotal = 1.0 / size.total(); + f64 mean = fsum * itotal; + f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0)); + + if (pMean) + *pMean = mean; + if (pStdDev) + *pStdDev = stddev; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMean; + (void)pStdDev; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/median_filter.cpp b/3rdparty/carotene/src/median_filter.cpp new file mode 100644 index 0000000000..8c5d08b7ee --- /dev/null +++ b/3rdparty/carotene/src/median_filter.cpp @@ -0,0 +1,227 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" + +/* + * The code here is based on the code in + * , which is in public domain. + * See also . + */ + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON +namespace { + + uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn) + { + u8 buf[16+8]; + vst1q_u8(buf+cn, r); + for (u32 i = 0; i < cn; ++i) buf[i] = buf[cn+i]; + return vld1q_u8(buf); + } + + uint8x8_t getRightReplicate(uint8x8_t r, u32 cn) + { + u8 buf[8+8]; + vst1_u8(buf, r); + for (u32 i = 0; i < cn; ++i) buf[8+i] = buf[8-cn+i]; + return vld1_u8(buf+cn); + } + +} // namespace + +//o------^-------^-----------------------------o 0 +// | | +//o--^---v---^---|-------^---------------------o 1 +// | | | | +//o--v-------v---|-------|-^-------^-------^---o 2 +// | | | | | +//o------^-------v-----^-|-|-------|-------|---o 3 +// | | | | | | +//o--^---v---^-----^---|-v-|---^---v---^---v---o 4 +// | | | | | | | +//o--v-------v---^-|---|---v---|-------|-------o 5 +// | | | | | +//o------^-------|-|---v-------|-------v-------o 6 +// | | | | +//o--^---v---^---|-v-----------v---------------o 7 +// | | | +//o--v-------v---v-----------------------------o 8 + +#define ELT(num, level) v ## num ## _lv ## level +#define PIX_SORT(a, alvl, b, blvl, newlvl) \ + PIX_MIN(a, alvl, b, blvl, newlvl); \ + PIX_MAX(a, alvl, b, blvl, newlvl); + +#define SORT9 \ + PIX_SORT(1, 00, 2, 00, 01); \ + PIX_SORT(4, 00, 5, 00, 02); \ + PIX_SORT(7, 00, 8, 00, 03); \ + PIX_SORT(0, 00, 1, 01, 04); \ + PIX_SORT(3, 00, 4, 02, 05); \ + PIX_SORT(6, 00, 7, 03, 06); \ + PIX_SORT(1, 04, 2, 01, 07); \ + PIX_SORT(4, 05, 5, 02, 08); \ + PIX_SORT(7, 06, 8, 03, 09); \ + PIX_MAX (0, 04, 3, 05, 10); \ + PIX_MIN (5, 08, 8, 09, 11); \ + PIX_SORT(4, 08, 7, 09, 12); \ + PIX_MAX (3, 10, 6, 06, 13); \ + PIX_MAX (1, 07, 4, 12, 14); \ + PIX_MIN (2, 07, 5, 11, 15); \ + PIX_MIN (4, 14, 7, 12, 16); \ + PIX_SORT(4, 16, 2, 15, 17); \ + PIX_MAX (6, 13, 4, 17, 18); \ + PIX_MIN (4, 18, 2, 17, 19); + +#endif + +bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels) +{ + return isSupportedConfiguration() && size.width >= 16 + numChannels && numChannels <= 8; +} + +void medianFilter3x3(const Size2D &size, u32 numChannels, + const u8 *srcBase, ptrdiff_t srcStride, + const Margin &srcMargin, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels)); +#ifdef CAROTENE_NEON + u32 cn = numChannels; + size_t colsn = size.width * cn; + + for (size_t i = 0; i < size.height; ++i) { + const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i); + const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride; + const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride; + u8* pdst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + { + uint8x16_t v3_lv00 = vld1q_u8(psrc0); + uint8x16_t v4_lv00 = vld1q_u8(psrc1); + uint8x16_t v5_lv00 = vld1q_u8(psrc2); + uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn); + uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn); + uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn); + uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn); + uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn); + uint8x16_t v2_lv00 = srcMargin.left > 0 ? 
vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn); + + goto medianBlur3x3_mainBody; + + for (; j < colsn - 16; j += 16) { + internal::prefetch(psrc0 + j); + internal::prefetch(psrc1 + j); + internal::prefetch(psrc2 + j); + + v0_lv00 = vld1q_u8(psrc0 + j - cn); + v1_lv00 = vld1q_u8(psrc1 + j - cn); + v2_lv00 = vld1q_u8(psrc2 + j - cn); + v3_lv00 = vld1q_u8(psrc0 + j); + v4_lv00 = vld1q_u8(psrc1 + j); + v5_lv00 = vld1q_u8(psrc2 + j); + v6_lv00 = vld1q_u8(psrc0 + j + cn); + v7_lv00 = vld1q_u8(psrc1 + j + cn); + v8_lv00 = vld1q_u8(psrc2 + j + cn); + +medianBlur3x3_mainBody: + +#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl)) +#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl)) + SORT9; +#undef PIX_MAX +#undef PIX_MIN + + vst1q_u8(pdst + j, v4_lv19); + } + } + + { + size_t k = colsn - 8; + uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn); + uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn); + uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn); + uint8x8_t v3_lv00 = vld1_u8(psrc0 + k); + uint8x8_t v4_lv00 = vld1_u8(psrc1 + k); + uint8x8_t v5_lv00 = vld1_u8(psrc2 + k); + uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn); + uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn); + uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn); + + goto medianBlur3x3_tailBody; + + for (; k >= j - 8; k -= 8) { + v0_lv00 = vld1_u8(psrc0 + k - cn); + v1_lv00 = vld1_u8(psrc1 + k - cn); + v2_lv00 = vld1_u8(psrc2 + k - cn); + v3_lv00 = vld1_u8(psrc0 + k); + v4_lv00 = vld1_u8(psrc1 + k); + v5_lv00 = vld1_u8(psrc2 + k); + v6_lv00 = vld1_u8(psrc0 + k + cn); + v7_lv00 = vld1_u8(psrc1 + k + cn); + v8_lv00 = vld1_u8(psrc2 + k + cn); + +medianBlur3x3_tailBody: + +#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl)) +#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl)) + SORT9; +#undef PIX_MAX +#undef PIX_MIN + + vst1_u8(pdst + k, v4_lv19); + } + } + } +#else + (void)size; + (void)numChannels; + (void)srcBase; + (void)srcStride; + (void)srcMargin; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/min_max.cpp b/3rdparty/carotene/src/min_max.cpp new file mode 100644 index 0000000000..d6f4017841 --- /dev/null +++ b/3rdparty/carotene/src/min_max.cpp @@ -0,0 +1,139 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ *
+ * * Neither the names of the copyright holders nor the names of the contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <algorithm>
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+template <typename T>
+struct Min
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vminq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vmin(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = std::min(src0[0], src1[0]);
+    }
+};
+
+template <typename T>
+struct Max
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vmaxq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vmax(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = std::max(src0[0], src1[0]);
+    }
+};
+
+} // namespace
+
+#define IMPL_OP(fun, op, type)                            \
+void fun(const Size2D &size,                              \
+         const type * src0Base, ptrdiff_t src0Stride,     \
+         const type * src1Base, ptrdiff_t src1Stride,     \
+         type * dstBase, ptrdiff_t dstStride)             \
+{                                                         \
+    internal::assertSupportedConfiguration();             \
+    internal::vtransform(size,                            \
+                         src0Base, src0Stride,            \
+                         src1Base, src1Stride,            \
+                         dstBase, dstStride, op<type>()); \
+}
+
+#else
+
+#define IMPL_OP(fun, op, type)                \
+void fun(const Size2D &,                      \
+         const type *, ptrdiff_t,             \
+         const type *, ptrdiff_t,             \
+         type *, ptrdiff_t)                   \
+{                                             \
+    internal::assertSupportedConfiguration(); \
+}
+
+#endif
+
+#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type)
+
+IMPL_MINMAX(u8)
+IMPL_MINMAX(s8)
+IMPL_MINMAX(u16)
+IMPL_MINMAX(s16)
+IMPL_MINMAX(u32)
+IMPL_MINMAX(s32)
+IMPL_MINMAX(f32)
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/minmaxloc.cpp b/3rdparty/carotene/src/minmaxloc.cpp
new file mode 100644
index 0000000000..a7f30bc4f8
--- /dev/null
+++ b/3rdparty/carotene/src/minmaxloc.cpp
@@ -0,0 +1,1340 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +#include + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template +void minMaxVals(const Size2D &size, + const T * srcBase, ptrdiff_t srcStride, + T * pMinVal, T * pMaxVal) +{ + using namespace internal; + + typedef typename VecTraits::vec128 vec128; + typedef typename VecTraits::vec64 vec64; + + u32 step_base = 32 / sizeof(T), step_tail = 8 / sizeof(T); + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; + size_t roiw_tail = size.width >= (step_tail - 1) ? 
size.width - step_tail + 1 : 0; + + T maxVal = std::numeric_limits::min(); + T minVal = std::numeric_limits::max(); + vec128 v_min_base = vdupq_n(minVal), v_max_base = vdupq_n(maxVal); + vec64 v_min_tail = vdup_n(minVal), v_max_tail = vdup_n(maxVal); + + for (size_t i = 0; i < size.height; ++i) + { + const T * src = getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for (; j < roiw_base; j += step_base) + { + prefetch(src + j); + vec128 v_src0 = vld1q(src + j), v_src1 = vld1q(src + j + 16 / sizeof(T)); + v_min_base = vminq(v_min_base, v_src0); + v_max_base = vmaxq(v_max_base, v_src0); + v_min_base = vminq(v_min_base, v_src1); + v_max_base = vmaxq(v_max_base, v_src1); + } + for (; j < roiw_tail; j += step_tail) + { + vec64 v_src0 = vld1(src + j); + v_min_tail = vmin(v_min_tail, v_src0); + v_max_tail = vmax(v_max_tail, v_src0); + } + + for (; j < size.width; j++) + { + T srcval = src[j]; + minVal = std::min(srcval, minVal); + maxVal = std::max(srcval, maxVal); + } + } + + // collect min & max values + T ar[16 / sizeof(T)]; + vst1q(ar, vcombine(vmin(v_min_tail, vmin(vget_low(v_min_base), vget_high(v_min_base))), + vmax(v_max_tail, vmax(vget_low(v_max_base), vget_high(v_max_base))))); + + for (size_t x = 0; x < 8u / sizeof(T); ++x) + { + minVal = std::min(minVal, ar[x]); + maxVal = std::max(maxVal, ar[x + 8 / sizeof(T)]); + } + + if (pMaxVal) + *pMaxVal = maxVal; + if (pMinVal) + *pMinVal = minVal; +} + +} // namespace + +#endif + +void minMaxVals(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * pMinVal, u8 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * pMinVal, s16 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * pMinVal, u16 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * pMinVal, s32 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const u32 * srcBase, ptrdiff_t srcStride, + u32 * pMinVal, u32 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 
0, i = 0; l < size.height; ++l, i = 0) + { + const f32 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 16) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c4 = vdupq_n_u32(4); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (4 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFFC); +#else + { + size_t bound = size.width - (4 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + float32x4_t n_min = vdupq_n_f32(minVal); + uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC); + float32x4_t n_max = vdupq_n_f32(maxVal); + uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC); + + for(; i < bound; i+=4) + { + internal::prefetch(src + i); + float32x4_t line = vld1q_f32(src + i); + + uint32x4_t minmask = vcltq_f32(line, n_min); + uint32x4_t maxmask = vcgtq_f32(line, n_max); + + n_min = vbslq_f32(minmask, line, n_min); + n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx); + n_max = vbslq_f32(maxmask, line, n_max); + n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx); + + // idx[] +=4 + lineIdxOffset = vaddq_u32(lineIdxOffset, c4); + } + + f32 fmin[4], fmax[4]; + u32 fminIdx[4], fmaxIdx[4]; + + vst1q_f32(fmin, n_min); + vst1q_f32(fmax, n_max); + + vst1q_u32(fminIdx, n_minIdx); + vst1q_u32(fmaxIdx, n_maxIdx); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 4; ++j) + { + f32 minval = fmin[j]; + f32 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + float val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + const u8 * maskBase, ptrdiff_t maskStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = std::numeric_limits::max(); + minCol = size.width; + minRow = size.height; + maxVal = -std::numeric_limits::max(); + maxCol = size.width; + maxRow = size.height; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const f32 * src = internal::getRowPtr( srcBase, srcStride, l); + const u8 * mask = internal::getRowPtr( maskBase, maskStride, l); + if (size.width >= 16) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t uOne = vdupq_n_u32(1); + uint32x4_t c4 = vdupq_n_u32(4); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (4 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFFC); +#else + { + size_t bound = size.width - (4 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + float32x4_t n_min = vdupq_n_f32(minVal); + uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC); + 
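+ // 0xffffFFFC acts as a "not updated" sentinel: offsets inside a block are
+ // always smaller than it, so after the vector loop a lane index below the
+ // sentinel means that lane actually observed a new extremum.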
float32x4_t n_max = vdupq_n_f32(maxVal); + uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC); + + for(; i < bound; i+=4) + { + internal::prefetch(src + i); + internal::prefetch(mask + i); + float32x4_t line = vld1q_f32(src + i); + uint8x8_t maskLine = vld1_u8(mask + i); + + uint32x4_t maskLine4 = vmovl_u16(vget_low_u16(vmovl_u8(maskLine))); + maskLine4 = vcgeq_u32(maskLine4, uOne); + + uint32x4_t minmask = vcltq_f32(line, n_min); + uint32x4_t maxmask = vcgtq_f32(line, n_max); + + minmask = vandq_u32(minmask, maskLine4); + maxmask = vandq_u32(maxmask, maskLine4); + + n_min = vbslq_f32(minmask, line, n_min); + n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx); + n_max = vbslq_f32(maxmask, line, n_max); + n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx); + + // idx[] +=4 + lineIdxOffset = vaddq_u32(lineIdxOffset, c4); + } + + f32 fmin[4], fmax[4]; + u32 fminIdx[4], fmaxIdx[4]; + + vst1q_f32(fmin, n_min); + vst1q_f32(fmax, n_max); + + vst1q_u32(fminIdx, n_minIdx); + vst1q_u32(fmaxIdx, n_maxIdx); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 4; ++j) + { + f32 minval = fmin[j]; + f32 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; i++ ) + { + if (!mask[i]) + continue; + f32 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)maskBase; + (void)maskStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 &minVal, size_t &minCol, size_t &minRow, + s32 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const s32 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 16) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c4 = vdupq_n_u32(4); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (4 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFFC); +#else + { + size_t bound = size.width - (4 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + int32x4_t n_min = vdupq_n_s32(minVal); + uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC); + int32x4_t n_max = vdupq_n_s32(maxVal); + uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC); + + for(; i < bound; i+=4 ) + { + internal::prefetch(src + i); + int32x4_t line = vld1q_s32(src + i); + + uint32x4_t minmask = vcltq_s32(line, n_min); + uint32x4_t maxmask = vcgtq_s32(line, n_max); + + n_min = vbslq_s32(minmask, line, n_min); + n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx); + n_max = vbslq_s32(maxmask, line, n_max); + n_maxIdx = vbslq_u32(maxmask, 
lineIdxOffset, n_maxIdx); + + // idx[] +=4 + lineIdxOffset = vaddq_u32(lineIdxOffset, c4); + } + + s32 fmin[4], fmax[4]; + u32 fminIdx[4], fmaxIdx[4]; + + vst1q_s32(fmin, n_min); + vst1q_s32(fmax, n_max); + + vst1q_u32(fminIdx, n_minIdx); + vst1q_u32(fmaxIdx, n_maxIdx); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 4; ++j) + { + s32 minval = fmin[j]; + s32 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + s32 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 &minVal, size_t &minCol, size_t &minRow, + s16 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const s16 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 32) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c8 = vdupq_n_u32(8); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (8 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFF8); +#else + { + size_t bound = size.width - (8 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + int16x8_t n_min = vdupq_n_s16(minVal); + uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8); + int16x8_t n_max = vdupq_n_s16(maxVal); + uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8); + + for(; i < bound; i+=8 ) + { + internal::prefetch(src + i); + int16x8_t line = vld1q_s16(src + i); + + uint16x8_t minmask = vcltq_s16(line, n_min); + uint16x8_t maxmask = vcgtq_s16(line, n_max); + + n_min = vbslq_s16(minmask, line, n_min); + uint16x4_t minml = vget_low_u16(minmask); + uint16x4_t minmh = vget_high_u16(minmask); + uint32x4_t minml2 = vmovl_u16(minml); + uint32x4_t minmh2 = vmovl_u16(minmh); + minml2 = vqshlq_n_u32(minml2, 31); + minmh2 = vqshlq_n_u32(minmh2, 31); + n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_s16(maxmask, line, n_max); + uint16x4_t maxml = vget_low_u16(maxmask); + uint16x4_t maxmh = vget_high_u16(maxmask); + uint32x4_t maxml2 = vmovl_u16(maxml); + uint32x4_t maxmh2 = vmovl_u16(maxmh); + maxml2 = vqshlq_n_u32(maxml2, 31); + maxmh2 = vqshlq_n_u32(maxmh2, 31); + n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl); + n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=8 + lineIdxOffset = vaddq_u32(lineIdxOffset, c8); + } + 
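+ // The same 32-bit offset vector was reused for both the low and the high
+ // four lanes of each int16x8_t, so the offsets recorded for the high half
+ // are 4 elements too small; the adjustment below adds that difference back.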
+ // fix high part of indexes + uint32x4_t c4 = vdupq_n_u32((int32_t) 4); + n_minIdxh = vaddq_u32(n_minIdxh, c4); + n_maxIdxh = vaddq_u32(n_maxIdxh, c4); + + s16 fmin[8], fmax[8]; + u32 fminIdx[8], fmaxIdx[8]; + + vst1q_s16(fmin, n_min); + vst1q_s16(fmax, n_max); + vst1q_u32(fminIdx+0, n_minIdxl); + vst1q_u32(fmaxIdx+0, n_maxIdxl); + vst1q_u32(fminIdx+4, n_minIdxh); + vst1q_u32(fmaxIdx+4, n_maxIdxh); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 8; ++j) + { + s16 minval = fmin[j]; + s16 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + short val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 &minVal, size_t &minCol, size_t &minRow, + u16 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const u16 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 32) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c8 = vdupq_n_u32(8); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (8 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFF8); +#else + { + size_t bound = size.width - (8 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + uint16x8_t n_min = vdupq_n_u16(minVal); + uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8); + uint16x8_t n_max = vdupq_n_u16(maxVal); + uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8); + + for(; i < bound; i+=8 ) + { + internal::prefetch(src + i); + uint16x8_t line = vld1q_u16(src + i); + + uint16x8_t minmask = vcltq_u16(line, n_min); + uint16x8_t maxmask = vcgtq_u16(line, n_max); + + n_min = vbslq_u16(minmask, line, n_min); + uint16x4_t minml = vget_low_u16(minmask); + uint16x4_t minmh = vget_high_u16(minmask); + uint32x4_t minml2 = vmovl_u16(minml); + uint32x4_t minmh2 = vmovl_u16(minmh); + minml2 = vqshlq_n_u32(minml2, 31); + minmh2 = vqshlq_n_u32(minmh2, 31); + n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_u16(maxmask, line, n_max); + uint16x4_t maxml = vget_low_u16(maxmask); + uint16x4_t maxmh = vget_high_u16(maxmask); + uint32x4_t maxml2 = vmovl_u16(maxml); + uint32x4_t maxmh2 = vmovl_u16(maxmh); + maxml2 = vqshlq_n_u32(maxml2, 31); + maxmh2 = vqshlq_n_u32(maxmh2, 31); + n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, 
n_maxIdxl); + n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=8 + lineIdxOffset = vaddq_u32(lineIdxOffset, c8); + } + + // fix high part of indexes + uint32x4_t c4 = vdupq_n_u32(4); + n_minIdxh = vaddq_u32(n_minIdxh, c4); + n_maxIdxh = vaddq_u32(n_maxIdxh, c4); + + u16 fmin[8], fmax[8]; + u32 fminIdx[8], fmaxIdx[8]; + + vst1q_u16(fmin, n_min); + vst1q_u16(fmax, n_max); + vst1q_u32(fminIdx+0, n_minIdxl); + vst1q_u32(fmaxIdx+0, n_maxIdxl); + vst1q_u32(fminIdx+4, n_minIdxh); + vst1q_u32(fmaxIdx+4, n_maxIdxh); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 8; ++j) + { + u16 minval = fmin[j]; + u16 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + u16 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +#ifdef CAROTENE_NEON +namespace { + +void minMaxLocBlock(const u8 * src, u32 len, + u8 &minVal, u16 &minIdx, + u8 &maxVal, u16 &maxIdx) +{ + u16 tmp0123[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + + uint8x16_t n_min = vdupq_n_u8(src[0]); + uint16x8_t n_minIdxl = vdupq_n_u16(0); + uint16x8_t n_minIdxh = vdupq_n_u16(0); + uint8x16_t n_max = vdupq_n_u8(src[0]); + uint16x8_t n_maxIdxl = vdupq_n_u16(0); + uint16x8_t n_maxIdxh = vdupq_n_u16(0); + uint16x8_t c16 = vdupq_n_u16(16); + uint16x8_t lineIdxOffset = vld1q_u16(tmp0123); + + s32 i = 0; + s32 bound = len - (16 - 1); + for(; i < bound; i+=16 ) + { + internal::prefetch(src + i); + uint8x16_t line = vld1q_u8(src + i); + + uint8x16_t minmask = vcltq_u8(line, n_min); + uint8x16_t maxmask = vcgtq_u8(line, n_max); + + n_min = vbslq_u8(minmask, line, n_min); + uint8x8_t minml = vget_low_u8(minmask); + uint8x8_t minmh = vget_high_u8(minmask); + uint16x8_t minml2 = vmovl_u8(minml); + uint16x8_t minmh2 = vmovl_u8(minmh); + minml2 = vqshlq_n_u16(minml2, 15); + minmh2 = vqshlq_n_u16(minmh2, 15); + n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_u8(maxmask, line, n_max); + uint8x8_t maxml = vget_low_u8(maxmask); + uint8x8_t maxmh = vget_high_u8(maxmask); + uint16x8_t maxml2 = vmovl_u8(maxml); + uint16x8_t maxmh2 = vmovl_u8(maxmh); + maxml2 = vqshlq_n_u16(maxml2, 15); + maxmh2 = vqshlq_n_u16(maxmh2, 15); + n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl); + n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=16 + lineIdxOffset = vaddq_u16(lineIdxOffset, c16); + } + + // fix high part of indexes + uint16x8_t c8 = vdupq_n_u16(8); + n_minIdxh = vaddq_u16(n_minIdxh, c8); + n_maxIdxh = vaddq_u16(n_maxIdxh, c8); + + u8 fmin[16], fmax[16]; + u16 fminIdx[16], fmaxIdx[16]; + /*{ + uint8x8_t min_low = vget_low_u8(n_min); + uint8x8_t min_high = vget_high_u8(n_min); + uint8x8_t 
max_low = vget_low_u8(n_max); + uint8x8_t max_high = vget_high_u8(n_max); + + uint8x8_t minmask = vclt_u8(min_low, min_high); + uint8x8_t maxmask = vcgt_u8(max_low, max_high); + + uint8x8_t min2 = vbsl_u8(minmask, min_low, min_high); + uint8x8_t max2 = vbsl_u8(maxmask, max_low, max_high); + + uint16x8_t minidxmask = vmovl_u8(minmask); + uint16x8_t maxidxmask = vmovl_u8(maxmask); + minidxmask = vqshlq_n_u16(minidxmask, 15); + maxidxmask = vqshlq_n_u16(maxidxmask, 15); + + uint16x8_t n_minIdx = vbslq_u16(minidxmask, n_minIdxl, n_minIdxh); + uint16x8_t n_maxIdx = vbslq_u16(maxidxmask, n_maxIdxl, n_maxIdxh); + + vst1_u8((uint8_t*)fmin, min2); + vst1_u8((uint8_t*)fmax, max2); + + vst1q_u16((uint16_t*)(fminIdx), n_minIdx); + vst1q_u16((uint16_t*)(fmaxIdx), n_maxIdx); + }*/ + + vst1q_u8(fmin, n_min); + vst1q_u8(fmax, n_max); + vst1q_u16(fminIdx+0, n_minIdxl); + vst1q_u16(fmaxIdx+0, n_maxIdxl); + vst1q_u16(fminIdx+8, n_minIdxh); + vst1q_u16(fmaxIdx+8, n_maxIdxh); + + minIdx = fminIdx[0]; + maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 16; ++j) + { + u8 minval = fmin[j]; + u8 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + + for(; i < (s32)len; ++i ) + { + u8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minIdx = (u16)i; + } + else if( val > maxVal ) + { + maxVal = val; + maxIdx = (u16)i; + } + } +} + +void minMaxLocBlock(const s8 * src, u32 len, + s8 &minVal, u16 &minIdx, + s8 &maxVal, u16 &maxIdx) +{ + u16 tmp0123[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + + int8x16_t n_min = vdupq_n_s8(src[0]); + uint16x8_t n_minIdxl = vdupq_n_u16(0); + uint16x8_t n_minIdxh = vdupq_n_u16(0); + int8x16_t n_max = vdupq_n_s8(src[0]); + uint16x8_t n_maxIdxl = vdupq_n_u16(0); + uint16x8_t n_maxIdxh = vdupq_n_u16(0); + uint16x8_t c16 = vdupq_n_u16(16); + uint16x8_t lineIdxOffset = vld1q_u16(tmp0123); + + s32 i = 0; + s32 bound = len - (16 - 1); + for(; i < bound; i+=16 ) + { + internal::prefetch(src + i); + int8x16_t line = vld1q_s8(src + i); + + uint8x16_t minmask = vcltq_s8(line, n_min); + uint8x16_t maxmask = vcgtq_s8(line, n_max); + + n_min = vbslq_s8(minmask, line, n_min); + uint8x8_t minml = vget_low_u8(minmask); + uint8x8_t minmh = vget_high_u8(minmask); + uint16x8_t minml2 = vmovl_u8(minml); + uint16x8_t minmh2 = vmovl_u8(minmh); + minml2 = vqshlq_n_u16(minml2, 15); + minmh2 = vqshlq_n_u16(minmh2, 15); + n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_s8(maxmask, line, n_max); + uint8x8_t maxml = vget_low_u8(maxmask); + uint8x8_t maxmh = vget_high_u8(maxmask); + uint16x8_t maxml2 = vmovl_u8(maxml); + uint16x8_t maxmh2 = vmovl_u8(maxmh); + maxml2 = vqshlq_n_u16(maxml2, 15); + maxmh2 = vqshlq_n_u16(maxmh2, 15); + n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl); + n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=16 + lineIdxOffset = vaddq_u16(lineIdxOffset, c16); + } + + // fix high part of indexes + uint16x8_t c8 = vdupq_n_u16(8); + n_minIdxh = vaddq_u16(n_minIdxh, c8); + n_maxIdxh = vaddq_u16(n_maxIdxh, c8); + + s8 fmin[16], fmax[16]; + u16 fminIdx[16], fmaxIdx[16]; + + vst1q_s8(fmin, n_min); + vst1q_s8(fmax, n_max); + vst1q_u16(fminIdx+0, n_minIdxl); + vst1q_u16(fmaxIdx+0, n_maxIdxl); + vst1q_u16(fminIdx+8, 
n_minIdxh); + vst1q_u16(fmaxIdx+8, n_maxIdxh); + + minIdx = fminIdx[0]; + maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 16; ++j) + { + s8 minval = fmin[j]; + s8 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + + for(; i < (s32)len; ++i ) + { + s8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minIdx = (u16)i; + } + else if( val > maxVal ) + { + maxVal = val; + maxIdx = (u16)i; + } + } +} + +} // namespace +#endif // CAROTENE_NEON + +#define USHORT_BLOCK_MAX_SIZE (1 << 16) + +void minMaxLoc(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 &minVal, size_t &minCol, size_t &minRow, + u8 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0; l < size.height; ++l) + { + const u8 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width > 128) + { + for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE) + { + u8 locMinVal, locMaxVal; + u16 locMinIdx, locMaxIdx; + size_t tail = size.width - blockStart; + minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE, + locMinVal, locMinIdx, locMaxVal, locMaxIdx); + + if (locMinVal == 0 && locMaxVal == 255) + { + minCol = blockStart + locMinIdx; + maxCol = blockStart + locMaxIdx; + minRow = l; + maxRow = l; + minVal = 0; + maxVal = 255; + return; + } + else + { + if (locMinVal < minVal) + { + minCol = blockStart + locMinIdx; + minRow = l; + minVal = locMinVal; + } + if (locMaxVal > maxVal) + { + maxCol = blockStart + locMaxIdx; + maxRow = l; + maxVal = locMaxVal; + } + } + } + } + else + { + for(size_t i = 0; i < size.width; ++i ) + { + u8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } + + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 &minVal, size_t &minCol, size_t &minRow, + s8 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0; l < size.height; ++l) + { + const s8 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width > 128) + { + for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE) + { + s8 locMinVal, locMaxVal; + u16 locMinIdx, locMaxIdx; + size_t tail = size.width - blockStart; + minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? 
tail : USHORT_BLOCK_MAX_SIZE, + locMinVal, locMinIdx, locMaxVal, locMaxIdx); + + if (locMinVal == -128 && locMaxVal == 127) + { + minCol = blockStart + locMinIdx; + maxCol = blockStart + locMaxIdx; + minRow = l; + maxRow = l; + minVal = -128; + maxVal = 127; + return; + } + else + { + if (locMinVal < minVal) + { + minCol = blockStart + locMinIdx; + minRow = l; + minVal = locMinVal; + } + if (locMaxVal > maxVal) + { + maxCol = blockStart + locMaxIdx; + maxRow = l; + maxVal = locMaxVal; + } + } + } + } + else + { + for(size_t i = 0; i < size.width; ++i ) + { + s8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minRow = l; + minCol = i; + } + else if( val > maxVal ) + { + maxVal = val; + maxRow = l; + maxCol = i; + } + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/morph.cpp b/3rdparty/carotene/src/morph.cpp new file mode 100644 index 0000000000..bcc6aa7e06 --- /dev/null +++ b/3rdparty/carotene/src/morph.cpp @@ -0,0 +1,728 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" + +#include +#include +#include +#include + +namespace CAROTENE_NS { + +bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 16 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE); +} + +#ifdef CAROTENE_NEON + +namespace { + +struct ErodeVecOp +{ + ErodeVecOp():borderValue(0){} + + ErodeVecOp(BORDER_MODE border, u8 borderValue_) : + borderValue(borderValue_) + { + if (border == BORDER_MODE_REPLICATE) + borderValue = std::numeric_limits::max(); + } + + inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const + { + return vminq_u8(a, b); + } + + inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const + { + return vmin_u8(a, b); + } + + inline u8 operator()(u8 a, u8 b) const + { + return std::min(a, b); + } + + u8 borderValue; +}; + +struct DilateVecOp +{ + DilateVecOp():borderValue(0){} + + DilateVecOp(BORDER_MODE border, u8 borderValue_) : + borderValue(borderValue_) + { + if (border == BORDER_MODE_REPLICATE) + borderValue = std::numeric_limits::min(); + } + + inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const + { + return vmaxq_u8(a, b); + } + + inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const + { + return vmax_u8(a, b); + } + + inline u8 operator()(u8 a, u8 b) const + { + return std::max(a, b); + } + + u8 borderValue; +}; + +template +void morph3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, const VecOp & vop) +{ + u8 borderValue = vop.borderValue; + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + + const uint8x16_t v_zero = vdupq_n_u8(0); + const uint8x16_t v_border = vdupq_n_u8(borderValue); + + uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero; + uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + u8 prevx = 0, currx = 0, nextx = 0; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16); + + // perform vertical convolution + for ( ; x <= bwidth; x += 16) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x); + uint8x16_t x1 = vld1q_u8(srow1 + x); + uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 16 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx = borderValue; + else + prevx = vop(srow1[x4], + vop(srow2 ? srow2[x4] : borderValue, + srow0 ? srow0[x4] : borderValue)); + + currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? 
srow0[x3] : borderValue)); + } + + // make shift + if (x) + { + tprev = tcurr; + tcurr = tnext; + } + + // and calculate next value + tnext = vop(vop(x0, x1), x2); + + // make extrapolation for the first elements + if (!x) + { + // make border + if (border == BORDER_MODE_CONSTANT) + tcurr = v_border; + else if (border == BORDER_MODE_REPLICATE) + tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0)); + + continue; + } + + // combine 3 "shifted" vectors + t0 = vextq_u8(tprev, tcurr, 15); + t1 = tcurr; + t2 = vextq_u8(tcurr, tnext, 1); + + // and add them + t0 = vop(t0, vop(t1, t2)); + + vst1q_u8(drow + x - 16, t0); + } + + x -= 16; + if (x == width) + --x; + + for ( ; x < width; ++x) + { + // make extrapolation for the last elements + if (x + 1 >= width) + { + if (border == BORDER_MODE_CONSTANT) + nextx = borderValue; + else if (border == BORDER_MODE_REPLICATE) + nextx = vop(srow2[x], vop(srow1[x], srow0[x])); + } + else + nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue, + srow0 ? srow0[x + 1] : borderValue), + srow1[x + 1]); + + drow[x] = vop(prevx, vop(currx, nextx)); + + // make shift + prevx = currx; + currx = nextx; + } + } +} + +} // namespace + +#endif + +void erode3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isMorph3x3Supported(size, border)); +#ifdef CAROTENE_NEON + morph3x3(size, + srcBase, srcStride, + dstBase, dstStride, + border, ErodeVecOp(border, borderValue)); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +void dilate3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isMorph3x3Supported(size, border)); +#ifdef CAROTENE_NEON + morph3x3(size, + srcBase, srcStride, + dstBase, dstStride, + border, DilateVecOp(border, borderValue)); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +#ifdef CAROTENE_NEON +namespace { + +template +void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize) +{ + size_t i, j, k; + size_t width16 = (width & -16) * cn; + size_t width8 = (width & -8) * cn; + width *= cn; + + if (ksize == 1) + { + for (i = 0; i < width; i++) + dst[i] = src[i]; + return; + } + + ksize = ksize*cn; + VecUpdate updateOp; + switch(cn) + { + case 1: + for (i = 0; i < width16; i += 16) + { + const u8* sptr = src + i; + uint8x16_t s = vld1q_u8(sptr); + internal::prefetch(sptr); + + for( k = 1; k < ksize; ++k) + s = updateOp(s, vld1q_u8(sptr + k)); + + vst1q_u8(dst + i, s); + } + + for (; i < width8; i += 8) + { + const u8* sptr = src + i; + uint8x8_t s = vld1_u8(sptr); + internal::prefetch(sptr); + + for( k = 1; k < ksize; ++k) + s = updateOp(s, vld1_u8(sptr + k)); + + vst1_u8(dst + i, s); + } + break; + default: + for (i = 0; i < width16; i += 16) + { + uint8x16_t s = vld1q_u8(src + i); + internal::prefetch(src + i); + + for (k = cn; k < ksize; k += cn) + s = updateOp(s, vld1q_u8(src + i + k)); + + vst1q_u8(dst + i, s); + } + + for (; i < width8; i += 8) + { + uint8x8_t s = vld1_u8(src + i); + internal::prefetch(src + i); + + for (k = cn; k < ksize; k += cn) + s = updateOp(s, vld1_u8(src + i + k)); + + vst1_u8(dst + i, s); + } + break; + } + + ptrdiff_t i0 = i; + for( k = 0; k < (size_t)cn; k++, src++, dst++ ) + { 
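+        // Scalar tail for the trailing columns the vector loops above did not
+        // cover. Each pass of the first loop below produces two outputs at once:
+        // dst[i] and dst[i+cn] share the reduction over the overlapping samples
+        // s[cn], s[2*cn], ..., s[ksize-cn], so the inner j-loop runs once for
+        // both results and only the end points differ (s[0] for dst[i],
+        // s[ksize] for dst[i+cn]).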
+ for( i = i0; i <= width - cn*2; i += cn*2 ) + { + const u8* s = src + i; + u8 m = s[cn]; + for( j = cn*2; j < ksize; j += cn ) + m = updateOp(m, s[j]); + dst[i] = updateOp(m, s[0]); + dst[i+cn] = updateOp(m, s[j]); + } + + for( ; i < width; i += cn ) + { + const u8* s = src + i; + u8 m = s[0]; + for( j = cn; j < ksize; j += cn ) + m = updateOp(m, s[j]); + dst[i] = m; + } + } +} + +template +void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize) +{ + size_t i, k; + size_t width32 = width & -32; + VecUpdate updateOp; + + uint8x16_t x0,x1,s0,s1; + if (ksize == 3) + { + for (; count > 1; count -= 2, dst += dststep * 2, src += 2) + { + for (i = 0; i < width32; i += 32) + { + const u8* sptr = src[1] + i; + s0 = vld1q_u8(sptr); + s1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + sptr = src[2] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + s0 = updateOp(s0, x0); + s1 = updateOp(s1, x1); + + sptr = src[0] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + vst1q_u8(dst+i, updateOp(s0, x0)); + vst1q_u8(dst+i+16, updateOp(s1, x1)); + + sptr = src[3] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + vst1q_u8(dst + dststep + i, updateOp(s0, x0)); + vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1)); + + } + for(; i < width; i++ ) + { + u8 s = src[1][i]; + + for( k = 2; k < ksize; k++ ) + s = updateOp(s, src[k][i]); + + dst[i] = updateOp(s, src[0][i]); + dst[i+dststep] = updateOp(s, src[k][i]); + } + } + } + else if (ksize > 1) + for (; count > 1; count -= 2, dst += dststep*2, src += 2) + { + for (i = 0; i < width32; i += 32) + { + const u8* sptr = src[1] + i; + s0 = vld1q_u8(sptr); + s1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + for (k = 2; k < ksize; k++) + { + sptr = src[k] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + s0 = updateOp(s0, x0); + s1 = updateOp(s1, x1); + } + + sptr = src[0] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + vst1q_u8(dst+i, updateOp(s0, x0)); + vst1q_u8(dst+i+16, updateOp(s1, x1)); + + sptr = src[k] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + vst1q_u8(dst + dststep + i, updateOp(s0, x0)); + vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1)); + } + for(; i < width; i++ ) + { + u8 s = src[1][i]; + + for( k = 2; k < ksize; k++ ) + s = updateOp(s, src[k][i]); + + dst[i] = updateOp(s, src[0][i]); + dst[i+dststep] = updateOp(s, src[k][i]); + } + } + + for (; count > 0; count--, dst += dststep, src++) + { + for (i = 0; i < width32; i += 32) + { + const u8* sptr = src[0] + i; + s0 = vld1q_u8(sptr); + s1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + for (k = 1; k < ksize; k++) + { + sptr = src[k] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + s0 = updateOp(s0, x0); + s1 = updateOp(s1, x1); + } + + vst1q_u8(dst + i, s0); + vst1q_u8(dst + i + 16, s1); + } + for(; i < width; i++ ) + { + u8 s = src[0][i]; + for( k = 1; k < ksize; k++ ) + s = updateOp(s, src[k][i]); + dst[i] = s; + } + } +} + +template +inline void morphology(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin) +{ + //Temporary buffers common for all 
iterations + std::vector _srcRow(cn*(ssize.width + ksize.width - 1)); + u8* srcRow = &_srcRow[0]; + + size_t bufRows = std::max(ksize.height + 3, std::max(anchorY, ksize.height-anchorY-1)*2+1); + std::vector _rows(bufRows); + u8** rows = &_rows[0]; + + // adjust swidthcn so that the used part of buffers stays compact in memory + ptrdiff_t swidthcn = cn*((ssize.width + 15) & -16);// cn * (aligned ssize.width size) + std::vector _ringBuf(swidthcn*bufRows+16); + u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16); + + size_t borderLength = std::max(ksize.width - 1, 1) * cn; + std::vector _borderTab(borderLength); + ptrdiff_t * borderTab = &_borderTab[0]; + + std::vector _constBorderValue; + std::vector _constBorderRow; + u8 * constBorderValue = NULL; + u8 * constBorderRow = NULL; + if( rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT ) + { + _constBorderValue.resize(borderLength); + constBorderValue = &_constBorderValue[0]; + size_t i; + for(i = 0; i < cn; i++) + constBorderValue[i] = borderValues[i]; + for(; i < borderLength; i++) + constBorderValue[i] = constBorderValue[i-cn]; + + if( columnBorderType == BORDER_MODE_CONSTANT ) + { + _constBorderRow.resize(cn*(ssize.width + ksize.width - 1 + 16)); + constBorderRow = internal::alignPtr(&_constBorderRow[0], 16); + size_t N = (ssize.width + ksize.width - 1)*cn; + for( i = 0; i < N; i += borderLength ) + { + size_t n = std::min( borderLength, N - i ); + for(size_t j = 0; j < n; j++) + srcRow[i+j] = constBorderValue[j]; + } + MorphRow(srcRow, constBorderRow, ssize.width, cn, ksize.width); + } + } + + Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right, + ssize.height + borderMargin.top + borderMargin.bottom); + + ptrdiff_t dx1 = std::max(anchorX - (ptrdiff_t)borderMargin.left, 0); + ptrdiff_t dx2 = std::max((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0); + // recompute border tables + if( dx1 > 0 || dx2 > 0 ) + { + if( rowBorderType == BORDER_MODE_CONSTANT ) + { + memcpy( srcRow, &constBorderValue[0], dx1*cn ); + memcpy( srcRow + (ssize.width + ksize.width - 1 - dx2)*cn, &constBorderValue[0], dx2*cn ); + } + else + { + ptrdiff_t xofs1 = std::min(borderMargin.left, anchorX) - borderMargin.left; + + ptrdiff_t wholeWidth = wholeSize.width; + + ptrdiff_t i, j; + for( i = 0; i < dx1; i++ ) + { + ptrdiff_t p0 = (internal::borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*cn; + for( j = 0; j < (ptrdiff_t)cn; j++ ) + borderTab[i*cn + j] = p0 + j; + } + + for( i = 0; i < dx2; i++ ) + { + ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*cn; + for( j = 0; j < (ptrdiff_t)cn; j++ ) + borderTab[(i + dx1)*cn + j] = p0 + j; + } + } + } + + ptrdiff_t startY, startY0, endY, rowCount; + startY = startY0 = std::max(borderMargin.top - anchorY, 0); + endY = std::min(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height); + + const u8* src = srcBase + (startY - borderMargin.top)*srcStride; + u8* dst = dstBase; + + ptrdiff_t width = ssize.width, kwidth = ksize.width; + ptrdiff_t kheight = ksize.height, ay = anchorY; + ptrdiff_t width1 = ssize.width + kwidth - 1; + ptrdiff_t xofs1 = std::min(borderMargin.left, anchorX); + bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT; + ptrdiff_t dy = 0, i = 0; + + src -= xofs1*cn; + ptrdiff_t count = endY - startY; + + rowCount = 0; + for(;; dst += dstStride*i, dy += i) + { + ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top; 
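+        // dcount is, roughly, how many fresh input rows must be pushed through
+        // the row filter into the ring buffer before the next batch of output
+        // rows can be completed: on the first pass it fills the buffer up to
+        // the kernel anchor, and in the steady state it tops the buffer up by
+        // bufRows - kheight + 1 rows per iteration.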
+ dcount = dcount > 0 ? dcount : bufRows - kheight + 1; + dcount = std::min(dcount, count); + count -= dcount; + for( ; dcount-- > 0; src += srcStride ) + { + ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows; + u8* brow = ringBuf + bi*swidthcn; + + if( (size_t)(++rowCount) > bufRows ) + { + --rowCount; + ++startY; + } + + memcpy( srcRow + dx1*cn, src, (width1 - dx2 - dx1)*cn ); + + if( makeBorder ) + { + for( i = 0; i < (ptrdiff_t)(dx1*cn); i++ ) + srcRow[i] = src[borderTab[i]]; + for( i = 0; i < (ptrdiff_t)(dx2*cn); i++ ) + srcRow[i + (width1 - dx2)*cn] = src[borderTab[i+dx1*cn]]; + } + + MorphRow(srcRow, brow, width, cn, ksize.width); + } + + ptrdiff_t max_i = std::min(bufRows, ssize.height - dy + (kheight - 1)); + for( i = 0; i < max_i; i++ ) + { + ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay, + wholeSize.height, columnBorderType); + if( srcY < 0 ) // can happen only with constant border type + rows[i] = constBorderRow; + else + { + if( srcY >= startY + rowCount ) + break; + ptrdiff_t bi = (srcY - startY0) % bufRows; + rows[i] = ringBuf + bi*swidthcn; + } + } + if( i < kheight ) + break; + i -= kheight - 1; + MorphColumn((const u8**)rows, dst, dstStride, i, ssize.width*cn, ksize.height); + } +} + +} // namespace +#endif // CAROTENE_NEON + +void erode(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin) +{ + internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 && + anchorX < ksize.width && anchorY < ksize.height); +#ifdef CAROTENE_NEON + morphology(ssize, cn, srcBase, srcStride, dstBase, dstStride, + ksize, anchorX, anchorY, rowBorderType, columnBorderType, + borderValues, borderMargin); +#else + (void)cn; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)rowBorderType; + (void)columnBorderType; + (void)borderValues; + (void)borderMargin; +#endif +} + +void dilate(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin) +{ + internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 && + anchorX < ksize.width && anchorY < ksize.height); +#ifdef CAROTENE_NEON + morphology(ssize, cn, srcBase, srcStride, dstBase, dstStride, + ksize, anchorX, anchorY, rowBorderType, columnBorderType, + borderValues, borderMargin); +#else + (void)cn; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)rowBorderType; + (void)columnBorderType; + (void)borderValues; + (void)borderMargin; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/mul.cpp b/3rdparty/carotene/src/mul.cpp new file mode 100644 index 0000000000..3bbbfc50aa --- /dev/null +++ b/3rdparty/carotene/src/mul.cpp @@ -0,0 +1,1572 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2016, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include <cfloat>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <limits>
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+bool isIntegerScale(f32 scale)
+{
+    return std::fabs(scale - static_cast<s32>(scale)) < FLT_EPSILON;
+}
+
+template <int shift>
+void mulu8(const Size2D &size,
+           const u8 * src0Base, ptrdiff_t src0Stride,
+           const u8 * src1Base, ptrdiff_t src1Stride,
+           u8 * dstBase, ptrdiff_t dstStride,
+           CONVERT_POLICY cpolicy)
+{
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ?
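+                      // size.width is unsigned, so a bare "size.width - 7" (or
+                      // "- 15" above) would wrap around for very narrow images;
+                      // the guard clamps the vectorized bound to 0 and leaves
+                      // all of the work to the scalar tail loop instead.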
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0), vqmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1_u8(dst + j, vqmovn_u16(vshrq_n_u16(v_dst, shift))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val >> shift); + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1_u8(dst + j, vmovn_u16(vshrq_n_u16(v_dst, shift))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (u8)(val >> shift); + } + } + } +} + +template +void muls16(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + uint16x8_t v_32767 = vdupq_n_u16(0x7FFF); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst0))); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + v_dst = vshrq_n_u16(v_dst, shift); + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val >> shift); + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst0)); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(v_dst1)); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + v_dst = vshrq_n_u16(v_dst, shift); + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst)); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (s16)(val >> shift); + } + } + } +} + +typedef void (* mulFuncu8)(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy); + +typedef void (* mulFuncs16)(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy); + +} // namespace + +#endif + +void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + if ((scale * UCHAR_MAX * UCHAR_MAX) < 1.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + u8 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(u8) * size.width); + } + return; + } + + s32 iscale = static_cast(scale), exp = 0; + f32 significand = frexp(scale, &exp); + bool is_integer_scale = isIntegerScale(scale), + is_power_of_2 = (significand == 0.5f) && (exp <= 0); + exp = -exp + 1; + + if (is_power_of_2) + { + static const mulFuncu8 funcs[16] = + { + NULL, + mulu8<1>, + mulu8<2>, + mulu8<3>, + mulu8<4>, + mulu8<5>, + mulu8<6>, + mulu8<7>, + mulu8<8>, + mulu8<9>, + mulu8<10>, + mulu8<11>, + mulu8<12>, + mulu8<13>, + mulu8<14>, + mulu8<15> + }; + + 
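+        // frexp() above decomposed scale as significand * 2^e with significand
+        // in [0.5, 1), so scale is an exact power of two not above 0.5 precisely
+        // when significand == 0.5 and e <= 0. Multiplying by such a scale is a
+        // right shift by 1 - e, the value stored back into exp: e.g.
+        // scale = 0.25f gives frexp -> (0.5, -1), hence funcs[2], a mulu8<2>
+        // kernel that fuses the two-bit shift into the vector loop.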
mulFuncu8 func = funcs[exp]; + + func(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + cpolicy); + + return; + } + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0), vqmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + vst1_u8(dst + j, vqmovn_u16(vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + float32x4_t v_dst2f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst3f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + uint16x8_t v_dst0u = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst0f)), + vqmovn_u32(vcvtq_u32_f32(v_dst1f))); + uint16x8_t v_dst1u = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst2f)), + vqmovn_u32(vcvtq_u32_f32(v_dst3f))); + vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0u), vqmovn_u16(v_dst1u))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + uint16x8_t v_dstu = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst0f)), + vqmovn_u32(vcvtq_u32_f32(v_dst1f))); + vst1_u8(dst + j, vqmovn_u16(v_dstu)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = internal::saturate_cast((s32)trunc(fval)); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = 
vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + vst1_u8(dst + j, vmovn_u16(vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (u8)(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + float32x4_t v_dst2f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst3f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + uint16x8_t v_dst0u = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + uint16x8_t v_dst1u = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst2f)), + vmovn_u32(vcvtq_u32_f32(v_dst3f))); + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0u), vmovn_u16(v_dst1u))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + uint16x8_t v_dstu = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + vst1_u8(dst + j, vmovn_u16(v_dstu)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (u8)(s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (((scale * UCHAR_MAX * UCHAR_MAX) < 1.0f) && (scale >= 0)) + { + for (size_t y = 0; y < size.height; ++y) + { + s16 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(s16) * size.width); + } + return; + } + + s32 iscale = static_cast(scale), exp = 0; + f32 significand = frexp(scale, &exp); + bool is_integer_scale = isIntegerScale(scale), + is_power_of_2 = (significand == 0.5f) && (exp <= 0); + exp = -exp + 1; + + if (is_power_of_2) + { + static const mulFuncs16 funcs[16] = + { + NULL, + 
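+            // slot 0 (shift == 0, i.e. scale == 1) is never reached: frexp(1.0f)
+            // yields exponent 1, which fails the exp <= 0 power-of-two test, so
+            // scale == 1 goes down the plain integer path instead; and scales
+            // below 2^-15 never get here either, having already hit the
+            // memset-to-zero early out (scale * 255 * 255 < 1) above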
muls16<1>, + muls16<2>, + muls16<3>, + muls16<4>, + muls16<5>, + muls16<6>, + muls16<7>, + muls16<8>, + muls16<9>, + muls16<10>, + muls16<11>, + muls16<12>, + muls16<13>, + muls16<14>, + muls16<15> + }; + + mulFuncs16 func = funcs[exp]; + + func(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + cpolicy); + + return; + } + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint16x8_t v_32767 = vdupq_n_u16(0x7FFF); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst0))); + vst1q_s16(dst + j +8, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = 
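+                             // the scaled float product is truncated toward
+                             // zero first and only then saturated, so e.g. with
+                             // scale = 1.5f, 200 * 200 -> 60000.0 truncates to
+                             // 60000 and saturates to the s16 maximum 32767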
internal::saturate_cast((s32)trunc(fval)); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst0)); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(v_dst1)); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst)); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (s16)(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (s16)(s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + if (scale == 0.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + s16 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(s16) * size.width); + } + return; + } + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + bool is_integer_scale = isIntegerScale(scale); + s32 iscale = static_cast(scale); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + int16x8_t v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vqmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j, v_dst); + + v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vqmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j + 8, v_dst); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vld1q_u8(src0 + j)))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0), vget_low_s16(v_src1))), + vqmovn_s32(vmull_s16(vget_high_s16(v_src0), vget_high_s16(v_src1)))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + s32 val = (s32)src0[j] * (s32)src1[j]; + dst[j] = internal::saturate_cast(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + int16x8_t v_src1 = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * 
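+                             // (the integer path above widens the u8 operand
+                             // with vmovl_u8 and bit-reinterprets the result as
+                             // s16; that is lossless because the widened lanes
+                             // are at most 255, comfortably inside s16, after
+                             // which the signed vmull_s16 / vqmovn_s32 pair
+                             // applies)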
(f32)src1[j] * scale; + dst[j] = internal::saturate_cast((s32)trunc(fval)); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + int16x8_t v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j, v_dst); + + v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j + 8, v_dst); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vld1q_u8(src0 + j)))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0), vget_low_s16(v_src1))), + vmovn_s32(vmull_s16(vget_high_s16(v_src0), vget_high_s16(v_src1)))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + s32 val = (s32)src0[j] * (s32)src1[j]; + dst[j] = (s16)(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + int16x8_t v_src1 = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (s16)(s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +namespace { + +#ifdef CAROTENE_NEON + +template +inline T mulSaturateQ(const 
T &v1, const T &v2, const float scale)
+{
+    return internal::vcombine(internal::vqmovn(mulSaturateQ(internal::vmovl(internal::vget_low(v1)),
+                                                            internal::vmovl(internal::vget_low(v2)), scale)),
+                              internal::vqmovn(mulSaturateQ(internal::vmovl(internal::vget_high(v1)),
+                                                            internal::vmovl(internal::vget_high(v2)), scale))
+                             );
+}
+template <>
+inline int32x4_t mulSaturateQ(const int32x4_t &v1, const int32x4_t &v2, const float scale)
+{ return vcvtq_s32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_s32(v1), vcvtq_f32_s32(v2)), scale)); }
+template <>
+inline uint32x4_t mulSaturateQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
+{ return vcvtq_u32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(v1), vcvtq_f32_u32(v2)), scale)); }
+
+template <typename T>
+inline T mulSaturate(const T &v1, const T &v2, const float scale)
+{
+    return internal::vqmovn(mulSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
+}
+template <>
+inline int32x2_t mulSaturate(const int32x2_t &v1, const int32x2_t &v2, const float scale)
+{ return vcvt_s32_f32(vmul_n_f32(vmul_f32(vcvt_f32_s32(v1), vcvt_f32_s32(v2)), scale)); }
+template <>
+inline uint32x2_t mulSaturate(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
+{ return vcvt_u32_f32(vmul_n_f32(vmul_f32(vcvt_f32_u32(v1), vcvt_f32_u32(v2)), scale)); }
+
+
+template <typename T>
+inline T mulWrapQ(const T &v1, const T &v2, const float scale)
+{
+    return internal::vcombine(internal::vmovn(mulWrapQ(internal::vmovl(internal::vget_low(v1)),
+                                                       internal::vmovl(internal::vget_low(v2)), scale)),
+                              internal::vmovn(mulWrapQ(internal::vmovl(internal::vget_high(v1)),
+                                                       internal::vmovl(internal::vget_high(v2)), scale))
+                             );
+}
+template <>
+inline int32x4_t mulWrapQ(const int32x4_t &v1, const int32x4_t &v2, const float scale)
+{ return vcvtq_s32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_s32(v1), vcvtq_f32_s32(v2)), scale)); }
+template <>
+inline uint32x4_t mulWrapQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
+{ return vcvtq_u32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(v1), vcvtq_f32_u32(v2)), scale)); }
+
+template <typename T>
+inline T mulWrap(const T &v1, const T &v2, const float scale)
+{
+    return internal::vmovn(mulWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
+}
+template <>
+inline int32x2_t mulWrap(const int32x2_t &v1, const int32x2_t &v2, const float scale)
+{ return vcvt_s32_f32(vmul_n_f32(vmul_f32(vcvt_f32_s32(v1), vcvt_f32_s32(v2)), scale)); }
+template <>
+inline uint32x2_t mulWrap(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
+{ return vcvt_u32_f32(vmul_n_f32(vmul_f32(vcvt_f32_u32(v1), vcvt_f32_u32(v2)), scale)); }
+
+
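+// NEON shift intrinsics such as vshrq_n_u16() require the shift amount to be a
+// compile-time immediate, so it cannot be passed as an ordinary run-time
+// argument. The thin template <int n> wrappers below lift the immediate into a
+// template parameter, which lets mulShift stay generic over the element type
+// and the shift while each instantiation still resolves to a single
+// shift-by-immediate instruction.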
+template <int n> inline uint8x16_t vshrq_n(const uint8x16_t & v0) { return vshrq_n_u8 (v0, n); }
+template <int n> inline int8x16_t  vshrq_n(const int8x16_t & v0)  { return vshrq_n_s8 (v0, n); }
+template <int n> inline uint16x8_t vshrq_n(const uint16x8_t & v0) { return vshrq_n_u16(v0, n); }
+template <int n> inline int16x8_t  vshrq_n(const int16x8_t & v0)  { return vshrq_n_s16(v0, n); }
+template <int n> inline uint32x4_t vshrq_n(const uint32x4_t & v0) { return vshrq_n_u32(v0, n); }
+template <int n> inline int32x4_t  vshrq_n(const int32x4_t & v0)  { return vshrq_n_s32(v0, n); }
+template <int n> inline uint64x2_t vshrq_n(const uint64x2_t & v0) { return vshrq_n_u64(v0, n); }
+template <int n> inline int64x2_t  vshrq_n(const int64x2_t & v0)  { return vshrq_n_s64(v0, n); }
+
+template <int n> inline uint8x8_t  vshr_n(const uint8x8_t & v0)  { return vshr_n_u8 (v0, n); }
+template <int n> inline int8x8_t   vshr_n(const int8x8_t & v0)   { return vshr_n_s8 (v0, n); }
+template <int n> inline uint16x4_t vshr_n(const uint16x4_t & v0) { return vshr_n_u16(v0, n); }
+template <int n> inline int16x4_t  vshr_n(const int16x4_t & v0)  { return vshr_n_s16(v0, n); }
+template <int n> inline uint32x2_t vshr_n(const uint32x2_t & v0) { return vshr_n_u32(v0, n); }
+template <int n> inline int32x2_t  vshr_n(const int32x2_t & v0)  { return vshr_n_s32(v0, n); }
+template <int n> inline uint64x1_t vshr_n(const uint64x1_t & v0) { return vshr_n_u64(v0, n); }
+template <int n> inline int64x1_t  vshr_n(const int64x1_t & v0)  { return vshr_n_s64(v0, n); }
+
+template <int n> inline uint8x16_t vrshrq_n(const uint8x16_t & v0) { return vrshrq_n_u8 (v0, n); }
+template <int n> inline int8x16_t  vrshrq_n(const int8x16_t & v0)  { return vrshrq_n_s8 (v0, n); }
+template <int n> inline uint16x8_t vrshrq_n(const uint16x8_t & v0) { return vrshrq_n_u16(v0, n); }
+template <int n> inline int16x8_t  vrshrq_n(const int16x8_t & v0)  { return vrshrq_n_s16(v0, n); }
+template <int n> inline uint32x4_t vrshrq_n(const uint32x4_t & v0) { return vrshrq_n_u32(v0, n); }
+template <int n> inline int32x4_t  vrshrq_n(const int32x4_t & v0)  { return vrshrq_n_s32(v0, n); }
+template <int n> inline uint64x2_t vrshrq_n(const uint64x2_t & v0) { return vrshrq_n_u64(v0, n); }
+template <int n> inline int64x2_t  vrshrq_n(const int64x2_t & v0)  { return vrshrq_n_s64(v0, n); }
+
+template <int n> inline uint8x8_t  vrshr_n(const uint8x8_t & v0)  { return vrshr_n_u8 (v0, n); }
+template <int n> inline int8x8_t   vrshr_n(const int8x8_t & v0)   { return vrshr_n_s8 (v0, n); }
+template <int n> inline uint16x4_t vrshr_n(const uint16x4_t & v0) { return vrshr_n_u16(v0, n); }
+template <int n> inline int16x4_t  vrshr_n(const int16x4_t & v0)  { return vrshr_n_s16(v0, n); }
+template <int n> inline uint32x2_t vrshr_n(const uint32x2_t & v0) { return vrshr_n_u32(v0, n); }
+template <int n> inline int32x2_t  vrshr_n(const int32x2_t & v0)  { return vrshr_n_s32(v0, n); }
+template <int n> inline uint64x1_t vrshr_n(const uint64x1_t & v0) { return vrshr_n_u64(v0, n); }
+template <int n> inline int64x1_t  vrshr_n(const int64x1_t & v0)  { return vrshr_n_s64(v0, n); }
+
+template <typename T, typename WT, int shift>
+void mulShift(const Size2D &size,
+              const T * src0Base, ptrdiff_t src0Stride,
+              const T * src1Base, ptrdiff_t src1Stride,
+              T * dstBase, ptrdiff_t dstStride,
+              CONVERT_POLICY cpolicy)
+{
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+    typedef typename internal::VecTraits<WT>::vec128 wvec128;
+    typedef typename internal::VecTraits<T>::vec64 vec64;
+    const size_t step128 = 16 / sizeof(T);
+    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
+    const size_t step64 = 8 / sizeof(T);
+    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
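+    // The mask / bic / subtract sequence below amounts to round-half-to-even:
+    // vbicq(v_mask, v) extracts bit <shift> of ~v, i.e. it is non-zero exactly
+    // when the truncated result would be even; that bit is shifted down and
+    // subtracted from the product before vrshrq_n adds 2^(shift-1) and shifts.
+    // Ties therefore round down onto even results and up from odd ones: with
+    // shift == 1, both 5 (2.5) and 7 (3.5) end up at the even values 2 and 4.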
+
+    wvec128 v_mask = internal::vdupq_n((WT)(1<<shift));
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
+        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        T * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (cpolicy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw128; j += step128)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j);
+                wvec128 v_mul0 = internal::vmull( internal::vget_low(v_src0),  internal::vget_low(v_src1));
+                wvec128 v_mul1 = internal::vmull(internal::vget_high(v_src0), internal::vget_high(v_src1));
+
+                vec64 v_res0 = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul0, vshrq_n<shift>(internal::vbicq(v_mask, v_mul0)) )));
+                vec64 v_res1 = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul1, vshrq_n<shift>(internal::vbicq(v_mask, v_mul1)) )));
+
+                internal::vst1q(dst + j, internal::vcombine(v_res0, v_res1));
+            }
+            for (; j < roiw64; j += step64)
+            {
+                wvec128 v_mul = internal::vmull(internal::vld1(src0 + j), internal::vld1(src1 + j));
+                vec64 v_res = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul, vshrq_n<shift>(internal::vbicq(v_mask, v_mul)) )));
+                internal::vst1(dst + j, v_res);
+            }
+
+            for (; j < size.width; j++)
+            {
+                WT val = (WT)src0[j] * (WT)src1[j];
+                dst[j] = internal::saturate_cast<T>((val - (((1<<shift) & ~val) >> shift) + (1<<(shift-1))) >> shift);
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            for (; j < roiw128; j += step128)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j);
+                wvec128 v_mul0 = internal::vmull( internal::vget_low(v_src0),  internal::vget_low(v_src1));
+                wvec128 v_mul1 = internal::vmull(internal::vget_high(v_src0), internal::vget_high(v_src1));
+
+                vec64 v_res0 = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul0, vshrq_n<shift>(internal::vbicq(v_mask, v_mul0)) )));
+                vec64 v_res1 = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul1, vshrq_n<shift>(internal::vbicq(v_mask, v_mul1)) )));
+
+                internal::vst1q(dst + j, internal::vcombine(v_res0, v_res1));
+            }
+            for (; j < roiw64; j += step64)
+            {
+                wvec128 v_mul = internal::vmull(internal::vld1(src0 + j), internal::vld1(src1 + j));
+                vec64 v_res = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul, vshrq_n<shift>(internal::vbicq(v_mask, v_mul)) )));
+                internal::vst1(dst + j, v_res);
+            }
+
+            for (; j < size.width; j++)
+            {
+                WT val = (WT)src0[j] * (WT)src1[j];
+                dst[j] = (T)((val - (((1<<shift) & ~val) >> shift) + (1<<(shift-1))) >> shift);
+            }
+        }
+    }
+}
+#endif
+
+template <typename T, typename WT>
+void mul(const Size2D &size,
+         const T * src0Base, ptrdiff_t src0Stride,
+         const T * src1Base, ptrdiff_t src1Stride,
+         T * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+
+    typedef void (* mulFunc)(const Size2D &size,
+                             const T * src0Base, ptrdiff_t src0Stride,
+                             const T * src1Base, ptrdiff_t src1Stride,
+                             T * dstBase, ptrdiff_t dstStride,
+                             CONVERT_POLICY cpolicy);
+
+    if (scale == 0.0f ||
+        (std::numeric_limits<T>::is_integer &&
+         (scale * std::numeric_limits<T>::max() * std::numeric_limits<T>::max()) < 1.0f &&
+         (scale * std::numeric_limits<T>::max() * std::numeric_limits<T>::max()) > -1.0f))
+    {
+        for (size_t y = 0; y < size.height; ++y)
+        {
+            T * dst = internal::getRowPtr(dstBase, dstStride, y);
+            std::memset(dst, 0, sizeof(T) * size.width);
+        }
+        return;
+    }
+
+    s32 iscale = static_cast<s32>(scale), exp = 0;
+    f32 significand = frexp(scale, &exp);
+    bool is_integer_scale = isIntegerScale(scale),
+         is_power_of_2 = (significand == 0.5f) && (exp <= 0);
+    exp = -exp + 1;
+
+    if (is_power_of_2)
+    {
+        static const mulFunc funcs[16] =
+        {
+            NULL,
+            mulShift<T, WT, 1>,
+            mulShift<T, WT, 2>,
+            mulShift<T, WT, 3>,
+            mulShift<T, WT, 4>,
+            mulShift<T, WT, 5>,
+            mulShift<T, WT, 6>,
+            mulShift<T, WT, 7>,
+            mulShift<T, WT, 8>,
+            mulShift<T, WT, 9>,
+            mulShift<T, WT, 10>,
+            mulShift<T, WT, 11>,
+            mulShift<T, WT, 12>,
+            mulShift<T, WT, 13>,
+            mulShift<T, WT, 14>,
+            mulShift<T, WT, 15>
+        };
+
+        mulFunc func = funcs[exp];
+
+        func(size,
+             src0Base, src0Stride,
+             src1Base, src1Stride,
+             dstBase, dstStride,
+             cpolicy);
+
+        return;
+    }
+
+    const size_t step128 = 16 / sizeof(T);
+    size_t roiw128 = size.width >= (step128 - 1) ?
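+                          // (as in the u8 kernels, the memset early-out above
+                          // covers every integer case where |scale| * max * max
+                          // falls below 1, since each scaled product would then
+                          // truncate to zero anyway)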
size.width - step128 + 1 : 0; + const size_t step64 = 8 / sizeof(T); + size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const T * src1 = internal::getRowPtr(src1Base, src1Stride, i); + T * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vqmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vqmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, internal::vqmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + WT val = (WT)src0[j] * (WT)src1[j]; + dst[j] = internal::saturate_cast(val); + } + } + else // generic case using floats + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + internal::vst1q(dst + j, mulSaturateQ(internal::vld1q(src0 + j), + internal::vld1q(src1 + j), scale)); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, mulSaturate(internal::vld1(src0 + j), + internal::vld1(src1 + j), scale)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = internal::saturate_cast(fval); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, internal::vmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + WT val = (WT)src0[j] * (WT)src1[j]; + dst[j] = (T)(val); + } + } + else // generic case using floats + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + internal::vst1q(dst + j, mulWrapQ(internal::vld1q(src0 + j), + internal::vld1q(src1 + j), scale)); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, mulWrap(internal::vld1(src0 + j), + internal::vld1(src1 + j), scale)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (T)((s32)trunc(fval)); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +} + +void mul(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + mul(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void mul(const Size2D &size, + const 
u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + mul(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void mul(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + mul(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void mul(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f64 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + typedef void (* mulFunc)(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy); + + if (!std::isnormal(scale) || + ((scale * std::numeric_limits::max() * std::numeric_limits::max()) < 1.0f && + (scale * std::numeric_limits::max() * std::numeric_limits::max()) > -1.0f)) + { + for (size_t y = 0; y < size.height; ++y) + { + s32 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(s32) * size.width); + } + return; + } + + s32 iscale = static_cast(scale), exp = 0; + f64 significand = frexp(scale, &exp); + bool is_integer_scale = isIntegerScale(scale), + is_power_of_2 = (significand == 0.5) && (exp <= 0); + exp = -exp + 1; + + if (is_power_of_2) + { + static const mulFunc funcs[16] = + { + NULL, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift + }; + + mulFunc func = funcs[exp]; + + func(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + cpolicy); + + return; + } + + size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ? 
size.width - 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + int32x4_t v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vqmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vqmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += 2) + { + internal::vst1(dst + j, internal::vqmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + s64 val = (s64)src0[j] * (s64)src1[j]; + dst[j] = internal::saturate_cast<s32>(val); + } + } + else // generic case using floats + { + for (; j < size.width; j++) + { + f64 fval = src0[j] * src1[j] * scale; + dst[j] = internal::saturate_cast<s32>(fval); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + int32x4_t v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += 2) + { + internal::vst1(dst + j, internal::vmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + s64 val = (s64)src0[j] * (s64)src1[j]; + dst[j] = (s32)(val); + } + } + else // generic case using floats + { + for (; j < size.width; j++) + { + f64 fval = src0[j] * src1[j] * scale; + dst[j] = (s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +void mul(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (scale == 0.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + f32 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(f32) * size.width); + } + return; + } + + size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ?
size.width - 1 : 0; + + if (std::fabs(scale - 1.0f) < FLT_EPSILON) + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vst1q_f32(dst + j, vmulq_f32(vld1q_f32(src0 + j), vld1q_f32(src1 + j))); + } + + for (; j < roiw64; j += 2) + { + vst1_f32(dst + j, vmul_f32(vld1_f32(src0 + j), vld1_f32(src1 + j))); + } + + for (; j < size.width; j++) + { + dst[j] = src0[j] * src1[j]; + } + } + } + else + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vst1q_f32(dst + j, vmulq_n_f32(vmulq_f32(vld1q_f32(src0 + j), vld1q_f32(src1 + j)), scale)); + } + + for (; j < roiw64; j += 2) + { + vst1_f32(dst + j, vmul_n_f32(vmul_f32(vld1_f32(src0 + j), vld1_f32(src1 + j)), scale)); + } + + for (; j < size.width; j++) + { + dst[j] = src0[j] * src1[j] * scale; + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/norm.cpp b/3rdparty/carotene/src/norm.cpp new file mode 100644 index 0000000000..6ff2456597 --- /dev/null +++ b/3rdparty/carotene/src/norm.cpp @@ -0,0 +1,1310 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
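An aside on the mul() kernels that close above: every variant dispatches on two axes, the conversion policy (saturate vs. wrap) and the shape of the scale factor (an identity integer scale takes the pure widen-multiply-narrow NEON path, a power-of-two scale is routed to a mulShift specialization, everything else goes through floats). Below is a minimal portable sketch of the per-element contract for u8; it is an illustration, not library code, and it assumes only what the scalar tails show: saturate clamps like vqmovn, wrap keeps the low bits like vmovn, and the float path truncates toward zero on wrap while saturate_cast is assumed to round to nearest, the usual OpenCV convention.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

enum ConvertPolicy { SATURATE, WRAP };

// Hypothetical scalar model of one mul() element for u8; not library code.
static uint8_t mulOne(uint8_t a, uint8_t b, float scale, ConvertPolicy policy)
{
    if (scale == 1.0f)                          // integer fast path
    {
        int32_t val = int32_t(a) * int32_t(b);  // widening multiply
        return policy == SATURATE
             ? uint8_t(std::min(val, 255))      // vqmovn-style clamp
             : uint8_t(val);                    // vmovn-style wrap (low 8 bits)
    }
    float fval = float(a) * float(b) * scale;   // generic path through floats
    if (policy == SATURATE)                     // assumes round-to-nearest clamp
        return uint8_t(std::clamp(std::lround(fval), 0L, 255L));
    return uint8_t(int32_t(std::trunc(fval)));  // truncate, then wrap
}

int main()
{
    std::cout << int(mulOne(200, 2, 1.0f, SATURATE)) << "\n"; // 255
    std::cout << int(mulOne(200, 2, 1.0f, WRAP))     << "\n"; // 144 (400 mod 256)
    std::cout << int(mulOne(200, 2, 0.5f, SATURATE)) << "\n"; // 200
}
```

The NEON loops above are simply the 16- and 8-lane unrollings of this scalar tail.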
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +//magic number; must be multiple of 4 +#define NORM32F_BLOCK_SIZE 2048 + +s32 normInf(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 16) + { + uint8x16_t s = vld1q_u8(src); + for (i = 16; i <= size.width - 16; i += 16) + { + internal::prefetch(src + i); + uint8x16_t s1 = vld1q_u8(src + i); + s = vmaxq_u8(s1, s); + } + u8 s2[8]; + uint8x8_t s3 = vmax_u8(vget_low_u8(s), vget_high_u8(s)); + vst1_u8(s2, s3); + for (u32 j = 0; j < 8; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(src[i]), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 16) + { + uint8x16_t s = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(src))); + for (i = 16; i <= size.width - 16; i += 16) + { + internal::prefetch(src + i); + uint8x16_t s1 = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(src + i))); + s = vmaxq_u8(s1, s); + } + u8 s2[8]; + uint8x8_t s3 = vmax_u8(vget_low_u8(s), vget_high_u8(s)); + vst1_u8(s2, s3); + for (u32 j = 0; j < 8; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(std::abs(src[i])), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 8) + { + uint16x8_t s = vld1q_u16(src); + for (i = 8; i <= size.width - 8; i += 8) + { + internal::prefetch(src + i); + uint16x8_t s1 = vld1q_u16(src + i); + s = vmaxq_u16(s1, s); + } + u16 s2[4]; + uint16x4_t s3 = vmax_u16(vget_low_u16(s), vget_high_u16(s)); + vst1_u16(s2, s3); + for (u32 j = 0; j < 4; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(src[i]), result); + } + return 
result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 8) + { + uint16x8_t s = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(src))); + for (i = 8; i <= size.width - 8; i += 8) + { + internal::prefetch(src + i); + uint16x8_t s1 = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(src + i))); + s = vmaxq_u16(s1, s); + } + u16 s2[4]; + uint16x4_t s3 = vmax_u16(vget_low_u16(s), vget_high_u16(s)); + vst1_u16(s2, s3); + for (u32 j = 0; j < 4; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max(std::abs((s32)(src[i])), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 4) + { + uint32x4_t s = vreinterpretq_u32_s32(vabsq_s32(vld1q_s32(src))); + for (i = 4; i <= size.width - 4; i += 4) + { + internal::prefetch(src + i); + uint32x4_t s1 = vreinterpretq_u32_s32(vabsq_s32(vld1q_s32(src + i))); + s = vmaxq_u32(s1, s); + } + u32 s2[2]; + uint32x2_t s3 = vmax_u32(vget_low_u32(s), vget_high_u32(s)); + vst1_u32(s2, s3); + for (u32 j = 0; j < 2; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(std::abs(src[i])), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +f32 normInf(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 4) + { + float32x4_t s = vabsq_f32(vld1q_f32(src)); + for (i = 4; i <= size.width - 4; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vld1q_f32(src + i); + float32x4_t sa = vabsq_f32(s1); + s = vmaxq_f32(sa, s); + } + f32 s2[2]; + float32x2_t s3 = vmax_f32(vget_low_f32(s), vget_high_f32(s)); + vst1_f32(s2, s3); + for (u32 j = 0; j < 2; j++) + result = std::max(s2[j], result); + } + for (; i < size.width; i++) + result = std::max(std::abs(src[i]), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +s32 normL1(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + for (; i < roiw8;) + { + size_t limit = std::min(size.width, i + 256) - 8; + uint8x8_t s0 = vld1_u8(src + i); + uint16x8_t s = vmovl_u8(s0); + + for (i += 8; i <= limit; i += 8) + { + internal::prefetch(src + i); + uint8x8_t s1 = vld1_u8(src + i); + s = vaddw_u8(s, s1); + } + + uint16x4_t s4 = vadd_u16(vget_low_u16(s), vget_high_u16(s)); + vs = vaddw_u16(vs, s4); + } + + u32 s2[2]; + uint32x2_t vs2 = vadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + vst1_u32(s2, vs2); + + result += (s32)(s2[0] + s2[1]); + + for ( ; i < size.width; i++) + result += (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL1(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + + for (; i < roiw8;) + { + size_t limit = std::min(size.width, i + 256) - 8; + uint8x8_t s0 = vreinterpret_u8_s8(vabs_s8(vld1_s8(src + i))); + uint16x8_t s = vmovl_u8(s0); + + for (i += 8; i <= limit; i += 8) + { + internal::prefetch(src + i); + uint8x8_t s1 = vreinterpret_u8_s8(vabs_s8(vld1_s8(src + i))); + s = vaddw_u8(s, s1); + } + + uint16x4_t s4 = vadd_u16(vget_low_u16(s), vget_high_u16(s)); + vs = vaddw_u16(vs, s4); + } + + u32 s2[2]; + uint32x2_t vs2 = vadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + vst1_u32(s2, vs2); + + result += (s32)(s2[0] + s2[1]); + + for ( ; i < size.width; i++) + result += (s32)(std::abs(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL1(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + for (; i < roiw4; i += 4) + { + internal::prefetch(src + i); + uint16x4_t s = vld1_u16(src + i); + vs = vaddw_u16(vs, s); + } + u32 s2[4]; + vst1q_u32(s2, vs); + for (u32 j = 0; j < 4; j++) + result += s2[j]; + for ( ; i < size.width; i++) + result += (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL1(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? 
size.width - 3 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + for (; i < roiw4; i += 4) + { + internal::prefetch(src + i); + uint16x4_t s = vreinterpret_u16_s16(vabs_s16(vld1_s16(src + i))); + vs = vaddw_u16(vs, s); + } + u32 s2[4]; + vst1q_u32(s2, vs); + for (u32 j = 0; j < 4; j++) + result += s2[j]; + for ( ; i < size.width; i++) + result += (s32)(std::abs(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +f64 normL1(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vcvtq_f32_s32(vabsq_s32(vld1q_s32(src + i))); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vcvtq_f32_s32(vabsq_s32(vld1q_s32(src + i))); + s = vaddq_f32(s, s1); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for ( ; i < size.width; i++) + result += (f64)(std::abs(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL1(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vabsq_f32(vld1q_f32(src + i)); + for (i += 4; i <= limit; i += 4) + { + internal::prefetch(src + i); + float32x4_t s1 = vld1q_f32(src + i); + float32x4_t sa = vabsq_f32(s1); + s = vaddq_f32(sa, s); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for (; i < size.width; i++) + result += std::abs((f64)(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +s32 normL2(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + uint32x4_t sl = vmovq_n_u32(0); + uint32x4_t sh = vmovq_n_u32(0); + + for (; i < roiw8; i += 8) + { + internal::prefetch(src + i); + uint8x8_t s1 = vld1_u8(src + i); + uint16x8_t sq = vmull_u8(s1, s1); + + sl = vaddw_u16(sl, vget_low_u16(sq)); + sh = vaddw_u16(sh, vget_high_u16(sq)); + } + + uint32x4_t s = vaddq_u32(sl, sh); + uint32x2_t ss = vadd_u32(vget_low_u32(s), vget_high_u32(s)); + + u32 s2[2]; + vst1_u32(s2, ss); + + result += (s32)(s2[0] + s2[1]); + + for (; i < size.width; i++) + result += (s32)(src[i]) * (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL2(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + int32x4_t sl = vmovq_n_s32(0); + int32x4_t sh = vmovq_n_s32(0); + + for (; i < roiw8; i += 8) + { + internal::prefetch(src + i); + int8x8_t s1 = vld1_s8(src + i); + int16x8_t sq = vmull_s8(s1, s1); + + sl = vaddw_s16(sl, vget_low_s16(sq)); + sh = vaddw_s16(sh, vget_high_s16(sq)); + } + + int32x4_t s = vaddq_s32(sl, sh); + int32x2_t ss = vadd_s32(vget_low_s32(s), vget_high_s32(s)); + + s32 s2[2]; + vst1_s32(s2, ss); + + result += s2[0] + s2[1]; + + for (; i < size.width; i++) + result += (s32)(src[i]) * (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +f64 normL2(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + uint16x4_t s0 = vld1_u16(src+i); + float32x4_t s = vcvtq_f32_u32(vmull_u16(s0,s0)); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + uint16x4_t s1 = vld1_u16(src+i); + float32x4_t sq = vcvtq_f32_u32(vmull_u16(s1, s1)); + s = vaddq_f32(s, sq); + } + f32 s2[4]; + vst1q_f32(s2, s); + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL2(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? 
size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + int16x4_t s0 = vld1_s16(src+i); + float32x4_t s = vcvtq_f32_s32(vmull_s16(s0,s0)); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + int16x4_t s1 = vld1_s16(src+i); + float32x4_t sq = vcvtq_f32_s32(vmull_s16(s1, s1)); + s = vaddq_f32(s, sq); + } + f32 s2[4]; + vst1q_f32(s2, s); + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL2(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vcvtq_f32_s32(vld1q_s32(src + i)); + s = vmulq_f32(s, s); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vcvtq_f32_s32(vld1q_s32(src + i)); + s = vmlaq_f32(s, s1, s1); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL2(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? 
size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vld1q_f32(src + i); + s = vmulq_f32(s, s); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vld1q_f32(src + i); + s = vmlaq_f32(s, s1, s1); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +s32 diffNormInf(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 16) + { + uint8x16_t vs3 = vdupq_n_u8(0); + for (; i < size.width - 16; i += 16) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + uint8x16_t vs1 = vld1q_u8(src1 + i); + uint8x16_t vs2 = vld1q_u8(src2 + i); + + vs3 = vmaxq_u8(vs3, vabdq_u8(vs1, vs2)); + } + + u8 s2[8]; + vst1_u8(s2, vpmax_u8(vget_low_u8(vs3), vget_high_u8(vs3))); + + for (u32 j = 0; j < 8; j++) + result = std::max((s32)(s2[j]), result); + } + + for (; i < size.width; i++) + { + result = std::max(std::abs((s32)(src1[i]) - (s32)(src2[i])), result); + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f32 diffNormInf(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const f32* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 4) + { + float32x4_t s = vabdq_f32(vld1q_f32(src1), vld1q_f32(src2)); + + for (i += 4; i <= size.width - 4; i += 4 ) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + float32x4_t vs1 = vld1q_f32(src1 + i); + float32x4_t vs2 = vld1q_f32(src2 + i); + + float32x4_t vd = vabdq_f32(vs2, vs1); + s = vmaxq_f32(s, vd); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + if (s2[j] > result) + result = s2[j]; + } + + for (; i < size.width; i++) + { + f32 v = std::abs(src1[i] - src2[i]); + if (v > result) + result = v; + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0.; +#endif +} + +s32 diffNormL1(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == 
src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 16) + { + for(; i <= size.width - 16;) + { + size_t limit = std::min(size.width, i + 2*256) - 16; + uint16x8_t si1 = vmovq_n_u16(0); + uint16x8_t si2 = vmovq_n_u16(0); + + for (; i <= limit; i += 16) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + uint8x16_t vs1 = vld1q_u8(src1 + i); + uint8x16_t vs2 = vld1q_u8(src2 + i); + + si1 = vabal_u8(si1, vget_low_u8(vs1), vget_low_u8(vs2)); + si2 = vabal_u8(si2, vget_high_u8(vs1), vget_high_u8(vs2)); + } + + u32 s2[4]; + vst1q_u32(s2, vaddq_u32(vpaddlq_u16(si1), vpaddlq_u16(si2))); + + for (u32 j = 0; j < 4; j++) + { + if ((s32)(0x7fFFffFFu - s2[j]) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result = (s32)((u32)(result) + s2[j]); + } + } + + } + + for (; i < size.width; i++) + { + u32 v = std::abs((s32)(src1[i]) - (s32)(src2[i])); + + if ((s32)(0x7fFFffFFu - v) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result = (s32)((u32)(result) + v); + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 diffNormL1(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const f32* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 4) + { + for(; i <= size.width - 4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vmovq_n_f32(0.0f); + + for (; i <= limit; i += 4 ) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + float32x4_t vs1 = vld1q_f32(src1 + i); + float32x4_t vs2 = vld1q_f32(src2 + i); + + float32x4_t vd = vabdq_f32(vs2, vs1); + s = vaddq_f32(s, vd); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + } + + for (; i < size.width; i++) + { + f32 v = std::abs(src1[i] - src2[i]); + result += (f64)(v); + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0.; +#endif +} + +s32 diffNormL2(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + +#define NORML28U_BLOCK_SIZE (33024*2) //bigger block size can result in integer overflow + if (size.width >= 16) + { + for(; i <= size.width - 16;) + { + size_t limit = std::min(size.width, i 
+ NORML28U_BLOCK_SIZE) - 16; + uint32x4_t si1 = vmovq_n_u32(0); + uint32x4_t si2 = vmovq_n_u32(0); + + for (; i <= limit; i += 16) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + uint8x16_t vs1 = vld1q_u8(src1 + i); + uint8x16_t vs2 = vld1q_u8(src2 + i); + + uint16x8_t vdlo = vabdl_u8(vget_low_u8(vs1), vget_low_u8(vs2)); + uint16x8_t vdhi = vabdl_u8(vget_high_u8(vs1), vget_high_u8(vs2)); + + si1 = vmlal_u16(si1, vget_low_u16(vdlo), vget_low_u16(vdlo)); + si2 = vmlal_u16(si2, vget_high_u16(vdlo), vget_high_u16(vdlo)); + + si1 = vmlal_u16(si1, vget_low_u16(vdhi), vget_low_u16(vdhi)); + si2 = vmlal_u16(si2, vget_high_u16(vdhi), vget_high_u16(vdhi)); + } + + u32 s2[4]; + vst1q_u32(s2, vqaddq_u32(si1, si2)); + + for (u32 j = 0; j < 4; j++) + { + if ((s32)(0x7fFFffFFu - s2[j]) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result += (s32)s2[j]; + } + } + + } + + for (; i < size.width; i++) + { + s32 v = (s32)(src1[i]) - (s32)(src2[i]); + v *= v; + + if ((s32)(0x7fFFffFFu - (u32)(v)) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result += v; + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 diffNormL2(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const f32* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 4) + { + for(; i <= size.width - 4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vmovq_n_f32(0.0f); + + for (; i <= limit; i += 4 ) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + float32x4_t vs1 = vld1q_f32(src1 + i); + float32x4_t vs2 = vld1q_f32(src2 + i); + + float32x4_t vd = vsubq_f32(vs2,vs1); + s = vmlaq_f32(s, vd, vd); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + } + + for (; i < size.width; i++) + { + f32 v = src1[i] - src2[i]; + result += v * v; + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0.; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/opticalflow.cpp b/3rdparty/carotene/src/opticalflow.cpp new file mode 100644 index 0000000000..fa9402a05c --- /dev/null +++ b/3rdparty/carotene/src/opticalflow.cpp @@ -0,0 +1,539 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
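Two overflow tactics are at work in the norm kernels above and are worth spelling out. First, blocked accumulation: normL1 for u8 adds at most 256 pixels (32 iterations of 8 lanes) into 16-bit lanes before widening to 32 bits, since 32 * 255 stays well under 65535 per lane; the float kernels use the same idea with NORM32F_BLOCK_SIZE, and diffNormL2 for u8 shrinks the block to NORML28U_BLOCK_SIZE for the same reason. Second, diffNormL1 and diffNormL2 saturate the s32 total at 0x7fFFffFF instead of overflowing. A small self-contained sketch of that saturating accumulate, using the same comparison trick as the source:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Mirrors the "result already saturated" check in diffNormL1/diffNormL2:
// add an unsigned partial sum to a signed total without overflowing.
static int32_t addSaturate(int32_t total, uint32_t partial)
{
    if (int32_t(0x7fFFffFFu - partial) <= total)
        return 0x7fFFffFF;                       // pin at INT32_MAX
    return int32_t(uint32_t(total) + partial);   // safe to add
}

int main()
{
    int32_t total = std::numeric_limits<int32_t>::max() - 10;
    total = addSaturate(total, 5);    // still fits
    std::cout << total << "\n";       // 2147483642
    total = addSaturate(total, 100);  // would overflow
    std::cout << total << "\n";       // 2147483647
}
```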
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "saturate_cast.hpp" +#include <vector> +#include <cfloat> // For FLT_EPSILON + +namespace CAROTENE_NS { + +#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) + +/* + * Pyramidal Lucas-Kanade Optical Flow level processing + */ +void pyrLKOptFlowLevel(const Size2D &size, s32 cn, + const u8 *prevData, ptrdiff_t prevStride, + const s16 *prevDerivData, ptrdiff_t prevDerivStride, + const u8 *nextData, ptrdiff_t nextStride, + u32 ptCount, + const f32 *prevPts, f32 *nextPts, + u8 *status, f32 *err, + const Size2D &winSize, + u32 terminationCount, f64 terminationEpsilon, + u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals, + f32 minEigThreshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f; + s32 cn2 = cn*2; + + std::vector<s16> _buf(winSize.total()*(cn + cn2)); + s16* IWinBuf = &_buf[0]; + s32 IWinBufStride = winSize.width*cn; + s16* derivIWinBuf = &_buf[winSize.total()*cn]; + s32 derivIWinBufStride = winSize.width*cn2; + + for( u32 ptidx = 0; ptidx < ptCount; ptidx++ ) + { + f32 levscale = (1./(1 << level)); + u32 ptref = ptidx << 1; + f32 prevPtX = prevPts[ptref+0]*levscale; + f32 prevPtY = prevPts[ptref+1]*levscale; + f32 nextPtX; + f32 nextPtY; + if( level == maxLevel ) + { + if( useInitialFlow ) + { + nextPtX = nextPts[ptref+0]*levscale; + nextPtY = nextPts[ptref+1]*levscale; + } + else + { + nextPtX = prevPtX; + nextPtY = prevPtY; + } + } + else + { + nextPtX = nextPts[ptref+0]*2.f; + nextPtY = nextPts[ptref+1]*2.f; + } + nextPts[ptref+0] = nextPtX; + nextPts[ptref+1] = nextPtY; + + s32 iprevPtX, iprevPtY; + s32 inextPtX, inextPtY; + prevPtX -= halfWinX; + prevPtY -= halfWinY; + iprevPtX = floor(prevPtX); + iprevPtY = floor(prevPtY); + + if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width || + iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = false; + if( err ) + err[ptidx] =
0; + } + continue; + } + + f32 a = prevPtX - iprevPtX; + f32 b = prevPtY - iprevPtY; + const s32 W_BITS = 14, W_BITS1 = 14; + const f32 FLT_SCALE = 1.f/(1 << 20); + s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS)); + s32 iw01 = round(a*(1.f - b)*(1 << W_BITS)); + s32 iw10 = round((1.f - a)*b*(1 << W_BITS)); + s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + s32 dstep = prevDerivStride/sizeof(s16); + f32 A11 = 0, A12 = 0, A22 = 0; + + int16x4_t viw00 = vmov_n_s16((s16)iw00); + int16x4_t viw01 = vmov_n_s16((s16)iw01); + int16x4_t viw10 = vmov_n_s16((s16)iw10); + int16x4_t viw11 = vmov_n_s16((s16)iw11); + + float32x4_t vA11 = vmovq_n_f32(0); + float32x4_t vA12 = vmovq_n_f32(0); + float32x4_t vA22 = vmovq_n_f32(0); + + s32 wwcn = winSize.width*cn; + + // extract the patch from the first image, compute covariation matrix of derivatives + s32 x = 0; + for(s32 y = 0; y < (s32)winSize.height; y++ ) + { + const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn; + const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2; + + s16* Iptr = IWinBuf + y*IWinBufStride; + s16* dIptr = derivIWinBuf + y*derivIWinBufStride; + + internal::prefetch(src + x + prevStride * 2, 0); + for(x = 0; x <= wwcn - 8; x += 8) + { + uint8x8_t vsrc00 = vld1_u8(src + x); + uint8x8_t vsrc10 = vld1_u8(src + x + prevStride); + uint8x8_t vsrc01 = vld1_u8(src + x + cn); + uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn); + + int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00)); + int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10)); + int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01)); + int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11)); + + int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00); + int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01); + + int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5); + int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5); + + vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh)); + } + for(; x <= wwcn - 4; x += 4) + { + uint8x8_t vsrc00 = vld1_u8(src + x); + uint8x8_t vsrc10 = vld1_u8(src + x + prevStride); + uint8x8_t vsrc01 = vld1_u8(src + x + cn); + uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn); + + int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00))); + int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10))); + int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01))); + int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11))); + + int32x4_t vsuml1 = vmull_s16(vs00, viw00); + int32x4_t vsuml2 = vmull_s16(vs01, viw01); + vsuml1 = vmlal_s16(vsuml1, vs10, viw10); + vsuml2 = vmlal_s16(vsuml2, vs11, viw11); + int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2); + + int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5); + + vst1_s16(Iptr + x, vsumnl); + } + + internal::prefetch(dsrc + dstep * 2, 0); + for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { +#if __GNUC_MINOR__ < 0 + __asm__ ( + "vld2.16 {d0-d1}, [%[dsrc00]] \n\t" + "vld2.16 {d2-d3}, [%[dsrc10]] \n\t" + "vld2.16 {d4-d5}, [%[dsrc01]] \n\t" + "vld2.16 {d6-d7}, [%[dsrc11]] \n\t" + "vmull.s16 q4, d3, %P[viw10] \n\t" + "vmull.s16 q5, d0, %P[viw00] \n\t" + "vmlal.s16 q4, d7, %P[viw11] 
\n\t" + "vmlal.s16 q5, d4, %P[viw01] \n\t" + "vmlal.s16 q4, d1, %P[viw00] \n\t" + "vmlal.s16 q5, d2, %P[viw10] \n\t" + "vmlal.s16 q4, d5, %P[viw01] \n\t" + "vmlal.s16 q5, d6, %P[viw11] \n\t" + "vrshrn.s32 d13, q4, %[W_BITS1] \n\t" + "vrshrn.s32 d12, q5, %[W_BITS1] \n\t" + "vmull.s16 q3, d13, d13 \n\t" + "vmull.s16 q4, d12, d12 \n\t" + "vmull.s16 q5, d13, d12 \n\t" + "vcvt.f32.s32 q3, q3 \n\t" + "vcvt.f32.s32 q4, q4 \n\t" + "vcvt.f32.s32 q5, q5 \n\t" + "vadd.f32 %q[vA22], q3 \n\t" + "vadd.f32 %q[vA11], q4 \n\t" + "vadd.f32 %q[vA12], q5 \n\t" + "vst2.16 {d12-d13}, [%[out]] \n\t" + : [vA22] "=w" (vA22), + [vA11] "=w" (vA11), + [vA12] "=w" (vA12) + : "0" (vA22), + "1" (vA11), + "2" (vA12), + [out] "r" (dIptr), + [dsrc00] "r" (dsrc), + [dsrc10] "r" (dsrc + dstep), + [dsrc01] "r" (dsrc + cn2), + [dsrc11] "r" (dsrc + dstep + cn2), + [viw00] "w" (viw00), + [viw10] "w" (viw10), + [viw01] "w" (viw01), + [viw11] "w" (viw11), + [W_BITS1] "I" (W_BITS1) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); +#else + int16x4x2_t vdsrc00 = vld2_s16(dsrc); + int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep); + int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2); + int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2); + + int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10); + int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00); + + vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11); + vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01); + + vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00); + vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10); + + vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01); + vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11); + + int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1); + int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1); + + int32x4_t va22i = vmull_s16(vsumny, vsumny); + int32x4_t va11i = vmull_s16(vsumnx, vsumnx); + int32x4_t va12i = vmull_s16(vsumnx, vsumny); + + float32x4_t va22f = vcvtq_f32_s32(va22i); + float32x4_t va11f = vcvtq_f32_s32(va11i); + float32x4_t va12f = vcvtq_f32_s32(va12i); + + vA22 = vaddq_f32(vA22, va22f); + vA11 = vaddq_f32(vA11, va11f); + vA12 = vaddq_f32(vA12, va12f); + + int16x4x2_t vsum; + vsum.val[0] = vsumnx; + vsum.val[1] = vsumny; + vst2_s16(dIptr, vsum); +#endif + } + + for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 ) + { + s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5); + s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + Iptr[x] = (s16)ival; + dIptr[0] = (s16)ixval; + dIptr[1] = (s16)iyval; + + A11 += (f32)(ixval*ixval); + A12 += (f32)(ixval*iyval); + A22 += (f32)(iyval*iyval); + } + } + + f32 A11buf[2], A12buf[2], A22buf[2]; + vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11))); + vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12))); + vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22))); + A11 += A11buf[0] + A11buf[1]; + A12 += A12buf[0] + A12buf[1]; + A22 += A22buf[0] + A22buf[1]; + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + f32 D = A11*A22 - A12*A12; + f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winSize.width*winSize.height); + + if( err && getMinEigenVals ) + err[ptidx] = (f32)minEig; + + if( minEig < minEigThreshold || D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = false; + continue; + 
} + + D = 1.f/D; + + nextPtX -= halfWinX; + nextPtY -= halfWinY; + f32 prevDeltaX = 0; + f32 prevDeltaY = 0; + + for(u32 j = 0; j < terminationCount; j++ ) + { + inextPtX = floor(nextPtX); + inextPtY = floor(nextPtY); + + if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width || + inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height ) + { + if( level == 0 && status ) + status[ptidx] = false; + break; + } + + a = nextPtX - inextPtX; + b = nextPtY - inextPtY; + iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = round(a*(1.f - b)*(1 << W_BITS)); + iw10 = round((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + f32 b1 = 0, b2 = 0; + + viw00 = vmov_n_s16((s16)iw00); + viw01 = vmov_n_s16((s16)iw01); + viw10 = vmov_n_s16((s16)iw10); + viw11 = vmov_n_s16((s16)iw11); + + float32x4_t vb1 = vmovq_n_f32(0); + float32x4_t vb2 = vmovq_n_f32(0); + + for(s32 y = 0; y < (s32)winSize.height; y++ ) + { + const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn; + const s16* Iptr = IWinBuf + y*IWinBufStride; + const s16* dIptr = derivIWinBuf + y*derivIWinBufStride; + + x = 0; + + internal::prefetch(Jptr, nextStride * 2); + internal::prefetch(Iptr, IWinBufStride/2); + internal::prefetch(dIptr, derivIWinBufStride/2); + + for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 ) + { + uint8x8_t vj00 = vld1_u8(Jptr + x); + uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride); + uint8x8_t vj01 = vld1_u8(Jptr + x + cn); + uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn); + int16x8_t vI = vld1q_s16(Iptr + x); + int16x8x2_t vDerivI = vld2q_s16(dIptr); + + int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00)); + int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10)); + int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01)); + int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11)); + + int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00); + int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01); + + int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5); + int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5); + + int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI); + + int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0])); + int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1])); + int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0])); + int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1])); + + float32x4_t vb1f = vcvtq_f32_s32(vb1i); + float32x4_t vb2f = vcvtq_f32_s32(vb2i); + + vb1 = vaddq_f32(vb1, vb1f); + vb2 = vaddq_f32(vb2, vb2f); + } + + for( ; x < wwcn; x++, dIptr += 2 ) + { + s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11, + W_BITS1-5) - Iptr[x]; + b1 += (f32)(diff*dIptr[0]); + b2 += (f32)(diff*dIptr[1]); + } + } + + f32 bbuf[2]; + float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)), vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2))); + vst1_f32(bbuf, vb); + b1 += bbuf[0]; + b2 += bbuf[1]; + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + f32 deltaX = (f32)((A12*b2 - A22*b1) * D); + f32 deltaY = (f32)((A12*b1 - A11*b2) * D); + + nextPtX += 
deltaX; + nextPtY += deltaY; + nextPts[ptref+0] = nextPtX + halfWinX; + nextPts[ptref+1] = nextPtY + halfWinY; + + if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon ) + break; + + if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 && + std::abs(deltaY + prevDeltaY) < 0.01 ) + { + nextPts[ptref+0] -= deltaX*0.5f; + nextPts[ptref+1] -= deltaY*0.5f; + break; + } + prevDeltaX = deltaX; + prevDeltaY = deltaY; + } + + if( status && status[ptidx] && err && level == 0 && !getMinEigenVals ) + { + f32 nextPointX = nextPts[ptref+0] - halfWinX; + f32 nextPointY = nextPts[ptref+1] - halfWinY; + + s32 inextPointX = floor(nextPointX); + s32 inextPointY = floor(nextPointY); + + if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width || + inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height ) + { + if( status ) + status[ptidx] = false; + continue; + } + + f32 aa = nextPointX - inextPointX; + f32 bb = nextPointY - inextPointY; + iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS)); + iw01 = round(aa*(1.f - bb)*(1 << W_BITS)); + iw10 = round((1.f - aa)*bb*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + f32 errval = 0.f; + + for(s32 y = 0; y < (s32)winSize.height; y++ ) + { + const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn; + const s16* Iptr = IWinBuf + y*IWinBufStride; + + for( x = 0; x < wwcn; x++ ) + { + s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11, + W_BITS1-5) - Iptr[x]; + errval += std::abs((f32)diff); + } + } + err[ptidx] = errval / (32*wwcn*winSize.height); + } + } +#else + (void)size; + (void)cn; + (void)prevData; + (void)prevStride; + (void)prevDerivData; + (void)prevDerivStride; + (void)nextData; + (void)nextStride; + (void)prevPts; + (void)nextPts; + (void)status; + (void)err; + (void)winSize; + (void)terminationCount; + (void)terminationEpsilon; + (void)level; + (void)maxLevel; + (void)useInitialFlow; + (void)getMinEigenVals; + (void)minEigThreshold; + (void)ptCount; +#endif +} + +}//CAROTENE_NS + diff --git a/3rdparty/carotene/src/phase.cpp b/3rdparty/carotene/src/phase.cpp new file mode 100644 index 0000000000..141b1e864a --- /dev/null +++ b/3rdparty/carotene/src/phase.cpp @@ -0,0 +1,274 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
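Stripped of the NEON window sampling, the iteration loop in pyrLKOptFlowLevel above is a Newton-style solve of a 2x2 linear system: the gradient matrix G = [A11 A12; A12 A22] accumulated over the patch against the image-difference vector b = (b1, b2), with the step written out explicitly through the determinant D. A compact sketch of just that update rule; the matrix entries here are made up for illustration.

```cpp
#include <cstdio>

struct Delta { float x, y; };

// One Lucas-Kanade displacement update, using the same deltaX/deltaY
// formulas as pyrLKOptFlowLevel; inputs are hypothetical.
static Delta lkStep(float A11, float A12, float A22, float b1, float b2)
{
    float D = A11 * A22 - A12 * A12;  // determinant of the gradient matrix
    D = 1.f / D;                      // caller already rejected D near zero
    return { (A12 * b2 - A22 * b1) * D,
             (A12 * b1 - A11 * b2) * D };
}

int main()
{
    Delta d = lkStep(8.f, 1.f, 6.f, -3.f, 2.f);
    std::printf("delta = (%f, %f)\n", d.x, d.y);

    // The loop stops once the squared step drops below terminationEpsilon,
    // or halves the step when two successive deltas nearly cancel.
    double eps = 1e-4;
    std::printf("converged: %d\n",
                (double)d.x * d.x + (double)d.y * d.y <= eps);
}
```

The minEig gate seen earlier rejects patches whose smaller eigenvalue of G is below minEigThreshold, i.e. windows without enough texture to make this system well conditioned.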
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include <cmath> +#include <cfloat> + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +#define FASTATAN2CONST(scale) \ + f32 P1((f32)( 0.9997878412794807 * (180.0 / M_PI) * scale)), \ + P3((f32)(-0.3258083974640975 * (180.0 / M_PI) * scale)), \ + P5((f32)( 0.1555786518463281 * (180.0 / M_PI) * scale)), \ + P7((f32)(-0.04432655554792128 * (180.0 / M_PI) * scale)), \ + A_90((f32)(90.f * scale)), \ + A_180((f32)(180.f * scale)), \ + A_360((f32)(360.f * scale)); \ + float32x4_t eps(vdupq_n_f32((float)DBL_EPSILON)), \ + _90(vdupq_n_f32(A_90)), \ + _180(vdupq_n_f32(A_180)), \ + _360(vdupq_n_f32(A_360)), \ + z(vdupq_n_f32(0.0f)), \ + p1(vdupq_n_f32(P1)), \ + p3(vdupq_n_f32(P3)), \ + p5(vdupq_n_f32(P5)), \ + p7(vdupq_n_f32(P7)); + +#define FASTATAN2SCALAR(y, x, a) \ + { \ + f32 ax = std::abs(x), ay = std::abs(y); \ + f32 c, c2; \ + if (ax >= ay) \ + { \ + c = ay / (ax + (float)DBL_EPSILON); \ + c2 = c * c; \ + a = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \ + } \ + else \ + { \ + c = ax / (ay + (float)DBL_EPSILON); \ + c2 = c * c; \ + a = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \ + } \ + if (x < 0) \ + a = A_180 - a; \ + if (y < 0) \ + a = A_360 - a; \ + } + +#define FASTATAN2VECTOR(v_y, v_x, a) \ + { \ + float32x4_t ax = vabsq_f32(v_x), ay = vabsq_f32(v_y); \ + float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); \ + float32x4_t c = vmulq_f32(tmin, internal::vrecpq_f32(vaddq_f32(tmax, eps))); \ + float32x4_t c2 = vmulq_f32(c, c); \ + a = vmulq_f32(c2, p7); \ + \ + a = vmulq_f32(vaddq_f32(a, p5), c2); \ + a = vmulq_f32(vaddq_f32(a, p3), c2); \ + a = vmulq_f32(vaddq_f32(a, p1), c); \ + \ + a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); \ + a = vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, a), a); \ + a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, a), a); \ + \ + } + +} // namespace + +#endif + +void phase(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + FASTATAN2CONST(256.0f / 360.0f) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + float32x4_t v_05 = vdupq_n_f32(0.5f); + + for (size_t i = 0; i < size.height; ++i) + { + const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8); + int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8); + + // 0 + float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + float32x4_t v_dst32f0; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0) + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + float32x4_t v_dst32f1; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) + + uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + // 1 + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11))); + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0) + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11))); + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) + + uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0), + vmovn_u16(v_dst16s1))); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vld1q_s16(src0 + j); + int16x8_t v_src1 = vld1q_s16(src1 + j); + + float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0))); + float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))); + float32x4_t v_dst32f0; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0) + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))); + float32x4_t v_dst32f1; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) + + uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + vst1_u8(dst + j, vmovn_u16(v_dst)); + } + + for (; j < size.width; j++) + { + f32 x = src0[j], y = src1[j]; + f32 a; + FASTATAN2SCALAR(y, x, a) + dst[j] = (u8)(s32)floor(a + 0.5f); + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void phase(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + FASTATAN2CONST(scale) + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + float32x4_t v_src00 = vld1q_f32(src0 + j), v_src01 = vld1q_f32(src0 + j + 4); + float32x4_t v_src10 = vld1q_f32(src1 + j), v_src11 = vld1q_f32(src1 + j + 4); + + float32x4_t v_dst32f; + // 0 + FASTATAN2VECTOR(v_src10, v_src00, v_dst32f) + vst1q_f32(dst + j, v_dst32f); + // 1 + FASTATAN2VECTOR(v_src11, v_src01, v_dst32f) + vst1q_f32(dst + j + 4, v_dst32f); + } + if(j + 4 <= size.width) + { + float32x4_t v_src0 = vld1q_f32(src0 + j); + float32x4_t v_src1 = vld1q_f32(src1 + j); + + float32x4_t v_dst32f; + FASTATAN2VECTOR(v_src1, v_src0, v_dst32f) + vst1q_f32(dst + j, v_dst32f); + j += 4; + } + + for (; j < size.width; j++) + { + f32 a; + FASTATAN2SCALAR(src1[j], src0[j], a) + dst[j] = a; + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/pyramid.cpp b/3rdparty/carotene/src/pyramid.cpp new file mode 100644 index 0000000000..546ccecd97 --- /dev/null +++ b/3rdparty/carotene/src/pyramid.cpp @@ -0,0 +1,1414 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
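Both `phase` overloads above rely on the same trick: instead of calling `atan2`, they evaluate a degree-7 odd polynomial on the ratio of the smaller to the larger of |x| and |y|, which keeps the argument in [0, 1], and then fold the result back into the correct octant and quadrant. A minimal scalar sketch with `scale = 1`, so the output is in degrees (`fastAtan2Deg` is an illustrative name, not a library entry point):

```cpp
#include <cmath>
#include <cfloat>

// Scalar restatement of FASTATAN2SCALAR with scale = 1 (degrees).
static float fastAtan2Deg(float y, float x)
{
    static const float P1 = (float)( 0.9997878412794807  * (180.0 / M_PI));
    static const float P3 = (float)(-0.3258083974640975  * (180.0 / M_PI));
    static const float P5 = (float)( 0.1555786518463281  * (180.0 / M_PI));
    static const float P7 = (float)(-0.04432655554792128 * (180.0 / M_PI));

    float ax = std::abs(x), ay = std::abs(y);
    // Divide the smaller magnitude by the larger one; eps avoids 0/0.
    float c  = ax >= ay ? ay / (ax + (float)DBL_EPSILON)
                        : ax / (ay + (float)DBL_EPSILON);
    float c2 = c * c;
    float a  = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c;
    if (ax < ay) a = 90.f  - a;   // reflect across the 45-degree diagonal
    if (x  < 0)  a = 180.f - a;   // left half-plane
    if (y  < 0)  a = 360.f - a;   // lower half-plane
    return a;                     // in [0, 360)
}
```

The u8 overload instantiates the constants with `scale = 256/360` so a full turn maps onto the byte range, then rounds to nearest before narrowing; the f32 overload passes the caller's `scale` straight through.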
+ */ + +#include "common.hpp" + +#include <vector> + +namespace CAROTENE_NS { + +bool isGaussianPyramidDownRTZSupported(const Size2D &srcSize, const Size2D &dstSize, BORDER_MODE border_mode) +{ + if (!isSupportedConfiguration()) + return false; + // Need at least 8 pixels for vectorization. + // Need to make sure dst width is half the src width. + // Don't care about dst height. + if ( dstSize.width < 8 || std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ) + return false; + + // Current implementation only supports Reflect101 (i.e. UNDEFINED mode) + if (border_mode != BORDER_MODE_UNDEFINED) + return false; + + return true; +} + +bool isGaussianPyramidDownU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (dstSize.width * cn) < 8 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 || + std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 ) + return false; + + return true; +} + +bool isGaussianPyramidDownS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (dstSize.width * cn) < 4 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 || + std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 ) + return false; + + return true; +} + +bool isGaussianPyramidDownF32Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (dstSize.width * cn) < 4 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 || + std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 ) + return false; + + return true; +} + +bool isGaussianPyramidUpU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (srcSize.width * cn) < 8 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 || + std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 ) + return false; + + return true; +} + +bool isGaussianPyramidUpS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (srcSize.width * cn) < 12 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 || + std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 ) + return false; + + return true; +} + +#ifdef CAROTENE_NEON + +namespace { + +ptrdiff_t borderInterpolate101(ptrdiff_t p, ptrdiff_t len) +{ + if (len == 1) + return 0; + else + { + while ((unsigned)p >= (unsigned)len) + { + if (p < 0) + p = -p; + else + p = (len - 1)*2 - p; + } + } + return p; +} + +} // namespace + +#endif + +void gaussianPyramidDownRTZ(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownRTZSupported(srcSize, dstSize, border)); +#ifdef CAROTENE_NEON + // Single-core NEON code + const size_t dwidth = dstSize.width; + const size_t dheight = dstSize.height; + const size_t swidth = srcSize.width; + const size_t sheight =
srcSize.height; + + ptrdiff_t idx_l1 = borderInterpolate101(-1, swidth); + ptrdiff_t idx_l2 = borderInterpolate101(-2, swidth); + ptrdiff_t idx_r1 = borderInterpolate101(swidth + 0, swidth); + ptrdiff_t idx_r2 = borderInterpolate101(swidth + 1, swidth); + + //1-line buffer + std::vector _buf((swidth + 4) + 32/sizeof(u16)); + u16* lane = internal::alignPtr(&_buf[2], 32); + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + uint16x8_t vc4u16 = vmovq_n_u16(4); + + u8* dst = dstBase; + + for (size_t i = 0; i < dheight; ++i, dst += dstStride) + { + //vertical convolution + const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, sheight)); + const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, sheight)); + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, sheight)); + const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, sheight)); + const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, sheight)); + + size_t x = 0; + for (; x <= swidth - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v1 = vld1_u8(ln1+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v3 = vld1_u8(ln3+x); + uint8x8_t v4 = vld1_u8(ln4+x); + + uint16x8_t v = vaddl_u8(v0, v4); + uint16x8_t v13 = vaddl_u8(v1, v3); + + v = vmlal_u8(v, v2, vc6u8); + v = vmlaq_u16(v, v13, vc4u16); + + vst1q_u16(lane + x, v); + } + for (; x < swidth; ++x) + { + lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x]; + } + + //left&right borders + lane[-1] = lane[idx_l1]; + lane[-2] = lane[idx_l2]; + + lane[swidth] = lane[idx_r1]; + lane[swidth+1] = lane[idx_r2]; + + //horizontal convolution + x = 0; + size_t vw = (swidth/2) - 7; // Using 7 instead of 8 allows swidth of 14 or 15. + for (; x < vw; x += 8) + { + internal::prefetch(lane + 2 * x); + uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2); // L0[0] = x0 x2 x4 x6 x8 x10 x12 x14 L0[1] = x1 x3 x5 x7 x9 x11 x13 x15 + uint16x8x2_t vLane1 = vld2q_u16(lane + 2*x-1); // L1[0] = x1 x3 x5 x7 x9 x11 x13 x15 L1[1] = x2 x4 x6 x8 x10 x12 x14 x16 + uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0); // L2[0] = x2 x4 x6 x8 x10 x12 x14 x16 L2[1] = x3 x5 x7 x9 x11 x13 x15 x17 + uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1); // L3[0] = x3 x5 x7 x9 x11 x13 x15 x17 L3[1] = x4 x6 x8 x10 x12 x14 x16 x18 + uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2); // L4[0] = x4 x6 x8 x10 x12 x14 x16 x18 L4[1] = x5 x7 x9 x11 x13 x15 x17 x19 + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]); + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16); + uint8x8_t vRes = vshrn_n_u16(vSum_0_4, 8); + + vst1_u8(dst + x, vRes); + } + + for (; x < dwidth; x++) + { + dst[x] = u8((lane[2*x-2] + lane[2*x+2] + 4u * (lane[2*x-1] + lane[2*x+1]) + 6u * lane[2*x]) >> 8); + } + } +#else + // Remove 'unused parameter' warnings. 
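Behind the NEON choreography, `gaussianPyramidDownRTZ` is the separable 5-tap binomial filter [1 4 6 4 1]/16 applied in both directions at every even source coordinate, with the final division by 256 done as a truncating shift (`vshrn_n_u16`, no rounding addend), which is what the RTZ (round-toward-zero) suffix refers to. A hedged single-pixel reference for the single-channel interior case, assuming out-of-range coordinates were already reflected by `borderInterpolate101`:

```cpp
#include <cstddef>

// One output pixel of the truncating Gaussian pyrDown (single channel,
// stride in elements). pyrDownRTZPixel is an illustrative name.
static unsigned char pyrDownRTZPixel(const unsigned char *src,
                                     std::ptrdiff_t stride,
                                     std::ptrdiff_t x, std::ptrdiff_t y)
{
    static const unsigned k[5] = { 1, 4, 6, 4, 1 };   // binomial taps, sum 16
    unsigned sum = 0;
    for (int dy = -2; dy <= 2; ++dy)
        for (int dx = -2; dx <= 2; ++dx)
            sum += k[dy + 2] * k[dx + 2]
                   * src[(2 * y + dy) * stride + (2 * x + dx)];
    return (unsigned char)(sum >> 8);   // 16 * 16 = 256; truncate, don't round
}
```

The production loop splits this into a vertical pass buffered in the u16 `lane` row and a horizontal pass that picks every other `lane` entry with `vld2q_u16`; the worst-case sum is 256 * 255 = 65280, which is why u16 intermediates are enough.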
+ (void)srcSize; + (void)srcBase; + (void)srcStride; + (void)dstSize; + (void)dstBase; + (void)dstStride; + (void)border; +#endif + (void)borderValue; +} + +void gaussianPyramidDown(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownU8Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolcn = dstSize.width*cn; + size_t scolcn = srcSize.width*cn; + size_t roiw8 = dcolcn - 7; + + size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; + size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; + size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; + size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; + + //1-line buffer + std::vector _buf(cn*(srcSize.width + 4) + 32/sizeof(u16)); + u16* lane = internal::alignPtr(&_buf[2*cn], 32); + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + uint16x8_t vc4u16 = vmovq_n_u16(4); + + for (size_t i = 0; i < dstSize.height; ++i) + { + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); + const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height)); + const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); + const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); + + size_t x = 0; + for (; x <= scolcn - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v1 = vld1_u8(ln1+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v3 = vld1_u8(ln3+x); + uint8x8_t v4 = vld1_u8(ln4+x); + + uint16x8_t v = vaddl_u8(v0, v4); + uint16x8_t v13 = vaddl_u8(v1, v3); + + v = vmlal_u8(v, v2, vc6u8); + v = vmlaq_u16(v, v13, vc4u16); + + vst1q_u16(lane + x, v); + } + for (; x < scolcn; ++x) + { + lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x]; + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane[(s32)(-cn+k)] = lane[idx_l1 + k]; + lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; + + lane[scolcn+k] = lane[idx_r1 + k]; + lane[scolcn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x < roiw8; x += 8) + { + internal::prefetch(lane + 2 * x); +#if __GNUC_MINOR__ < 7 + __asm__ ( + "vld2.16 {d0-d3}, [%[in0]] \n\t" + "vld2.16 {d4-d7}, [%[in4]] \n\t" + "vld2.16 {d12-d15}, [%[in1]] \n\t" + "vld2.16 {d16-d19}, [%[in3]] \n\t" + "vld2.16 {d8-d11}, [%[in2],:256] \n\t" + "vadd.i16 q0, q2 /*q0 = v0 + v4*/ \n\t" + "vadd.i16 q6, q8 /*q6 = v1 + v3*/ \n\t" + "vmla.i16 q0, q4, %q[c6] /*q0 += v2 * 6*/ \n\t" + "vmla.i16 q0, q6, %q[c4] /*q1 += (v1+v3) * 4*/ \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vst1.8 {d8}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x), + [in0] "r" (lane + 2*x-2), + [in1] "r" (lane + 2*x-1), + [in2] "r" (lane + 2*x+0), + [in3] "r" (lane + 2*x+1), + [in4] "r" (lane + 2*x+2), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); +#else + uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2); + uint16x8x2_t vLane1 
= vld2q_u16(lane + 2*x-1); + uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0); + uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1); + uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]); + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16); + uint8x8_t vRes = vrshrn_n_u16(vSum_0_4, 8); + + vst1_u8(dst + x, vRes); +#endif + } + break; + case 3: + { + uint16x4_t vx1 = vld1_u16(lane - 2*3); + uint16x4_t vx2 = vld1_u16(lane - 1*3); + uint16x4_t vx3 = vld1_u16(lane + 0*3); + uint16x8_t v0 = vcombine_u16(vx1, vx3); + + uint8x8_t map = vreinterpret_u8_u64(vmov_n_u64(0xFFFF060504020100ULL)); + for (; x < roiw8; x += 6) + { + internal::prefetch(lane + 2 * x + 12); + + uint16x4_t vx_ = vld1_u16(lane + 2*x-1*3 + 6); + uint16x4_t vx4 = vld1_u16(lane + 2*x+0*3 + 6); + uint16x4_t vx5 = vld1_u16(lane + 2*x+1*3 + 6); + uint16x4_t vx6 = vld1_u16(lane + 2*x+2*3 + 6); + + uint16x8_t v1 = vcombine_u16(vx2, vx_); + uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4); + uint16x8_t v3 = vcombine_u16(vx_, vx5); + uint16x8_t v4 = vcombine_u16(vx4, vx6); + vx2 = vx5; + + uint16x8_t v = vaddq_u16(v0, v4); + uint16x8_t v13 = vaddq_u16(v1, v3); + + v = vmlaq_u16(v, v2, vc6u16); + v = vmlaq_u16(v, v13, vc4u16); + + uint8x8_t v8 = vrshrn_n_u16(v, 8); + + v0 = v4; + + vst1_u8(dst + x, vtbl1_u8(v8, map)); + } + } + break; + case 4: + { + uint16x4_t vx1 = vld1_u16(lane - 2*4); + uint16x4_t vx2 = vld1_u16(lane - 1*4); + uint16x4_t vx3 = vld1_u16(lane + 0*4); + uint16x8_t v0 = vcombine_u16(vx1, vx3); + + for (; x < roiw8; x += 8) + { + internal::prefetch(lane + 2 * x + 16); + + uint16x4_t vx_ = vld1_u16(lane + 2 * x - 1*4 + 8); + uint16x4_t vx4 = vld1_u16(lane + 2 * x + 0*4 + 8); + uint16x4_t vx5 = vld1_u16(lane + 2 * x + 1*4 + 8); + uint16x4_t vx6 = vld1_u16(lane + 2 * x + 2*4 + 8); + + uint16x8_t v1 = vcombine_u16(vx2, vx_); + uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4); + uint16x8_t v3 = vcombine_u16(vx_, vx5); + uint16x8_t v4 = vcombine_u16(vx4, vx6); + vx2 = vx5; + + uint16x8_t v = vaddq_u16(v0, v4); + uint16x8_t v13 = vaddq_u16(v1, v3); + + v = vmlaq_u16(v, v2, vc6u16); + v = vmlaq_u16(v, v13, vc4u16); + + uint8x8_t v8 = vrshrn_n_u16(v, 8); + + v0 = v4; + + vst1_u8(dst + x, v8); + } + } + break; + } + + for (u32 h = 0; h < cn; ++h) + { + u16* ln = lane + h; + u8* dt = dst + h; + for (size_t k = x; k < dcolcn; k += cn) + dt[k] = u8((ln[2*k-2*cn] + ln[2*k+2*cn] + 4u * (ln[2*k-cn] + ln[2*k+cn]) + 6u * ln[2*k] + (1 << 7)) >> 8); + } + } +#else + // Remove 'unused parameter' warnings. 
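This overload differs from the RTZ variant in two ways: it handles interleaved 1-, 3- and 4-channel rows, and it rounds to nearest by adding half the divisor (the `(1 << 7)` term, or `vrshrn_n_u16` in the vector paths) before shifting. The scalar tail above only finishes the last few columns; written out for a whole row it states the intended arithmetic most plainly. A sketch assuming the same `lane` layout; the function name is illustrative:

```cpp
#include <cstddef>
#include <cstdint>

// Horizontal pass over the vertically pre-filtered row, all channels.
// `lane` must expose 2*cn mirrored entries before index 0 and another
// 2*cn past the row end, exactly as the border fix-up above arranges.
static void pyrDownHorizontalRef(const uint16_t *lane, uint8_t *dst,
                                 std::ptrdiff_t dstWidth, std::ptrdiff_t cn)
{
    for (std::ptrdiff_t p = 0; p < dstWidth; ++p)       // output pixel
        for (std::ptrdiff_t h = 0; h < cn; ++h)         // channel
        {
            const uint16_t *c = lane + 2 * p * cn + h;  // center tap
            dst[p * cn + h] = (uint8_t)((c[-2 * cn] + c[2 * cn]
                                       + 4u * (c[-cn] + c[cn])
                                       + 6u * c[0]
                                       + (1u << 7)) >> 8); // +128: nearest
        }
}
```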
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidDown(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownS16Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolcn = dstSize.width*cn; + size_t scolcn = srcSize.width*cn; + size_t roiw4 = dcolcn - 3; + + size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; + size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; + size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; + size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; + + //1-line buffer + std::vector _buf(cn*(srcSize.width + 4) + 32/sizeof(s32)); + s32* lane = internal::alignPtr(&_buf[2*cn], 32); + + int16x4_t vc6s16 = vmov_n_s16(6); + int32x4_t vc6s32 = vmovq_n_s32(6); + int32x4_t vc4s32 = vmovq_n_s32(4); + + for (size_t i = 0; i < dstSize.height; ++i) + { + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); + const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); + const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height)); + const s16* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); + const s16* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); + + size_t x = 0; + for (; x <= scolcn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); + int16x4_t v0 = vld1_s16(ln0 + x); + int16x4_t v1 = vld1_s16(ln1 + x); + int16x4_t v2 = vld1_s16(ln2 + x); + int16x4_t v3 = vld1_s16(ln3 + x); + int16x4_t v4 = vld1_s16(ln4 + x); + + int32x4_t v = vaddl_s16(v0, v4); + int32x4_t v13 = vaddl_s16(v1, v3); + + v = vmlal_s16(v, v2, vc6s16); + v = vmlaq_s32(v, v13, vc4s32); + + vst1q_s32(lane + x, v); + } + for (; x < scolcn; ++x) + { + lane[x] = ln0[x] + ln4[x] + 4 * (ln1[x] + ln3[x]) + 6 * ln2[x]; + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane[(s32)(-cn+k)] = lane[idx_l1 + k]; + lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; + + lane[scolcn+k] = lane[idx_r1 + k]; + lane[scolcn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x); +#if __GNUC_MINOR__ < 7 + __asm__ ( + "vld2.32 {d0-d3}, [%[in0]] \n\t" + "vld2.32 {d4-d7}, [%[in4]] \n\t" + "vld2.32 {d12-d15}, [%[in1]] \n\t" + "vld2.32 {d16-d19}, [%[in3]] \n\t" + "vld2.32 {d8-d11}, [%[in2],:256] \n\t" + "vadd.i32 q0, q2 \n\t" + "vadd.i32 q6, q8 \n\t" + "vmla.i32 q0, q4, %q[c6] \n\t" + "vmla.i32 q0, q6, %q[c4] \n\t" + "vrshrn.s32 d8, q0, #8 \n\t" + "vst1.16 {d8}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x), + [in0] "r" (lane + 2*x-2), + [in1] "r" (lane + 2*x-1), + [in2] "r" (lane + 2*x+0), + [in3] "r" (lane + 2*x+1), + [in4] "r" (lane + 2*x+2), + [c4] "w" (vc4s32), [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); +#else + int32x4x2_t vLane0 = vld2q_s32(lane + 2*x-2); + int32x4x2_t vLane1 = vld2q_s32(lane + 2*x-1); + int32x4x2_t vLane2 = vld2q_s32(lane + 2*x+0); + int32x4x2_t vLane3 = vld2q_s32(lane + 
2*x+1); + int32x4x2_t vLane4 = vld2q_s32(lane + 2*x+2); + + int32x4_t vSum_0_4 = vaddq_s32(vLane0.val[0], vLane4.val[0]); + int32x4_t vSum_1_3 = vaddq_s32(vLane1.val[0], vLane3.val[0]); + vSum_0_4 = vmlaq_s32(vSum_0_4, vLane2.val[0], vc6s32); + vSum_0_4 = vmlaq_s32(vSum_0_4, vSum_1_3, vc4s32); + int16x4_t vRes = vrshrn_n_s32(vSum_0_4, 8); + + vst1_s16(dst + x, vRes); +#endif + } + break; + case 3: + { + int32x4_t v0 = vld1q_s32(lane - 2*3); + int32x4_t v1 = vld1q_s32(lane - 1*3); + int32x4_t v2 = vld1q_s32(lane + 0*3); + for (; x < roiw4; x += 3) + { + internal::prefetch(lane + 2 * x); + + int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*3); + int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*3); + + int32x4_t v = vaddq_s32(v0, v4); + int32x4_t v13 = vaddq_s32(v1, v3); + + v = vmlaq_s32(v, v2, vc6s32); + v = vmlaq_s32(v, v13, vc4s32); + + int16x4_t vv = vrshrn_n_s32(v, 8); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1_s16(dst + x, vv); + } + } + break; + case 4: + { + int32x4_t v0 = vld1q_s32(lane - 2*4); + int32x4_t v1 = vld1q_s32(lane - 1*4); + int32x4_t v2 = vld1q_s32(lane + 0*4); + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x + 8); + int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*4); + int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*4); + + int32x4_t v = vaddq_s32(v0, v4); + int32x4_t v13 = vaddq_s32(v1, v3); + + v = vmlaq_s32(v, v2, vc6s32); + v = vmlaq_s32(v, v13, vc4s32); + + int16x4_t vv = vrshrn_n_s32(v, 8); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1_s16(dst + x, vv); + } + } + break; + } + + for (u32 h = 0; h < cn; ++h) + { + s32* ln = lane + h; + s16* dt = dst + h; + for (size_t k = x; k < dcolcn; k += cn) + dt[k] = s16((ln[2*k-2*cn] + ln[2*k+2*cn] + 4 * (ln[2*k-cn] + ln[2*k+cn]) + 6 * ln[2*k] + (1 << 7)) >> 8); + } + } +#else + // Remove 'unused parameter' warnings. 
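The s16 flavour follows the same two-pass plan but accumulates in 32-bit lanes (`vaddl_s16`, `vmlaq_s32`) where the u8 version stayed in 16-bit ones. The head-room arithmetic below shows why; it is a standalone sanity check, not library code:

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    const int64_t taps = 1 + 4 + 6 + 4 + 1;          // = 16 per separable pass

    // u8 input: even both passes combined fit in 16 bits, so the u8
    // overload can keep its intermediate sums in uint16x8_t vectors.
    assert(taps * 255 <= UINT16_MAX);                // 4080
    assert(taps * taps * 255 <= UINT16_MAX);         // 65280

    // s16 input: the vertical pass alone overflows 16 bits, hence the
    // s32 lane buffer and the widening vaddl_s16/vmlal_s16.
    assert(taps * 32767 > INT16_MAX);                // 524272
    assert(taps * taps * 32767 <= INT32_MAX);        // 8388352
    // After the rounding shift the result narrows back into s16 range:
    assert(((taps * taps * 32767 + (1 << 7)) >> 8) <= INT16_MAX);
    return 0;
}
```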
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidDown(const Size2D &srcSize, + const f32 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + f32 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownF32Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolcn = dstSize.width*cn; + size_t scolcn = srcSize.width*cn; + size_t roiw4 = dcolcn - 3; + + size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; + size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; + size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; + size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; + + //1-line buffer + std::vector _buf(cn*(srcSize.width + 4) + 32/sizeof(f32)); + f32* lane = internal::alignPtr(&_buf[2*cn], 32); + +#if __GNUC_MINOR__ < 7 + register float32x4_t vc6d4f32 asm ("q11") = vmovq_n_f32(1.5f); // 6/4 + register float32x4_t vc1d4f32 asm ("q12") = vmovq_n_f32(0.25f); // 1/4 + + register float32x4_t vc1d64f32 asm ("q13") = vmovq_n_f32(0.015625f); //1/4/16 + register float32x4_t vc4d64f32 asm ("q14") = vmovq_n_f32(0.0625f); //4/4/16 + register float32x4_t vc6d64f32 asm ("q15") = vmovq_n_f32(0.09375f); //6/4/16 +#else + register float32x4_t vc6d4f32 = vmovq_n_f32(1.5f); // 6/4 + register float32x4_t vc1d4f32 = vmovq_n_f32(0.25f); // 1/4 + + register float32x4_t vc1d64f32 = vmovq_n_f32(0.015625f); //1/4/16 + register float32x4_t vc4d64f32 = vmovq_n_f32(0.0625f); //4/4/16 + register float32x4_t vc6d64f32 = vmovq_n_f32(0.09375f); //6/4/16 +#endif + + for (size_t i = 0; i < dstSize.height; ++i) + { + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + const f32* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); + const f32* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); + const f32* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height)); + const f32* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); + const f32* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); + + size_t x = 0; + for (; x <= scolcn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); + float32x4_t v0 = vld1q_f32((const float32_t*)ln0 + x); + float32x4_t v1 = vld1q_f32((const float32_t*)ln1 + x); + float32x4_t v2 = vld1q_f32((const float32_t*)ln2 + x); + float32x4_t v3 = vld1q_f32((const float32_t*)ln3 + x); + float32x4_t v4 = vld1q_f32((const float32_t*)ln4 + x); + + float32x4_t v = vaddq_f32(v1, v3); + float32x4_t v04 = vaddq_f32(v0, v4); + + v = vmlaq_f32(v, v2, vc6d4f32); + v = vmlaq_f32(v, v04, vc1d4f32); + + vst1q_f32(lane + x, v); + } + for (; x < scolcn; ++x) + { + lane[x] = 0.25f*(ln0[x] + ln4[x]) + (ln1[x] + ln3[x]) + 1.5f * ln2[x]; + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane[(s32)(-cn+k)] = lane[idx_l1 + k]; + lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; + + lane[scolcn+k] = lane[idx_r1 + k]; + lane[scolcn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x); +#if __GNUC_MINOR__ < 7 + __asm__ __volatile__ ( + "vld2.32 {d0-d3}, [%[in0]] \n\t" + "vld2.32 {d8-d11}, [%[in4]] \n\t" + "vld2.32 {d14-d17}, [%[in2],:256] \n\t" + "vld2.32 
{d10-d13}, [%[in1]] \n\t" + "vld2.32 {d16-d19}, [%[in3]] \n\t" + "vmul.f32 q7, %q[c6d64] \n\t" + "vadd.f32 q0, q4 @v04 \n\t" + "vadd.f32 q5, q8 @v13 \n\t" + "vmla.f32 q7, q0, %q[c1d64] \n\t" + "vmla.f32 q7, q5, %q[c4d64] \n\t" + "vst1.32 {d14-d15}, [%[out]] \n\t" + : + : [out] "r" (dst + x), + [in0] "r" (lane + 2*x-2), + [in1] "r" (lane + 2*x-1), + [in2] "r" (lane + 2*x+0), + [in3] "r" (lane + 2*x+1), + [in4] "r" (lane + 2*x+2), + [c4d64] "w" (vc4d64f32), [c6d64] "w" (vc6d64f32), [c1d64] "w" (vc1d64f32) + : "d0","d1","d2","d3","d4",/*"d5","d6","d7",*/"d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" //ugly compiler "bug" - can't touch d5-d7 + ); +#else + float32x4x2_t vLane0 = vld2q_f32(lane + 2*x-2); + float32x4x2_t vLane1 = vld2q_f32(lane + 2*x-1); + float32x4x2_t vLane2 = vld2q_f32(lane + 2*x+0); + float32x4x2_t vLane3 = vld2q_f32(lane + 2*x+1); + float32x4x2_t vLane4 = vld2q_f32(lane + 2*x+2); + + float32x4_t vSum_0_4 = vaddq_f32(vLane0.val[0], vLane4.val[0]); + float32x4_t vSum_1_3 = vaddq_f32(vLane1.val[0], vLane3.val[0]); + float32x4_t vRes = vmulq_f32(vLane2.val[0], vc6d64f32); + vRes = vmlaq_f32(vRes, vSum_0_4, vc1d64f32); + vRes = vmlaq_f32(vRes, vSum_1_3, vc4d64f32); + + vst1q_f32(dst + x, vRes); +#endif + } + break; + case 3: + { + float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*3); + float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*3); + float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*3); + + for (; x < roiw4; x += 3) + { + internal::prefetch(lane + 2 * x); + + float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*3); + float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*3); + + float32x4_t v04 = vaddq_f32(v0, v4); + float32x4_t v13 = vaddq_f32(v1, v3); + + float32x4_t v = vmulq_f32(v2, vc6d64f32); + v = vmlaq_f32(v, v04, vc1d64f32); + v = vmlaq_f32(v, v13, vc4d64f32); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1q_f32(dst + x, v); + } + } + break; + case 4: + { + float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*4); + float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*4); + float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*4); + + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x + 8); + + float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*4); + float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*4); + + float32x4_t v04 = vaddq_f32(v0, v4); + float32x4_t v13 = vaddq_f32(v1, v3); + + float32x4_t v = vmulq_f32(v2, vc6d64f32); + v = vmlaq_f32(v, v04, vc1d64f32); + v = vmlaq_f32(v, v13, vc4d64f32); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1q_f32(dst + x, v); + } + } + break; + } + + for (u32 h = 0; h < cn; ++h) + { + f32* ln = lane + h; + f32* dt = dst + h; + for (size_t k = x; k < dcolcn; k += cn) + dt[k] = 0.015625f * (ln[2*k-2*cn] + ln[2*k+2*cn]) + 0.0625f * (ln[2*k-cn] + ln[2*k+cn]) + 0.09375f * ln[2*k]; + } + } +#else + // Remove 'unused parameter' warnings. 
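In the f32 flavour the normalization is folded straight into the tap weights: the vertical pass applies [1 4 6 4 1]/4, hence the 0.25, 1.0 and 1.5 coefficients, and the horizontal pass applies [1 4 6 4 1]/64, hence 0.015625, 0.0625 and 0.09375, so every output carries the full 1/256 factor of the 5x5 kernel without a separate divide. A hedged single-channel, interior-pixel reference with the stride in elements (illustrative name):

```cpp
#include <cstddef>

static float pyrDownF32Pixel(const float *src, std::ptrdiff_t stride,
                             std::ptrdiff_t x, std::ptrdiff_t y)
{
    float lane[5];   // vertical pass over the five columns this pixel needs
    for (int dx = -2; dx <= 2; ++dx)
    {
        const float *c = src + (2 * x + dx);
        lane[dx + 2] = 0.25f * (c[(2*y - 2) * stride] + c[(2*y + 2) * stride])
                     +         (c[(2*y - 1) * stride] + c[(2*y + 1) * stride])
                     + 1.5f  *  c[(2*y    ) * stride];
    }
    return 0.015625f * (lane[0] + lane[4])     // 1/64
         + 0.0625f   * (lane[1] + lane[3])     // 4/64
         + 0.09375f  *  lane[2];               // 6/64
}
```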
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidUp(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidUpU8Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolshn = (dstSize.width/2) * cn; + size_t dcolshw = ((dstSize.width+1)/2) * cn; + size_t scolsn = srcSize.width*cn; + + size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn; + size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn; + size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn; + + //2-lines buffer + std::vector _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(u16))); + u16* lane0 = internal::alignPtr(&_buf[cn], 32); + u16* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32); + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + + for (size_t i = 0; i < (dstSize.height + 1)/2; ++i) + { + u8* dst = internal::getRowPtr(dstBase, dstStride, 2*i); + //vertical convolution + const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2); + const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2); + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2); + + size_t x = 0; + for (; x <= scolsn - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v1 = vld1_u8(ln1+x); + + uint16x8_t vl0 = vaddl_u8(v0, v2); + uint16x8_t vl1 = vaddl_u8(v1, v2); + + vl0 = vmlal_u8(vl0, v1, vc6u8); + vl1 = vshlq_n_u16(vl1, 2); + + vst1q_u16(lane0 + x, vl0); + vst1q_u16(lane1 + x, vl1); + } + for (; x < scolsn; ++x) + { + lane0[x] = ln0[x] + ln2[x] + 6u * ln1[x]; + lane1[x] = 4u * (ln1[x] + ln2[x]); + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane0[(s32)(-cn+k)] = lane0[idx_l + k]; + lane1[(s32)(-cn+k)] = lane1[idx_l + k]; + + lane0[scolsn+k] = lane0[idx_r1 + k]; + lane0[scolsn+cn+k] = lane0[idx_r2 + k]; + lane1[scolsn+k] = lane1[idx_r1 + k]; + lane1[scolsn+cn+k] = lane1[idx_r2 + k]; + } + + //horizontal convolution + const u16* lane = lane0; +pyrUp8uHorizontalConvolution: + x = 0; + size_t lim; + switch(cn) + { + case 1: + lim = dcolshn > 7 ? 
dcolshn - 7 : 0; + for (; x < lim; x += 8) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.u16 d9, q3, #4 \n\t" + "vrshrn.u16 d8, q0, #6 \n\t" + "vst2.8 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x*2), + [in0] "r" (lane + x - 1), + [in1] "r" (lane + x + 0), + [in2] "r" (lane + x + 1), + [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + uint16x8_t vLane0 = vld1q_u16(lane + x - 1); + uint16x8_t vLane1 = vld1q_u16(lane + x + 0); + uint16x8_t vLane2 = vld1q_u16(lane + x + 1); + + vLane0 = vaddq_u16(vLane0, vLane2); + vLane2 = vaddq_u16(vLane2, vLane1); + vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16); + uint8x8x2_t vRes; + vRes.val[0] = vrshrn_n_u16(vLane0, 6); + vRes.val[1] = vrshrn_n_u16(vLane2, 4); + + vst2_u8(dst + x*2, vRes); +#endif + } + break; + case 3: + { + lim = dcolshn > 23 ? dcolshn - 23 : 0; + for (; x < lim; x += 24) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vmov.u16 q9, #6 \n\t" + "vld3.16 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t" + "vld3.16 {d1, d3, d5}, [%[in02]] \n\t" + "vld3.16 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t" + "vld3.16 {d7, d9, d11}, [%[in22]] \n\t" + "vld3.16 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t" + "vld3.16 {d13, d15, d17}, [%[in12]] \n\t" + "vadd.i16 q0, q3 /*v0 + v2*/ \n\t" + "vadd.i16 q1, q4 /*v0 + v2*/ \n\t" + "vadd.i16 q2, q5 /*v0 + v2*/ \n\t" + "vadd.i16 q3, q6 /*v1 + v2*/ \n\t" + "vadd.i16 q4, q7 /*v1 + v2*/ \n\t" + "vadd.i16 q5, q8 /*v1 + v2*/ \n\t" + "vmla.i16 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i16 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i16 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t" + "vrshrn.u16 d19, q3, #4 \n\t" + "vrshrn.u16 d21, q4, #4 \n\t" + "vrshrn.u16 d23, q5, #4 \n\t" + "vrshrn.u16 d18, q0, #6 \n\t" + "vrshrn.u16 d20, q1, #6 \n\t" + "vrshrn.u16 d22, q2, #6 \n\t" + "vzip.8 d18, d19 \n\t" + "vzip.8 d20, d21 \n\t" + "vzip.8 d22, d23 \n\t" + "vst3.8 {d18, d20, d22}, [%[out1]] \n\t" + "vst3.8 {d19, d21, d23}, [%[out2]] \n\t" + : /*no output*/ + : [out1] "r" (dst + 2 * x), + [out2] "r" (dst + 2 * x + 24), + [in0] "r" (lane + x - 3), + [in02] "r" (lane + x + 9), + [in1] "r" (lane + x), + [in12] "r" (lane + x + 12), + [in2] "r" (lane + x + 3), + [in22] "r" (lane + x + 15) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8_t vc6 = vmovq_n_u16(6); + uint16x8x3_t vLane0 = vld3q_u16(lane + x - 3); + uint16x8x3_t vLane1 = vld3q_u16(lane + x + 0); + uint16x8x3_t vLane2 = vld3q_u16(lane + x + 3); + + uint16x8_t vSum_0_3 = vaddq_u16(vLane0.val[0], vLane2.val[0]); + uint16x8_t vSum_1_4 = vaddq_u16(vLane0.val[1], vLane2.val[1]); + uint16x8_t vSum_2_5 = vaddq_u16(vLane0.val[2], vLane2.val[2]); + uint16x8_t vSum_3_6 = vaddq_u16(vLane2.val[0], vLane1.val[0]); + uint16x8_t vSum_4_7 = vaddq_u16(vLane2.val[1], vLane1.val[1]); + uint16x8_t vSum_5_8 = vaddq_u16(vLane2.val[2], vLane1.val[2]); + + vSum_0_3 = vmlaq_u16(vSum_0_3, vLane1.val[0], vc6); + vSum_1_4 = vmlaq_u16(vSum_1_4, vLane1.val[1], vc6); + vSum_2_5 = vmlaq_u16(vSum_2_5, vLane1.val[2], vc6); + + uint8x8x2_t vSumShr3; + vSumShr3.val[0] = vrshrn_n_u16(vSum_3_6, 4); + 
vSumShr3.val[1] = vrshrn_n_u16(vSum_0_3, 6);; + uint8x8x2_t vSumShr4; + vSumShr4.val[0] = vrshrn_n_u16(vSum_4_7, 4); + vSumShr4.val[1] = vrshrn_n_u16(vSum_1_4, 6); + uint8x8x2_t vSumShr5; + vSumShr5.val[0] = vrshrn_n_u16(vSum_5_8, 4); + vSumShr5.val[1] = vrshrn_n_u16(vSum_2_5, 6); + + vSumShr3 = vzip_u8(vSumShr3.val[1], vSumShr3.val[0]); + vSumShr4 = vzip_u8(vSumShr4.val[1], vSumShr4.val[0]); + vSumShr5 = vzip_u8(vSumShr5.val[1], vSumShr5.val[0]); + + uint8x8x3_t vRes1; + vRes1.val[0] = vSumShr3.val[0]; + vRes1.val[1] = vSumShr4.val[0]; + vRes1.val[2] = vSumShr5.val[0]; + vst3_u8(dst + 2 * x, vRes1); + + uint8x8x3_t vRes2; + vRes2.val[0] = vSumShr3.val[1]; + vRes2.val[1] = vSumShr4.val[1]; + vRes2.val[2] = vSumShr5.val[1]; + vst3_u8(dst + 2 * x + 24, vRes2); +#endif + } + } + break; + case 4: + lim = dcolshn > 7 ? dcolshn - 7 : 0; + for (; x < lim; x += 8) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.u16 d9, q3, #4 \n\t" + "vrshrn.u16 d8, q0, #6 \n\t" + "vst2.32 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x*2), + [in0] "r" (lane + x-4), + [in1] "r" (lane + x), + [in2] "r" (lane + x+4), + [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + uint16x8_t vLane0 = vld1q_u16(lane + x-4); + uint16x8_t vLane1 = vld1q_u16(lane + x+0); + uint16x8_t vLane2 = vld1q_u16(lane + x+4); + + vLane0 = vaddq_u16(vLane0, vLane2); + vLane2 = vaddq_u16(vLane2, vLane1); + vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16); + uint32x2x2_t vRes; + vRes.val[1] = vreinterpret_u32_u8(vrshrn_n_u16(vLane2, 4)); + vRes.val[0] = vreinterpret_u32_u8(vrshrn_n_u16(vLane0, 6)); + + vst2_u32((uint32_t*)(dst + x*2), vRes); +#endif + } + break; + }; + + for (u32 h = 0; h < cn; ++h) + { + const u16* ln = lane + h; + u8* dt = dst + h; + size_t k = x; + for (; k < dcolshn; k += cn) + { + dt[2*k+0] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6); + dt[2*k+cn] = u8(((ln[k] + ln[k+cn]) * 4u + (1 << 5)) >> 6); + } + for (; k < dcolshw; k += cn) + dt[2*k] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6); + } + dst = internal::getRowPtr(dstBase, dstStride, 2*i+1); + + //second row + if (lane == lane0 && 2*i+1 < dstSize.height) + { + lane = lane1; + goto pyrUp8uHorizontalConvolution; + } + } +#else + // Remove 'unused parameter' warnings. 
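`gaussianPyramidUp` runs the interpolation the other way: each source pixel fans out into a 2x2 block of outputs, with even-parity taps [1 6 1]/8 and odd-parity taps [4 4]/8 in each direction. Those are exactly the roles of the `lane0`/`lane1` buffer pair and of the even/odd columns the horizontal pass interleaves on store. A hedged single-channel sketch of one block, where `r0`, `r1`, `r2` are three consecutive source rows with borders already reflected (illustrative name):

```cpp
#include <cstddef>
#include <cstdint>

static void pyrUpBlock(const uint8_t *r0, const uint8_t *r1, const uint8_t *r2,
                       std::ptrdiff_t x, uint8_t out[2][2])
{
    // Vertical pass: lane0 feeds the even output row, lane1 the odd one.
    auto lane0 = [&](std::ptrdiff_t i) { return (unsigned)r0[i] + r2[i] + 6u * r1[i]; };
    auto lane1 = [&](std::ptrdiff_t i) { return 4u * ((unsigned)r1[i] + r2[i]); };

    // Horizontal pass: even column = [1 6 1], odd column = [4 4]; both
    // lanes already carry a factor of 8, so the total weight is 64 and
    // the +32 makes the >> 6 round to nearest.
    out[0][0] = (uint8_t)((lane0(x-1) + lane0(x+1) + 6u * lane0(x) + 32u) >> 6);
    out[0][1] = (uint8_t)((4u * (lane0(x) + lane0(x+1))            + 32u) >> 6);
    out[1][0] = (uint8_t)((lane1(x-1) + lane1(x+1) + 6u * lane1(x) + 32u) >> 6);
    out[1][1] = (uint8_t)((4u * (lane1(x) + lane1(x+1))            + 32u) >> 6);
}
```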
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidUp(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidUpS16Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolshn = (dstSize.width/2) * cn; + size_t dcolshw = ((dstSize.width+1)/2) * cn; + size_t scolsn = srcSize.width*cn; + + size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn; + size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn; + size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn; + + //2-lines buffer + std::vector _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(s32))); + s32* lane0 = internal::alignPtr(&_buf[cn], 32); + s32* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32); + + int16x4_t vc6s16 = vmov_n_s16(6); + int32x4_t vc6s32 = vmovq_n_s32(6); + + for (size_t i = 0; i < (dstSize.height + 1)/2; ++i) + { + s16* dst = internal::getRowPtr(dstBase, dstStride, 2*i); + //vertical convolution + const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2); + const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2); + const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2); + + size_t x = 0; + for (; x <= scolsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1)); + int16x4_t v0 = vld1_s16(ln0 + x); + int16x4_t v2 = vld1_s16(ln2 + x); + int16x4_t v1 = vld1_s16(ln1 + x); + + int32x4_t vl0 = vaddl_s16(v0, v2); + int32x4_t vl1 = vaddl_s16(v1, v2); + + vl0 = vmlal_s16(vl0, v1, vc6s16); + vl1 = vshlq_n_s32(vl1, 2); + + vst1q_s32(lane0 + x, vl0); + vst1q_s32(lane1 + x, vl1); + } + for (; x < scolsn; ++x) + { + lane0[x] = ln0[x] + ln2[x] + 6 * ln1[x]; + lane1[x] = 4 * (ln1[x] + ln2[x]); + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane0[(s32)(-cn+k)] = lane0[idx_l + k]; + lane1[(s32)(-cn+k)] = lane1[idx_l + k]; + + lane0[scolsn+k] = lane0[idx_r1 + k]; + lane0[scolsn+cn+k] = lane0[idx_r2 + k]; + lane1[scolsn+k] = lane1[idx_r1 + k]; + lane1[scolsn+cn+k] = lane1[idx_r2 + k]; + } + + //horizontal convolution + const s32* lane = lane0; +pyrUp16sHorizontalConvolution: + x = 0; + size_t lim; + switch(cn) + { + case 1: + lim = dcolshn > 3 ? 
dcolshn - 3 : 0; + for (; x < lim; x += 4) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i32 q0, q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.s32 d9, q3, #4 \n\t" + "vrshrn.s32 d8, q0, #6 \n\t" + "vst2.16 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x * 2), + [in0] "r" (lane + x - 1), + [in1] "r" (lane + x), + [in2] "r" (lane + x + 1), + [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + int32x4_t vLane0 = vld1q_s32(lane + x - 1); + int32x4_t vLane1 = vld1q_s32(lane + x); + int32x4_t vLane2 = vld1q_s32(lane + x + 1); + + vLane0 = vaddq_s32(vLane0, vLane2); + vLane2 = vaddq_s32(vLane2, vLane1); + vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32); + int16x4x2_t vRes; + vRes.val[0] = vrshrn_n_s32(vLane0, 6); + vRes.val[1] = vrshrn_n_s32(vLane2, 4); + + vst2_s16(dst + x * 2, vRes); +#endif + } + break; + case 3: + { + lim = dcolshn > 11 ? dcolshn - 11 : 0; + for (; x < lim; x += 12) + { + internal::prefetch(lane + x + 3); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vmov.s32 q9, #6 \n\t" + "vld3.32 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t" + "vld3.32 {d1, d3, d5}, [%[in2]] \n\t" + "vld3.32 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t" + "vld3.32 {d7, d9, d11}, [%[in22]] \n\t" + "vld3.32 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t" + "vld3.32 {d13, d15, d17}, [%[in12]] \n\t" + "vadd.i32 q0, q3 /*v0 + v2*/ \n\t" + "vadd.i32 q1, q4 /*v0 + v2*/ \n\t" + "vadd.i32 q2, q5 /*v0 + v2*/ \n\t" + "vadd.i32 q3, q6 /*v1 + v2*/ \n\t" + "vadd.i32 q4, q7 /*v1 + v2*/ \n\t" + "vadd.i32 q5, q8 /*v1 + v2*/ \n\t" + "vmla.i32 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i32 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i32 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t" + "vrshrn.s32 d19, q3, #4 \n\t" + "vrshrn.s32 d21, q4, #4 \n\t" + "vrshrn.s32 d23, q5, #4 \n\t" + "vrshrn.s32 d18, q0, #6 \n\t" + "vrshrn.s32 d20, q1, #6 \n\t" + "vrshrn.s32 d22, q2, #6 \n\t" + "vzip.16 d18, d19 \n\t" + "vzip.16 d20, d21 \n\t" + "vzip.16 d22, d23 \n\t" + "vst3.16 {d18, d20, d22}, [%[out1]] \n\t" + "vst3.16 {d19, d21, d23}, [%[out2]] \n\t" + : /*no output*/ + : [out1] "r" (dst + 2*x), + [out2] "r" (dst + 2*x + 12), + [in0] "r" (lane + x - 3), + [in1] "r" (lane + x), + [in12] "r" (lane + x + 6), + [in2] "r" (lane + x + 3), + [in22] "r" (lane + x + 9) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + int32x4_t vc6 = vmovq_n_s32(6); + int32x4x3_t vLane0 = vld3q_s32(lane + x - 3); + int32x4x3_t vLane1 = vld3q_s32(lane + x); + int32x4x3_t vLane2 = vld3q_s32(lane + x + 3); + + int32x4_t vSum_0_3 = vaddq_s32(vLane0.val[0], vLane2.val[0]); + int32x4_t vSum_1_4 = vaddq_s32(vLane0.val[1], vLane2.val[1]); + int32x4_t vSum_2_5 = vaddq_s32(vLane0.val[2], vLane2.val[2]); + int32x4_t vSum_3_6 = vaddq_s32(vLane2.val[0], vLane1.val[0]); + int32x4_t vSum_4_7 = vaddq_s32(vLane2.val[1], vLane1.val[1]); + int32x4_t vSum_5_8 = vaddq_s32(vLane2.val[2], vLane1.val[2]); + + vSum_0_3 = vmlaq_s32(vSum_0_3, vLane1.val[0], vc6); + vSum_1_4 = vmlaq_s32(vSum_1_4, vLane1.val[1], vc6); + vSum_2_5 = vmlaq_s32(vSum_2_5, vLane1.val[2], vc6); + + int16x4x2_t vSumShr1; + vSumShr1.val[1] = vrshrn_n_s32(vSum_3_6, 4); + vSumShr1.val[0] = vrshrn_n_s32(vSum_0_3, 6); + 
+ int16x4x2_t vSumShr2; + vSumShr2.val[1] = vrshrn_n_s32(vSum_4_7, 4); + vSumShr2.val[0] = vrshrn_n_s32(vSum_1_4, 6); + + int16x4x2_t vSumShr3; + vSumShr3.val[1] = vrshrn_n_s32(vSum_5_8, 4); + vSumShr3.val[0] = vrshrn_n_s32(vSum_2_5, 6); + + vSumShr1 = vzip_s16(vSumShr1.val[0], vSumShr1.val[1]); + vSumShr2 = vzip_s16(vSumShr2.val[0], vSumShr2.val[1]); + vSumShr3 = vzip_s16(vSumShr3.val[0], vSumShr3.val[1]); + + int16x4x3_t vRes1; + vRes1.val[0] = vSumShr1.val[0]; + vRes1.val[1] = vSumShr2.val[0]; + vRes1.val[2] = vSumShr3.val[0]; + vst3_s16((int16_t*)(dst + 2 * x), vRes1); + + int16x4x3_t vRes2; + vRes2.val[0] = vSumShr1.val[1]; + vRes2.val[1] = vSumShr2.val[1]; + vRes2.val[2] = vSumShr3.val[1]; + vst3_s16(dst + 2 * x + 12, vRes2); +#endif + } + } + break; + case 4: + lim = dcolshn > 3 ? dcolshn - 3 : 0; + for (; x < lim; x += 4) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i32 q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.s32 d9, q3, #4 \n\t" + "vrshrn.s32 d8, q0, #6 \n\t" + "vst1.16 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x * 2), + [in0] "r" (lane + x - 4), + [in1] "r" (lane + x), + [in2] "r" (lane + x + 4), + [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + int32x4_t vLane0 = vld1q_s32(lane + x - 4); + int32x4_t vLane1 = vld1q_s32(lane + x); + int32x4_t vLane2 = vld1q_s32(lane + x + 4); + + vLane0 = vaddq_s32(vLane0, vLane2); + vLane2 = vaddq_s32(vLane2, vLane1); + vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32); + int16x4x2_t vRes; + vRes.val[0] = vrshrn_n_s32(vLane0, 6); + vRes.val[1] = vrshrn_n_s32(vLane2, 4); + + vst1q_s16(dst + x * 2, vcombine_s16(vRes.val[0], vRes.val[1])); +#endif + } + break; + }; + + for (u32 h = 0; h < cn; ++h) + { + const s32* ln = lane + h; + s16* dt = dst + h; + size_t k = x; + for (; k < dcolshn; k += cn) + { + dt[2*k+0] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6); + dt[2*k+cn] = s16(((ln[k] + ln[k+cn]) * 4 + (1 << 5)) >> 6); + } + for (; k < dcolshw; k += cn) + dt[2*k] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6); + } + dst = internal::getRowPtr(dstBase, dstStride, 2*i+1); + + //second row + if (lane == lane0 && 2*i+1 < dstSize.height) + { + lane = lane1; + goto pyrUp16sHorizontalConvolution; + } + } +#else + // Remove 'unused parameter' warnings. + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/reduce.cpp b/3rdparty/carotene/src/reduce.cpp new file mode 100644 index 0000000000..8c11c39e80 --- /dev/null +++ b/3rdparty/carotene/src/reduce.cpp @@ -0,0 +1,460 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
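The column reductions implemented below share one blocking trick that is easy to miss: the u8 `reduceColSum` accumulates at most 256 rows into u16 lanes at a time (256 * 255 = 65280 just fits) before widening and folding into four s32 accumulators with saturating `vqaddq_s32`. A plain scalar reference with the same clamp-at-INT32_MAX intent (an illustration, not the exported entry point):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

static void reduceColSumRef(std::size_t width, std::size_t height,
                            const uint8_t *src, std::ptrdiff_t srcStride,
                            int32_t *dst)
{
    std::fill(dst, dst + width, 0);
    for (std::size_t y = 0; y < height; ++y)
        for (std::size_t x = 0; x < width; ++x)
        {
            // Saturate instead of wrapping, mirroring vqaddq_s32.
            int64_t s = (int64_t)dst[x] + src[y * srcStride + x];
            dst[x] = (int32_t)std::min<int64_t>(s, INT32_MAX);
        }
}
```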
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include <cstring> + +namespace CAROTENE_NS { + +void reduceColSum(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memset(dstBase, 0, size.width*sizeof(s32)); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const u8* src_address = srcBase + i; + + int32x4_t sll = vmovq_n_s32(0); + int32x4_t slh = vmovq_n_s32(0); + int32x4_t shl = vmovq_n_s32(0); + int32x4_t shh = vmovq_n_s32(0); + + for (size_t h = 0; h < size.height; h += 256) + { + size_t lim = std::min(h + 256, size.height); + + uint16x8_t sl = vmovq_n_u16(0); + uint16x8_t sh = vmovq_n_u16(0); + + for (size_t k = h; k < lim; ++k, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + + uint8x16_t v = vld1q_u8(src_address); + + sl = vaddw_u8(sl, vget_low_u8(v)); + sh = vaddw_u8(sh, vget_high_u8(v)); + } + + int32x4_t vsll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sl))); + int32x4_t vslh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sl))); + int32x4_t vshl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sh))); + int32x4_t vshh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sh))); + + sll = vqaddq_s32(sll, vsll); + slh = vqaddq_s32(slh, vslh); + shl = vqaddq_s32(shl, vshl); + shh = vqaddq_s32(shh, vshh); + } + + vst1q_s32(dstBase + i + 0, sll); + vst1q_s32(dstBase + i + 4, slh); + vst1q_s32(dstBase + i + 8, shl); + vst1q_s32(dstBase + i + 12, shh); + } + + for(size_t h = 0; h < size.height; ++h) + { + for(size_t j = i ; j < size.width; j++ ) + { + if (((u32)(dstBase[j] += srcBase[j + srcStride * h])) > 0x7fFFffFFu) + dstBase[j] = 0x7fFFffFF; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMax(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width); + size_t i = 0; + for (; i + 16*4 <=
size.width; i += 16*4) + { + const u8* src_address = srcBase + i; + + uint8x16_t s1 = vld1q_u8(src_address + 0); + uint8x16_t s2 = vld1q_u8(src_address + 16); + uint8x16_t s3 = vld1q_u8(src_address + 32); + uint8x16_t s4 = vld1q_u8(src_address + 48); + + src_address += srcStride; + + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + internal::prefetch(src_address + srcStride, 32); + + uint8x16_t v1 = vld1q_u8(src_address + 0); + uint8x16_t v2 = vld1q_u8(src_address + 16); + uint8x16_t v3 = vld1q_u8(src_address + 32); + uint8x16_t v4 = vld1q_u8(src_address + 48); + + s1 = vmaxq_u8(s1, v1); + s2 = vmaxq_u8(s2, v2); + s3 = vmaxq_u8(s3, v3); + s4 = vmaxq_u8(s4, v4); + } + + vst1q_u8(dstBase + i + 0, s1); + vst1q_u8(dstBase + i + 16, s2); + vst1q_u8(dstBase + i + 32, s3); + vst1q_u8(dstBase + i + 48, s4); + } + + for (; i + 16 <= size.width; i += 16) + { + const u8* src_address = srcBase + i; + uint8x16_t s1 = vld1q_u8(src_address); + src_address += srcStride; + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + + uint8x16_t v1 = vld1q_u8(src_address); + s1 = vmaxq_u8(s1, v1); + } + vst1q_u8(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::max(dstBase[j], srcBase[j + srcStride * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width); + size_t i = 0; + for (; i + 16*4 <= size.width; i += 16*4) + { + const u8* src_address = srcBase + i; + + uint8x16_t s1 = vld1q_u8(src_address + 0); + uint8x16_t s2 = vld1q_u8(src_address + 16); + uint8x16_t s3 = vld1q_u8(src_address + 32); + uint8x16_t s4 = vld1q_u8(src_address + 48); + + src_address += srcStride; + + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + internal::prefetch(src_address + srcStride, 32); + + uint8x16_t v1 = vld1q_u8(src_address + 0); + uint8x16_t v2 = vld1q_u8(src_address + 16); + uint8x16_t v3 = vld1q_u8(src_address + 32); + uint8x16_t v4 = vld1q_u8(src_address + 48); + + s1 = vminq_u8(s1, v1); + s2 = vminq_u8(s2, v2); + s3 = vminq_u8(s3, v3); + s4 = vminq_u8(s4, v4); + } + + vst1q_u8(dstBase + i + 0, s1); + vst1q_u8(dstBase + i + 16, s2); + vst1q_u8(dstBase + i + 32, s3); + vst1q_u8(dstBase + i + 48, s4); + } + + for (; i + 16 <= size.width; i += 16) + { + const u8* src_address = srcBase + i; + uint8x16_t s1 = vld1q_u8(src_address); + src_address += srcStride; + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + + uint8x16_t v1 = vld1q_u8(src_address); + s1 = vminq_u8(s1, v1); + } + vst1q_u8(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::min(dstBase[j], srcBase[j + srcStride * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColSum(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width*sizeof(f32)); + size_t srcstep = 
srcStride/sizeof(f32); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const f32* src_address = srcBase + i; + + float32x4_t s1 = vld1q_f32(src_address + 0); + float32x4_t s2 = vld1q_f32(src_address + 4); + float32x4_t s3 = vld1q_f32(src_address + 8); + float32x4_t s4 = vld1q_f32(src_address + 12); + + src_address += srcstep; + + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + internal::prefetch(src_address + srcstep, 32); + + float32x4_t v1 = vld1q_f32(src_address + 0); + float32x4_t v2 = vld1q_f32(src_address + 4); + float32x4_t v3 = vld1q_f32(src_address + 8); + float32x4_t v4 = vld1q_f32(src_address + 12); + + s1 = vaddq_f32(s1, v1); + s2 = vaddq_f32(s2, v2); + s3 = vaddq_f32(s3, v3); + s4 = vaddq_f32(s4, v4); + } + + vst1q_f32(dstBase + i + 0, s1); + vst1q_f32(dstBase + i + 4, s2); + vst1q_f32(dstBase + i + 8, s3); + vst1q_f32(dstBase + i + 12, s4); + } + + for (; i + 4 <= size.width; i += 4) + { + const f32* src_address = srcBase + i; + float32x4_t s1 = vld1q_f32(src_address); + src_address += srcstep; + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + + float32x4_t v1 = vld1q_f32(src_address); + s1 = vaddq_f32(s1, v1); + } + vst1q_f32(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + { + for(size_t j = i ; j < size.width; j++ ) + { + dstBase[j] += srcBase[j + srcstep * h]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMax(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width*sizeof(f32)); + size_t srcstep = srcStride/sizeof(f32); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const f32* src_address = srcBase + i; + + float32x4_t s1 = vld1q_f32(src_address + 0); + float32x4_t s2 = vld1q_f32(src_address + 4); + float32x4_t s3 = vld1q_f32(src_address + 8); + float32x4_t s4 = vld1q_f32(src_address + 12); + + src_address += srcstep; + + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + internal::prefetch(src_address + srcstep, 32); + + float32x4_t v1 = vld1q_f32(src_address + 0); + float32x4_t v2 = vld1q_f32(src_address + 4); + float32x4_t v3 = vld1q_f32(src_address + 8); + float32x4_t v4 = vld1q_f32(src_address + 12); + + s1 = vmaxq_f32(s1, v1); + s2 = vmaxq_f32(s2, v2); + s3 = vmaxq_f32(s3, v3); + s4 = vmaxq_f32(s4, v4); + } + + vst1q_f32(dstBase + i + 0, s1); + vst1q_f32(dstBase + i + 4, s2); + vst1q_f32(dstBase + i + 8, s3); + vst1q_f32(dstBase + i + 12, s4); + } + + for (; i + 4 <= size.width; i += 4) + { + const f32* src_address = srcBase + i; + float32x4_t s1 = vld1q_f32(src_address); + src_address += srcstep; + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + + float32x4_t v1 = vld1q_f32(src_address); + s1 = vmaxq_f32(s1, v1); + } + vst1q_f32(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::max(dstBase[j], srcBase[j + srcstep * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMin(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase) +{ + 
internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width*sizeof(f32)); + size_t srcstep = srcStride/sizeof(f32); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const f32* src_address = srcBase + i; + + float32x4_t s1 = vld1q_f32(src_address + 0); + float32x4_t s2 = vld1q_f32(src_address + 4); + float32x4_t s3 = vld1q_f32(src_address + 8); + float32x4_t s4 = vld1q_f32(src_address + 12); + + src_address += srcstep; + + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + internal::prefetch(src_address + srcstep, 32); + + float32x4_t v1 = vld1q_f32(src_address + 0); + float32x4_t v2 = vld1q_f32(src_address + 4); + float32x4_t v3 = vld1q_f32(src_address + 8); + float32x4_t v4 = vld1q_f32(src_address + 12); + + s1 = vminq_f32(s1, v1); + s2 = vminq_f32(s2, v2); + s3 = vminq_f32(s3, v3); + s4 = vminq_f32(s4, v4); + } + + vst1q_f32(dstBase + i + 0, s1); + vst1q_f32(dstBase + i + 4, s2); + vst1q_f32(dstBase + i + 8, s3); + vst1q_f32(dstBase + i + 12, s4); + } + + for (; i + 4 <= size.width; i += 4) + { + const f32* src_address = srcBase + i; + float32x4_t s1 = vld1q_f32(src_address); + src_address += srcstep; + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + + float32x4_t v1 = vld1q_f32(src_address); + s1 = vminq_f32(s1, v1); + } + vst1q_f32(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::min(dstBase[j], srcBase[j + srcstep * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/remap.cpp b/3rdparty/carotene/src/remap.cpp new file mode 100644 index 0000000000..a4b99c3db0 --- /dev/null +++ b/3rdparty/carotene/src/remap.cpp @@ -0,0 +1,694 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "remap.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace internal { + +void remapNearestNeighborReplicate(const Size2D size, + const u8 * srcBase, + const s32 * map, + u8 * dstBase, ptrdiff_t dstStride) +{ + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + for (size_t x = 0; x < size.width; ++x) + { + dst_row[x] = srcBase[map_row[x]]; + } + } +} + +void remapNearestNeighborConst(const Size2D size, + const u8 * srcBase, + const s32 * map, + u8 * dstBase, ptrdiff_t dstStride, + u8 borderValue) +{ + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + for (size_t x = 0; x < size.width; ++x) + { + s32 src_idx = map_row[x]; + dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue; + } + } +} + +void remapLinearReplicate(const Size2D size, + const u8 * srcBase, + const s32 * map, + const f32 * coeffs, + u8 * dstBase, ptrdiff_t dstStride) +{ + int16x8_t v_zero16 = vdupq_n_s16(0); + + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y); + const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y); + + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + size_t x = 0; + for ( ; x + 8 < size.width; x += 8) + { + int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7); + + int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7); + + int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 
14]], v_src10, 3); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7); + + int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7); + + // first part + float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + + float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1)); + float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01), + vget_low_s16(v_src00))), v_coeff.val[0]); + float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11), + vget_low_s16(v_src10))), v_coeff.val[0]); + + float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // second part + v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + + v_coeff = vld2q_f32(coeff_row + (x << 1) + 8); + v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01), + vget_high_s16(v_src00))), v_coeff.val[0]); + v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11), + vget_high_s16(v_src10))), v_coeff.val[0]); + + v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // store + vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1))); + } + + for ( ; x < size.width; ++x) + { + s32 src00_index = map_row[(x << 2)]; + s32 src10_index = map_row[(x << 2) + 2]; + f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] + + srcBase[src00_index]; + f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] + + srcBase[src10_index]; + dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); + } + } +} + +void remapLinearConst(const Size2D size, + const u8 * srcBase, + const s32 * map, + const f32 * coeffs, + u8 * dstBase, ptrdiff_t dstStride, + u8 borderValue) +{ + int16x8_t v_zero16 = vdupq_n_s16(0); + + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y); + const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y); + + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + size_t x = 0; + for ( ; x + 8 < size.width; x += 8) + { + int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? 
srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7); + + int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7); + + int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7); + + int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? 
srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7); + + // first part + float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + + float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1)); + float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01), + vget_low_s16(v_src00))), v_coeff.val[0]); + float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11), + vget_low_s16(v_src10))), v_coeff.val[0]); + + float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // second part + v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + + v_coeff = vld2q_f32(coeff_row + (x << 1) + 8); + v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01), + vget_high_s16(v_src00))), v_coeff.val[0]); + v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11), + vget_high_s16(v_src10))), v_coeff.val[0]); + + v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // store + vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1))); + } + + for ( ; x < size.width; ++x) + { + s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue; + s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue; + s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue; + s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue; + + f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00; + f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10; + dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); + } + } +} + +} // namespace internal + +#endif // CAROTENE_NEON + +bool isRemapNearestNeighborSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +bool isRemapLinearSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + s32 * map = alignPtr(_map, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + int32x2_t v_step2 = vdup_n_s32(srcStride); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + 
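// BORDER_MODE_REPLICATE: every map entry is a valid source offset, because both
+ // coordinates are clamped to [0, width - 1] x [0, height - 1] below before the
+ // offset src_y * srcStride + src_x is formed.
+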
int32x2_t v_zero2 = vdup_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0; + for ( ; x + 8 <= blockWidth; x += 8) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)), + v_table1 = vld2q_f32(table_row + (x << 1) + 8); + + int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0]))); + int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1]))); + int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4); + vst1q_s32(map_row + x, v_dst_index); + + v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0]))); + v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1]))); + v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4); + vst1q_s32(map_row + x + 4, v_dst_index); + } + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0]))); + int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1]))); + int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4); + vst1q_s32(map_row + x, v_dst_index); + } + + for ( ; x + 2 <= blockWidth; x += 2) + { + float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1)); + + int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0]))); + int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1]))); + int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2); + vst1_s32(map_row + x, v_dst_index); + } + + for ( ; x < blockWidth; ++x) + { + s32 src_x = std::max(0, std::min(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0]))); + s32 src_y = std::max(0, std::min(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1]))); + map_row[x] = src_y * srcStride + src_x; + } + } + + // make remap + remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + int32x4_t v_m1_4 = vdupq_n_s32(-1); + int32x2_t v_m1_2 = vdup_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + float32x2_t v_zero2 = vdup_n_f32(0.0f); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0; + for ( ; x + 8 <= blockWidth; x += 8) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)), + v_table1 = vld2q_f32(table_row + (x << 1) + 8); + + int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]); + int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + 
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_dst_index); + + v_dst_x = vcvtq_s32_f32(v_table1.val[0]); + v_dst_y = vcvtq_s32_f32(v_table1.val[1]); + v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x + 4, v_dst_index); + } + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]); + int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_dst_index); + } + + for ( ; x + 2 <= blockWidth; x += 2) + { + float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1)); + + int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]); + int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]); + uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)), + vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2))); + int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2); + vst1_s32(map_row + x, v_dst_index); + } + + for ( ; x < blockWidth; ++x) + { + s32 src_x = (s32)floorf(table_row[(x << 1) + 0]); + s32 src_y = (s32)floorf(table_row[(x << 1) + 1]); + map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) && + (src_y >= 0) && (src_y < (s32)ssize.height) ? 
src_y * srcStride + src_x : -1; + } + } + + // make remap + remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } + +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)tableBase; + (void)tableStride; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +void remapLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isRemapLinearSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + + s32 * map = alignPtr(_map, 16); + f32 * coeffs = alignPtr(_coeffs, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0; + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]); + int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x)); + v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x); + v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y); + + int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x)); + int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y)); + int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x))); + int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y))); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + } + + for ( ; x < blockWidth; ++x) + { + f32 src_x_f = table_row[(x << 1) + 0]; + f32 src_y_f = table_row[(x << 1) + 1]; + + s32 src0_x = (s32)floorf(src_x_f); + s32 src0_y = (s32)floorf(src_y_f); + + 
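// The fractional parts of the source coordinate become the bilinear weights,
+ // e.g. src_x_f = 3.7 gives src0_x = 3 and an x-weight of 0.7.
+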
coeff_row[x << 1] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + s32 src1_y = std::max(0, std::min(ssize.height - 1, src0_y + 1)); + src0_y = std::max(0, std::min(ssize.height - 1, src0_y)); + s32 src1_x = std::max(0, std::min(ssize.width - 1, src0_x + 1)); + src0_x = std::max(0, std::min(ssize.width - 1, src0_x)); + + map_row[(x << 2) + 0] = src0_y * srcStride + src0_x; + map_row[(x << 2) + 1] = src0_y * srcStride + src1_x; + map_row[(x << 2) + 2] = src1_y * srcStride + src0_x; + map_row[(x << 2) + 3] = src1_y * srcStride + src1_x; + } + } + + remapLinearReplicate(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0; + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + } + + for ( ; 
x < blockWidth; ++x) + { + f32 src_x_f = table_row[(x << 1) + 0]; + f32 src_y_f = table_row[(x << 1) + 1]; + + s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1; + s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1; + + coeff_row[(x << 1)] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1; + map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1; + map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1; + map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1; + } + } + + remapLinearConst(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)tableBase; + (void)tableStride; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/remap.hpp b/3rdparty/carotene/src/remap.hpp new file mode 100644 index 0000000000..0f9765965f --- /dev/null +++ b/3rdparty/carotene/src/remap.hpp @@ -0,0 +1,85 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#ifndef CAROTENE_SRC_REMAP_HPP +#define CAROTENE_SRC_REMAP_HPP + +#include "common.hpp" + +#include + +#ifdef CAROTENE_NEON + +namespace CAROTENE_NS { namespace internal { + +enum +{ + BLOCK_SIZE = 32 +}; + + +void remapNearestNeighborReplicate(const Size2D size, + const u8 * srcBase, + const s32 * map, + u8 * dstBase, ptrdiff_t dstStride); + +void remapNearestNeighborConst(const Size2D size, + const u8 * srcBase, + const s32 * map, + u8 * dstBase, ptrdiff_t dstStride, + u8 borderValue); + +void remapLinearReplicate(const Size2D size, + const u8 * srcBase, + const s32 * map, + const f32 * coeffs, + u8 * dstBase, ptrdiff_t dstStride); + +void remapLinearConst(const Size2D size, + const u8 * srcBase, + const s32 * map, + const f32 * coeffs, + u8 * dstBase, ptrdiff_t dstStride, + u8 borderValue); + +} } + +#endif // CAROTENE_NEON + +#endif // CAROTENE_SRC_REMAP_HPP diff --git a/3rdparty/carotene/src/resize.cpp b/3rdparty/carotene/src/resize.cpp new file mode 100644 index 0000000000..3a80d472df --- /dev/null +++ b/3rdparty/carotene/src/resize.cpp @@ -0,0 +1,2191 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include
+#include
+#include
+
+namespace CAROTENE_NS {
+
+bool isResizeNearestNeighborSupported(const Size2D &ssize, u32 elemSize)
+{
+#if SIZE_MAX <= UINT32_MAX
+ (void)ssize;
+#endif
+ bool supportedElemSize = (elemSize == 1) || (elemSize == 3) || (elemSize == 4);
+ return isSupportedConfiguration()
+#if SIZE_MAX > UINT32_MAX
+ && !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) // Restrict image size since internally used resizeGeneric performs
+ // index evaluation with u32
+#endif
+ && supportedElemSize;
+}
+
+bool isResizeAreaSupported(f32 wr, f32 hr, u32 channels)
+{
+ bool supportedRatio = false;
+
+ if (channels == 1)
+ supportedRatio = (hr == wr) && ((wr == 2.0f) || (wr == 4.0f) || (wr == 0.5f));
+ else if (channels == 3)
+ supportedRatio = (hr == wr) && ((wr == 2.0f) || (wr == 4.0f) || (wr == 0.5f));
+ else if (channels == 4)
+ supportedRatio = (hr == wr) && ((wr == 2.0f) || (wr == 4.0f) || (wr == 0.5f));
+
+ return isSupportedConfiguration() && supportedRatio;
+}
+
+bool isResizeLinearSupported(const Size2D &ssize, const Size2D &dsize,
+ f32 wr, f32 hr, u32 channels)
+{
+ if ((wr <= 2.0f) && (hr <= 2.0f))
+ {
+ bool channelsSupport = (channels == 1) || (channels == 3) || (channels == 4);
+ return (ssize.width >= 16) && (dsize.height >= 8) &&
+ (dsize.width >= 8) && channelsSupport;
+ }
+
+ return false;
+}
+
+bool isResizeLinearOpenCVSupported(const Size2D &ssize, const Size2D &dsize, u32 channels)
+{
+ switch(channels)
+ {
+ case 1:
+ if (ssize.width >= 8
+#if SIZE_MAX > UINT32_MAX
+ && !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) // Restrict image size since internal index evaluation
+ // is performed with u32
+#endif
+ && dsize.width >= 8 && dsize.height >= 8)
+ return isSupportedConfiguration();
+ return false;
+ case 4:
+ if (ssize.width >= 2
+#if SIZE_MAX > UINT32_MAX
+ && !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) // Restrict image size since internal index evaluation
+ // is performed with u32
+#endif
+ && dsize.width >= 2 && dsize.height >= 8)
+ return isSupportedConfiguration();
+ return false;
+ default:
+ return false;
+ }
+}
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+u32 * calcLUT(size_t size, f32 ratio,
+ std::vector<u32> & _ofs)
+{
+ _ofs.resize(size);
+ u32 * ofs = &_ofs[0];
+
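+ // Nearest-neighbour column LUT: ofs[x] = floorf((x + 0.5f) * ratio), i.e. the source
+ // column whose centre the x-th destination column samples; it is filled 8 and 4
+ // entries at a time below, with a scalar tail.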
+ size_t roiw8 = size >= 7 ? size - 7 : 0;
+ size_t roiw4 = size >= 3 ? size - 3 : 0;
+ size_t x = 0;
+
+ f32 indices[4] = { 0, 1, 2, 3 };
+ float32x4_t v_index = vld1q_f32(indices), v_inc = vdupq_n_f32(4);
+ float32x4_t v_05 = vdupq_n_f32(0.5f), v_ratio = vdupq_n_f32(ratio);
+
+ for ( ; x < roiw8; x += 8)
+ {
+ float32x4_t v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
+ vst1q_u32(ofs + x, vcvtq_u32_f32(v_dstf));
+ v_index = vaddq_f32(v_index, v_inc);
+
+ v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
+ vst1q_u32(ofs + x + 4, vcvtq_u32_f32(v_dstf));
+ v_index = vaddq_f32(v_index, v_inc);
+ }
+
+ for ( ; x < roiw4; x += 4)
+ {
+ float32x4_t v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
+ vst1q_u32(ofs + x, vcvtq_u32_f32(v_dstf));
+ v_index = vaddq_f32(v_index, v_inc);
+ }
+
+ for ( ; x < size; ++x)
+ {
+ ofs[x] = static_cast<u32>(floorf((x + 0.5f) * ratio));
+ }
+
+ return ofs;
+}
+
+template <typename T>
+void resizeGeneric(const Size2D &dsize,
+ const void * srcBase, ptrdiff_t srcStride,
+ void * dstBase, ptrdiff_t dstStride,
+ f32 wr, f32 hr)
+{
+ std::vector<u32> _x_ofs;
+ u32 * x_ofs = calcLUT(dsize.width, wr, _x_ofs); // 32-bit LUT is used, hence the restriction to source dimensions below 2^32 in isResizeNearestNeighborSupported
+
+ for (size_t dst_y = 0; dst_y < dsize.height; ++dst_y)
+ {
+ size_t src_y = static_cast<size_t>(floorf((dst_y + 0.5f) * hr));
+ const T * src = internal::getRowPtr(static_cast<const T *>(srcBase), srcStride, src_y);
+ T * dst = internal::getRowPtr(static_cast<T *>(dstBase), dstStride, dst_y);
+
+ for (size_t dst_x = 0; dst_x < dsize.width; ++dst_x)
+ {
+ internal::prefetch(src + dst_x);
+ dst[dst_x] = src[x_ofs[dst_x]];
+ }
+ }
+}
+
+typedef struct _24bit_
+{
+ u8 a[3];
+} _24bit;
+
+} // namespace
+
+
+#endif
+
+void resizeNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
+ const void * srcBase, ptrdiff_t srcStride,
+ void * dstBase, ptrdiff_t dstStride,
+ f32 wr, f32 hr, u32 elemSize)
+{
+ internal::assertSupportedConfiguration(wr > 0 && hr > 0 &&
+ (dsize.width - 0.5) * wr < ssize.width &&
+ (dsize.height - 0.5) * hr < ssize.height && // Ensure we have enough source data
+ (dsize.width + 0.5) * wr >= ssize.width &&
+ (dsize.height + 0.5) * hr >= ssize.height && // Ensure source isn't too big
+ isResizeNearestNeighborSupported(ssize, elemSize));
+#ifdef CAROTENE_NEON
+
+ if (elemSize == 1)
+ {
+ resizeGeneric<u8>(dsize,
+ srcBase, srcStride,
+ dstBase, dstStride,
+ wr, hr);
+ }
+ else if (elemSize == 3)
+ {
+ resizeGeneric<_24bit>(dsize,
+ srcBase, srcStride,
+ dstBase, dstStride,
+ wr, hr);
+ }
+ else if (elemSize == 4)
+ {
+ resizeGeneric<u32>(dsize,
+ srcBase, srcStride,
+ dstBase, dstStride,
+ wr, hr);
+ }
+
+#else
+ (void)dsize;
+ (void)srcBase;
+ (void)srcStride;
+ (void)dstBase;
+ (void)dstStride;
+ (void)wr;
+ (void)hr;
+#endif
+}
+
+#ifdef CAROTENE_NEON
+template
+inline uint8x8_t areaDownsamplingDivision(uint16x8_t data)
+{
+ return vshrn_n_u16(data, shiftsize);
+}
+template <>
+inline uint8x8_t areaDownsamplingDivision(uint16x8_t data)
+{
+ // rounding
+ return vrshrn_n_u16(data,2);
+}
+template <>
+inline uint8x8_t areaDownsamplingDivision(uint16x8_t data)
+{
+ // bankers rounding
+ return vrshrn_n_u16(vqsubq_u16(data, vshrq_n_u16(vbicq_u16(vdupq_n_u16(1<<4), data), 4)),4);
+}
+
+template
+inline u8 areaDownsamplingDivision(u16 data)
+{
+ return data >> shiftsize;
+}
+template <>
+inline u8 areaDownsamplingDivision(u16 data)
+{
+ // rounding
+ return (data + 2) >> 2;
+}
+template <>
+inline u8 areaDownsamplingDivision(u16 data)
+{
+ // bankers rounding
+ return (data - (((1<<4) & ~data) >> 4) + 8) >> 4;
+}
+#endif
+
+template
+inline void
resizeAreaRounding(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + internal::assertSupportedConfiguration(isResizeAreaSupported(wr, hr, channels) && + std::abs(dsize.width * wr - ssize.width) < 0.1 && + std::abs(dsize.height * hr - ssize.height) < 0.1); +#ifdef CAROTENE_NEON + if (channels == 1) + { + if ((wr == 2.0f) && (hr == 2.0f)) + { + size_t roiw8 = dsize.width >= 7 ? dsize.width - 7 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + + for ( ; dj < roiw8; dj += 8, sj += 16) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint16x8_t vSum1 = vpaddlq_u8(vld1q_u8(src0_row + sj)); + uint16x8_t vSum2 = vpaddlq_u8(vld1q_u8(src1_row + sj)); + uint8x8_t vRes1 = areaDownsamplingDivision(vaddq_u16(vSum1, vSum2)); + + vst1_u8(dst_row + dj, vRes1); + } + + for ( ; dj < dsize.width; ++dj, sj += 2) + { + dst_row[dj] = areaDownsamplingDivision( + (u16)src0_row[sj] + src0_row[sj + 1] + + src1_row[sj] + src1_row[sj + 1]); + } + } + } + else if ((wr == 0.5f) && (hr == 0.5f)) + { + size_t roiw32 = dsize.width >= 31 ? dsize.width - 31 : 0; + size_t roiw16 = dsize.width >= 15 ? dsize.width - 15 : 0; + + for (size_t i = 0; i < dsize.height; i += 2) + { + const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1); + u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1)); + size_t sj = 0, dj = 0; + + for ( ; dj < roiw32; dj += 32, sj += 16) + { + internal::prefetch(src_row + sj); + + uint8x16x2_t v_dst; + v_dst.val[0] = v_dst.val[1] = vld1q_u8(src_row + sj); + + vst2q_u8(dst0_row + dj, v_dst); + vst2q_u8(dst1_row + dj, v_dst); + } + + for ( ; dj < roiw16; dj += 16, sj += 8) + { + uint8x8x2_t v_dst; + v_dst.val[0] = v_dst.val[1] = vld1_u8(src_row + sj); + + vst2_u8(dst0_row + dj, v_dst); + vst2_u8(dst1_row + dj, v_dst); + } + + for ( ; dj < dsize.width; dj += 2, ++sj) + { + u8 src_val = src_row[sj]; + dst0_row[dj] = dst0_row[dj + 1] = src_val; + dst1_row[dj] = dst1_row[dj + 1] = src_val; + } + } + } + else //if ((wr == 4.0f) && (hr == 4.0f)) //the only scale that lasts after isSupported check + { +#ifndef ANDROID + size_t roiw16 = dsize.width >= 15 ? dsize.width - 15 : 0; +#endif + size_t roiw8 = dsize.width >= 7 ? 
dsize.width - 7 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1); + const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2); + const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw16; dj += 16, sj += 64) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + internal::prefetch(src2_row + sj); + internal::prefetch(src3_row + sj); + + uint8x16x4_t vLane1 = vld4q_u8(src0_row + sj); + uint8x16x4_t vLane2 = vld4q_u8(src1_row + sj); + uint8x16x4_t vLane3 = vld4q_u8(src2_row + sj); + uint8x16x4_t vLane4 = vld4q_u8(src3_row + sj); + + uint16x8_t vSum_0 = vaddl_u8(vget_low_u8(vLane1.val[0]), vget_low_u8(vLane1.val[1])); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane1.val[2]), vget_low_u8(vLane1.val[3]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane2.val[0]), vget_low_u8(vLane2.val[1]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane2.val[2]), vget_low_u8(vLane2.val[3]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane3.val[0]), vget_low_u8(vLane3.val[1]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane3.val[2]), vget_low_u8(vLane3.val[3]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane4.val[0]), vget_low_u8(vLane4.val[1]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane4.val[2]), vget_low_u8(vLane4.val[3]))); + + uint16x8_t vSum_1 = vaddl_u8(vget_high_u8(vLane1.val[0]), vget_high_u8(vLane1.val[1])); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane1.val[2]), vget_high_u8(vLane1.val[3]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane2.val[0]), vget_high_u8(vLane2.val[1]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane2.val[2]), vget_high_u8(vLane2.val[3]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane3.val[0]), vget_high_u8(vLane3.val[1]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane3.val[2]), vget_high_u8(vLane3.val[3]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane4.val[0]), vget_high_u8(vLane4.val[1]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane4.val[2]), vget_high_u8(vLane4.val[3]))); + + uint8x8_t vRes_0 = areaDownsamplingDivision(vSum_0); + uint8x8_t vRes_1 = areaDownsamplingDivision(vSum_1); + + vst1q_u8(dst_row + dj, vcombine_u8(vRes_0, vRes_1)); + } +#endif + + for ( ; dj < roiw8; dj += 8, sj += 32) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + internal::prefetch(src2_row + sj); + internal::prefetch(src3_row + sj); + + uint8x8x4_t vLane1 = vld4_u8(src0_row + sj); + uint8x8x4_t vLane2 = vld4_u8(src1_row + sj); + uint8x8x4_t vLane3 = vld4_u8(src2_row + sj); + uint8x8x4_t vLane4 = vld4_u8(src3_row + sj); + + uint16x8_t vSum = vaddl_u8(vLane1.val[0], vLane1.val[1]); + vSum = vaddq_u16(vSum, vaddl_u8(vLane1.val[2], vLane1.val[3])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane2.val[0], vLane2.val[1])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane2.val[2], vLane2.val[3])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane3.val[0], vLane3.val[1])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane3.val[2], vLane3.val[3])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane4.val[0], vLane4.val[1])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane4.val[2], vLane4.val[3])); + + vst1_u8(dst_row + dj, 
areaDownsamplingDivision(vSum)); + } + + for ( ; dj < dsize.width; ++dj, sj += 4) + { + dst_row[dj] = areaDownsamplingDivision( + (u16)src0_row[sj] + src0_row[sj + 1] + src0_row[sj + 2] + src0_row[sj + 3] + + src1_row[sj] + src1_row[sj + 1] + src1_row[sj + 2] + src1_row[sj + 3] + + src2_row[sj] + src2_row[sj + 1] + src2_row[sj + 2] + src2_row[sj + 3] + + src3_row[sj] + src3_row[sj + 1] + src3_row[sj + 2] + src3_row[sj + 3]); + } + } + } + } + else if (channels == 4) + { + if ((wr == 2.0f) && (hr == 2.0f)) + { +#ifndef ANDROID + size_t roiw4 = dsize.width >= 3 ? (dsize.width - 3) << 2 : 0; +#endif + size_t roiw2 = dsize.width >= 1 ? (dsize.width - 1) << 2 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw4; dj += 16, sj += 32) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint8x8_t vRes_0, vRes_1; + + { + uint8x16_t vLane1 = vld1q_u8(src0_row + sj); + uint8x16_t vLane2 = vld1q_u8(src1_row + sj); + + uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2)); + uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2)); + + uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l)); + uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h)); + + vRes_0 = areaDownsamplingDivision(vcombine_u16(vSum_l, vSum_h)); + } + + { + uint8x16_t vLane1 = vld1q_u8(src0_row + sj + 16); + uint8x16_t vLane2 = vld1q_u8(src1_row + sj + 16); + + uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2)); + uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2)); + + uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l)); + uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h)); + + vRes_1 = areaDownsamplingDivision(vcombine_u16(vSum_l, vSum_h)); + } + + vst1q_u8(dst_row + dj, vcombine_u8(vRes_0, vRes_1)); + } +#endif + + for ( ; dj < roiw2; dj += 8, sj += 16) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint8x16_t vLane1 = vld1q_u8(src0_row + sj); + uint8x16_t vLane2 = vld1q_u8(src1_row + sj); + + uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2)); + uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2)); + + uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l)); + uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h)); + + uint8x8_t vRes = areaDownsamplingDivision(vcombine_u16(vSum_l, vSum_h)); + vst1_u8(dst_row + dj, vRes); + } + + for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 4, sj += 8) + { + dst_row[dj ] = areaDownsamplingDivision( + (u16)src0_row[sj ] + src0_row[sj + 4] + + src1_row[sj ] + src1_row[sj + 4]); + dst_row[dj + 1] = areaDownsamplingDivision( + (u16)src0_row[sj + 1] + src0_row[sj + 5] + + src1_row[sj + 1] + src1_row[sj + 5]); + dst_row[dj + 2] = areaDownsamplingDivision( + (u16)src0_row[sj + 2] + src0_row[sj + 6] + + src1_row[sj + 2] + src1_row[sj + 6]); + dst_row[dj + 3] = areaDownsamplingDivision( + (u16)src0_row[sj + 3] + src0_row[sj + 7] + + src1_row[sj + 3] + src1_row[sj + 7]); + } + } + } + else if ((wr == 0.5f) && (hr == 0.5f)) + { +#ifndef ANDROID + size_t roiw32 = dsize.width >= 31 ? 
(dsize.width - 31) << 2 : 0; +#endif + size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) << 2 : 0; + + for (size_t i = 0; i < dsize.height; i += 2) + { + const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1); + u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1)); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw32; dj += 128, sj += 64) + { + internal::prefetch(src_row + sj); + + uint8x16x4_t v_src = vld4q_u8(src_row + sj); + uint8x16x2_t v_c0 = vzipq_u8(v_src.val[0], v_src.val[0]); + uint8x16x2_t v_c1 = vzipq_u8(v_src.val[1], v_src.val[1]); + uint8x16x2_t v_c2 = vzipq_u8(v_src.val[2], v_src.val[2]); + uint8x16x2_t v_c3 = vzipq_u8(v_src.val[3], v_src.val[3]); + + uint8x16x4_t v_dst; + v_dst.val[0] = v_c0.val[0]; + v_dst.val[1] = v_c1.val[0]; + v_dst.val[2] = v_c2.val[0]; + v_dst.val[3] = v_c3.val[0]; + vst4q_u8(dst0_row + dj, v_dst); + vst4q_u8(dst1_row + dj, v_dst); + + v_dst.val[0] = v_c0.val[1]; + v_dst.val[1] = v_c1.val[1]; + v_dst.val[2] = v_c2.val[1]; + v_dst.val[3] = v_c3.val[1]; + vst4q_u8(dst0_row + dj + 64, v_dst); + vst4q_u8(dst1_row + dj + 64, v_dst); + } +#endif + + for ( ; dj < roiw16; dj += 64, sj += 32) + { + internal::prefetch(src_row + sj); + + uint8x8x4_t v_src = vld4_u8(src_row + sj); + uint8x8x2_t v_c0 = vzip_u8(v_src.val[0], v_src.val[0]); + uint8x8x2_t v_c1 = vzip_u8(v_src.val[1], v_src.val[1]); + uint8x8x2_t v_c2 = vzip_u8(v_src.val[2], v_src.val[2]); + uint8x8x2_t v_c3 = vzip_u8(v_src.val[3], v_src.val[3]); + + uint8x16x4_t v_dst; + v_dst.val[0] = vcombine_u8(v_c0.val[0], v_c0.val[1]); + v_dst.val[1] = vcombine_u8(v_c1.val[0], v_c1.val[1]); + v_dst.val[2] = vcombine_u8(v_c2.val[0], v_c2.val[1]); + v_dst.val[3] = vcombine_u8(v_c3.val[0], v_c3.val[1]); + vst4q_u8(dst0_row + dj, v_dst); + vst4q_u8(dst1_row + dj, v_dst); + } + + for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 8, sj += 4) + { + u8 src_val = src_row[sj]; + dst0_row[dj] = dst0_row[dj + 4] = src_val; + dst1_row[dj] = dst1_row[dj + 4] = src_val; + + src_val = src_row[sj + 1]; + dst0_row[dj + 1] = dst0_row[dj + 5] = src_val; + dst1_row[dj + 1] = dst1_row[dj + 5] = src_val; + + src_val = src_row[sj + 2]; + dst0_row[dj + 2] = dst0_row[dj + 6] = src_val; + dst1_row[dj + 2] = dst1_row[dj + 6] = src_val; + + src_val = src_row[sj + 3]; + dst0_row[dj + 3] = dst0_row[dj + 7] = src_val; + dst1_row[dj + 3] = dst1_row[dj + 7] = src_val; + } + } + } + else //if ((hr == 4.0f) && (wr == 4.0f)) //the only scale that lasts after isSupported check + { + size_t roiw4 = dsize.width >= 3 ? (dsize.width - 3) << 2 : 0; + size_t roiw2 = dsize.width >= 1 ? 
(dsize.width - 1) << 2 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1); + const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2); + const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + + for ( ; dj < roiw4; dj += 16, sj += 64) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + internal::prefetch(src2_row + sj); + internal::prefetch(src3_row + sj); + + uint8x16_t vLane10 = vld1q_u8(src0_row + sj), vLane11 = vld1q_u8(src0_row + sj + 16); + uint8x16_t vLane20 = vld1q_u8(src1_row + sj), vLane21 = vld1q_u8(src1_row + sj + 16); + uint8x16_t vLane30 = vld1q_u8(src2_row + sj), vLane31 = vld1q_u8(src2_row + sj + 16); + uint8x16_t vLane40 = vld1q_u8(src3_row + sj), vLane41 = vld1q_u8(src3_row + sj + 16); + + uint16x8_t v_part_0, v_part_1; + { + uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10)); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40))); + + uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11)); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41))); + + v_part_0 = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)), + vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1))); + } + + vLane10 = vld1q_u8(src0_row + sj + 32); + vLane11 = vld1q_u8(src0_row + sj + 48); + vLane20 = vld1q_u8(src1_row + sj + 32); + vLane21 = vld1q_u8(src1_row + sj + 48); + vLane30 = vld1q_u8(src2_row + sj + 32); + vLane31 = vld1q_u8(src2_row + sj + 48); + vLane40 = vld1q_u8(src3_row + sj + 32); + vLane41 = vld1q_u8(src3_row + sj + 48); + + { + uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10)); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40))); + + uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11)); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41))); + + v_part_1 = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)), + vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1))); + } + + vst1q_u8(dst_row + dj, vcombine_u8(areaDownsamplingDivision(v_part_0), + areaDownsamplingDivision(v_part_1))); + } + + for ( ; dj < roiw2; dj += 8, sj += 32) + { + uint8x16_t vLane10 = vld1q_u8(src0_row + sj), vLane11 = vld1q_u8(src0_row + sj + 16); + uint8x16_t vLane20 = vld1q_u8(src1_row + sj), vLane21 = vld1q_u8(src1_row + sj + 16); + uint8x16_t vLane30 = vld1q_u8(src2_row + sj), vLane31 = vld1q_u8(src2_row + sj + 16); + uint8x16_t vLane40 = vld1q_u8(src3_row + sj), vLane41 = vld1q_u8(src3_row + sj + 16); + + 
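// 4x4 area reduction for two RGBA output pixels: the vaddl_u8/vaddq_u16 chain below
+ // sums the four source rows, and the final pairwise vadd_u16 folds the four pixels
+ // of each block, so every u16 lane holds a 16-sample sum before the division.
+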
+                uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10));
+                v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20)));
+                v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30)));
+                v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40)));
+
+                uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11));
+                v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21)));
+                v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31)));
+                v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41)));
+
+                uint16x8_t v_sum = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)),
+                                                vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1)));
+
+                vst1_u8(dst_row + dj, areaDownsamplingDivision(v_sum));
+            }
+
+            for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 4, sj += 16)
+            {
+                dst_row[dj] = areaDownsamplingDivision(
+                                  (u16)src0_row[sj] + src0_row[sj + 4] +
+                                       src0_row[sj + 8] + src0_row[sj + 12] +
+                                       src1_row[sj] + src1_row[sj + 4] +
+                                       src1_row[sj + 8] + src1_row[sj + 12] +
+                                       src2_row[sj] + src2_row[sj + 4] +
+                                       src2_row[sj + 8] + src2_row[sj + 12] +
+                                       src3_row[sj] + src3_row[sj + 4] +
+                                       src3_row[sj + 8] + src3_row[sj + 12]);
+
+                dst_row[dj + 1] = areaDownsamplingDivision(
+                                  (u16)src0_row[sj + 1] + src0_row[sj + 5] +
+                                       src0_row[sj + 9] + src0_row[sj + 13] +
+                                       src1_row[sj + 1] + src1_row[sj + 5] +
+                                       src1_row[sj + 9] + src1_row[sj + 13] +
+                                       src2_row[sj + 1] + src2_row[sj + 5] +
+                                       src2_row[sj + 9] + src2_row[sj + 13] +
+                                       src3_row[sj + 1] + src3_row[sj + 5] +
+                                       src3_row[sj + 9] + src3_row[sj + 13]);
+
+                dst_row[dj + 2] = areaDownsamplingDivision(
+                                  (u16)src0_row[sj + 2] + src0_row[sj + 6] +
+                                       src0_row[sj + 10] + src0_row[sj + 14] +
+                                       src1_row[sj + 2] + src1_row[sj + 6] +
+                                       src1_row[sj + 10] + src1_row[sj + 14] +
+                                       src2_row[sj + 2] + src2_row[sj + 6] +
+                                       src2_row[sj + 10] + src2_row[sj + 14] +
+                                       src3_row[sj + 2] + src3_row[sj + 6] +
+                                       src3_row[sj + 10] + src3_row[sj + 14]);
+
+                dst_row[dj + 3] = areaDownsamplingDivision(
+                                  (u16)src0_row[sj + 3] + src0_row[sj + 7] +
+                                       src0_row[sj + 11] + src0_row[sj + 15] +
+                                       src1_row[sj + 3] + src1_row[sj + 7] +
+                                       src1_row[sj + 11] + src1_row[sj + 15] +
+                                       src2_row[sj + 3] + src2_row[sj + 7] +
+                                       src2_row[sj + 11] + src2_row[sj + 15] +
+                                       src3_row[sj + 3] + src3_row[sj + 7] +
+                                       src3_row[sj + 11] + src3_row[sj + 15]);
+            }
+        }
+        }
+    }
+    else if (channels == 3)
+    {
+        if ((wr == 2.0f) && (hr == 2.0f))
+        {
+#ifndef ANDROID
+            size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) * 3 : 0;
+#endif
+            size_t roiw8 = dsize.width >= 7 ?
(dsize.width - 7) * 3 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw16; dj += 48, sj += 96) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint8x16x3_t vLane1 = vld3q_u8(src0_row + sj); + uint8x16x3_t vLane2 = vld3q_u8(src1_row + sj); + + uint8x8x3_t v_dst0, v_dst1; + { + uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]); + uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]); + uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]); + v_el0 = vpadalq_u8(v_el0, vLane2.val[0]); + v_el1 = vpadalq_u8(v_el1, vLane2.val[1]); + v_el2 = vpadalq_u8(v_el2, vLane2.val[2]); + + v_dst0.val[0] = areaDownsamplingDivision(v_el0); + v_dst0.val[1] = areaDownsamplingDivision(v_el1); + v_dst0.val[2] = areaDownsamplingDivision(v_el2); + } + + vLane1 = vld3q_u8(src0_row + sj + 48); + vLane2 = vld3q_u8(src1_row + sj + 48); + { + uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]); + uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]); + uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]); + v_el0 = vpadalq_u8(v_el0, vLane2.val[0]); + v_el1 = vpadalq_u8(v_el1, vLane2.val[1]); + v_el2 = vpadalq_u8(v_el2, vLane2.val[2]); + + v_dst1.val[0] = areaDownsamplingDivision(v_el0); + v_dst1.val[1] = areaDownsamplingDivision(v_el1); + v_dst1.val[2] = areaDownsamplingDivision(v_el2); + } + + uint8x16x3_t v_dst; + v_dst.val[0] = vcombine_u8(v_dst0.val[0], v_dst1.val[0]); + v_dst.val[1] = vcombine_u8(v_dst0.val[1], v_dst1.val[1]); + v_dst.val[2] = vcombine_u8(v_dst0.val[2], v_dst1.val[2]); + + vst3q_u8(dst_row + dj, v_dst); + } +#endif + + for ( ; dj < roiw8; dj += 24, sj += 48) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint8x16x3_t vLane1 = vld3q_u8(src0_row + sj); + uint8x16x3_t vLane2 = vld3q_u8(src1_row + sj); + + uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]); + uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]); + uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]); + v_el0 = vpadalq_u8(v_el0, vLane2.val[0]); + v_el1 = vpadalq_u8(v_el1, vLane2.val[1]); + v_el2 = vpadalq_u8(v_el2, vLane2.val[2]); + + uint8x8x3_t v_dst; + v_dst.val[0] = areaDownsamplingDivision(v_el0); + v_dst.val[1] = areaDownsamplingDivision(v_el1); + v_dst.val[2] = areaDownsamplingDivision(v_el2); + + vst3_u8(dst_row + dj, v_dst); + } + + for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 3, sj += 6) + { + dst_row[dj ] = areaDownsamplingDivision( + (u16)src0_row[sj ] + src0_row[sj + 3] + + src1_row[sj ] + src1_row[sj + 3]); + dst_row[dj + 1] = areaDownsamplingDivision( + (u16)src0_row[sj + 1] + src0_row[sj + 4] + + src1_row[sj + 1] + src1_row[sj + 4]); + dst_row[dj + 2] = areaDownsamplingDivision( + (u16)src0_row[sj + 2] + src0_row[sj + 5] + + src1_row[sj + 2] + src1_row[sj + 5]); + } + } + } + else if ((wr == 0.5f) && (hr == 0.5f)) + { +#ifndef ANDROID + size_t roiw32 = dsize.width >= 31 ? (dsize.width - 31) * 3 : 0; +#endif + size_t roiw16 = dsize.width >= 15 ? 
(dsize.width - 15) * 3 : 0; + + for (size_t i = 0; i < dsize.height; i += 2) + { + const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1); + u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1)); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw32; dj += 96, sj += 48) + { + internal::prefetch(src_row + sj); + + uint8x16x3_t v_src = vld3q_u8(src_row + sj); + uint8x16x2_t v_c0 = vzipq_u8(v_src.val[0], v_src.val[0]); + uint8x16x2_t v_c1 = vzipq_u8(v_src.val[1], v_src.val[1]); + uint8x16x2_t v_c2 = vzipq_u8(v_src.val[2], v_src.val[2]); + + uint8x16x3_t v_dst; + v_dst.val[0] = v_c0.val[0]; + v_dst.val[1] = v_c1.val[0]; + v_dst.val[2] = v_c2.val[0]; + vst3q_u8(dst0_row + dj, v_dst); + vst3q_u8(dst1_row + dj, v_dst); + + v_dst.val[0] = v_c0.val[1]; + v_dst.val[1] = v_c1.val[1]; + v_dst.val[2] = v_c2.val[1]; + vst3q_u8(dst0_row + dj + 48, v_dst); + vst3q_u8(dst1_row + dj + 48, v_dst); + } +#endif + + for ( ; dj < roiw16; dj += 48, sj += 24) + { + internal::prefetch(src_row + sj); + + uint8x8x3_t v_src = vld3_u8(src_row + sj); + uint8x8x2_t v_c0 = vzip_u8(v_src.val[0], v_src.val[0]); + uint8x8x2_t v_c1 = vzip_u8(v_src.val[1], v_src.val[1]); + uint8x8x2_t v_c2 = vzip_u8(v_src.val[2], v_src.val[2]); + + uint8x16x3_t v_dst; + v_dst.val[0] = vcombine_u8(v_c0.val[0], v_c0.val[1]); + v_dst.val[1] = vcombine_u8(v_c1.val[0], v_c1.val[1]); + v_dst.val[2] = vcombine_u8(v_c2.val[0], v_c2.val[1]); + vst3q_u8(dst0_row + dj, v_dst); + vst3q_u8(dst1_row + dj, v_dst); + } + + for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 6, sj += 3) + { + u8 src_val = src_row[sj]; + dst0_row[dj] = dst0_row[dj + 3] = src_val; + dst1_row[dj] = dst1_row[dj + 3] = src_val; + + src_val = src_row[sj + 1]; + dst0_row[dj + 1] = dst0_row[dj + 4] = src_val; + dst1_row[dj + 1] = dst1_row[dj + 4] = src_val; + + src_val = src_row[sj + 2]; + dst0_row[dj + 2] = dst0_row[dj + 5] = src_val; + dst1_row[dj + 2] = dst1_row[dj + 5] = src_val; + } + } + } + else //if ((hr == 4.0f) && (wr == 4.0f)) //the only scale that lasts after isSupported check + { +#ifndef ANDROID + size_t roiw8 = dsize.width >= 7 ? 
(dsize.width - 7) * 3 : 0; +#endif + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1); + const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2); + const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw8; dj += 24, sj += 96) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + internal::prefetch(src2_row + sj); + internal::prefetch(src3_row + sj); + + uint8x16x3_t vLane10 = vld3q_u8(src0_row + sj), vLane11 = vld3q_u8(src0_row + sj + 48); + uint8x16x3_t vLane20 = vld3q_u8(src1_row + sj), vLane21 = vld3q_u8(src1_row + sj + 48); + uint8x16x3_t vLane30 = vld3q_u8(src2_row + sj), vLane31 = vld3q_u8(src2_row + sj + 48); + uint8x16x3_t vLane40 = vld3q_u8(src3_row + sj), vLane41 = vld3q_u8(src3_row + sj + 48); + + uint8x8x3_t v_dst; + + // channel 0 + { + uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[0]); + uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[0]); + uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[0]); + uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[0]); + v_lane0 = vaddq_u16(v_lane0, v_lane1); + v_lane0 = vaddq_u16(v_lane0, v_lane2); + v_lane0 = vaddq_u16(v_lane0, v_lane3); + + uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[0]); + uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[0]); + uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[0]); + uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[0]); + v_lane0_ = vaddq_u16(v_lane0_, v_lane1_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane2_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane3_); + + v_dst.val[0] = areaDownsamplingDivision( + vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)), + vmovn_u32(vpaddlq_u16(v_lane0_)))); + } + + // channel 1 + { + uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[1]); + uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[1]); + uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[1]); + uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[1]); + v_lane0 = vaddq_u16(v_lane0, v_lane1); + v_lane0 = vaddq_u16(v_lane0, v_lane2); + v_lane0 = vaddq_u16(v_lane0, v_lane3); + + uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[1]); + uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[1]); + uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[1]); + uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[1]); + v_lane0_ = vaddq_u16(v_lane0_, v_lane1_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane2_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane3_); + + v_dst.val[1] = areaDownsamplingDivision( + vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)), + vmovn_u32(vpaddlq_u16(v_lane0_)))); + } + + // channel 2 + { + uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[2]); + uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[2]); + uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[2]); + uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[2]); + v_lane0 = vaddq_u16(v_lane0, v_lane1); + v_lane0 = vaddq_u16(v_lane0, v_lane2); + v_lane0 = vaddq_u16(v_lane0, v_lane3); + + uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[2]); + uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[2]); + uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[2]); + uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[2]); + v_lane0_ = vaddq_u16(v_lane0_, v_lane1_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane2_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane3_); + + v_dst.val[2] = areaDownsamplingDivision( + vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)), + vmovn_u32(vpaddlq_u16(v_lane0_)))); + } 
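+                    // v_dst still holds the three colour planes separately; the
+                    // vst3_u8 below re-interleaves them into 8 packed 3-channel
+                    // output pixels as it stores.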
+ + vst3_u8(dst_row + dj, v_dst); + } +#endif + + for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 3, sj += 12) + { + dst_row[dj ] = areaDownsamplingDivision( + (u16)src0_row[sj ] + src0_row[sj + 3] + + src0_row[sj + 6] + src0_row[sj + 9] + + src1_row[sj ] + src1_row[sj + 3] + + src1_row[sj + 6] + src1_row[sj + 9] + + src2_row[sj ] + src2_row[sj + 3] + + src2_row[sj + 6] + src2_row[sj + 9] + + src3_row[sj ] + src3_row[sj + 3] + + src3_row[sj + 6] + src3_row[sj + 9]); + + dst_row[dj + 1] = areaDownsamplingDivision( + (u16)src0_row[sj + 1] + src0_row[sj + 4] + + src0_row[sj + 7] + src0_row[sj + 10] + + src1_row[sj + 1] + src1_row[sj + 4] + + src1_row[sj + 7] + src1_row[sj + 10] + + src2_row[sj + 1] + src2_row[sj + 4] + + src2_row[sj + 7] + src2_row[sj + 10] + + src3_row[sj + 1] + src3_row[sj + 4] + + src3_row[sj + 7] + src3_row[sj + 10]); + + dst_row[dj + 2] = areaDownsamplingDivision( + (u16)src0_row[sj + 2] + src0_row[sj + 5] + + src0_row[sj + 8] + src0_row[sj + 11] + + src1_row[sj + 2] + src1_row[sj + 5] + + src1_row[sj + 8] + src1_row[sj + 11] + + src2_row[sj + 2] + src2_row[sj + 5] + + src2_row[sj + 8] + src2_row[sj + 11] + + src3_row[sj + 2] + src3_row[sj + 5] + + src3_row[sj + 8] + src3_row[sj + 11]); + } + } + } + } +#else + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)wr; + (void)hr; +#endif + (void)ssize; +} + +void resizeAreaOpenCV(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + resizeAreaRounding(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr, channels); +} + +void resizeArea(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + resizeAreaRounding(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr, channels); +} + +#ifdef CAROTENE_NEON + +namespace { + +uint8x8_t resizeLinearStep(uint8x16_t vr1, uint8x16_t vr2, + uint8x8_t vlutl, uint8x8_t vluth, + float32x4_t vrw, float32x4_t vcw0, float32x4_t vcw1) +{ + uint8x8_t vr1l = internal::vqtbl1_u8(vr1, vlutl); + uint8x8_t vr1h = internal::vqtbl1_u8(vr1, vluth); + uint8x8_t vr2l = internal::vqtbl1_u8(vr2, vlutl); + uint8x8_t vr2h = internal::vqtbl1_u8(vr2, vluth); + + uint16x8_t v1hw = vmovl_u8(vr1h); + uint16x8_t v2hw = vmovl_u8(vr2h); + + int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h)); + int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h)); + + float32x4_t v1L = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1hw))); + float32x4_t v1H = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1hw))); + float32x4_t v2L = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v2hw))); + float32x4_t v2H = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v2hw))); + + v1L = vmlaq_f32(v1L, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v1df))), vcw0); + v1H = vmlaq_f32(v1H, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v1df))), vcw1); + v2L = vmlaq_f32(v2L, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v2df))), vcw0); + v2H = vmlaq_f32(v2H, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v2df))), vcw1); + + float32x4_t vdiffL = vsubq_f32(v1L, v2L); + float32x4_t vdiffH = vsubq_f32(v1H, v2H); + + float32x4_t vL = vmlaq_f32(v2L, vdiffL, vrw); + float32x4_t vH = vmlaq_f32(v2H, vdiffH, vrw); + uint16x4_t vL_ = vmovn_u32(vcvtq_u32_f32(vL)); + uint16x4_t vH_ = vmovn_u32(vcvtq_u32_f32(vH)); + return vmovn_u16(vcombine_u16(vL_, vH_)); +} + +} // namespace + +namespace { + +void resize_bilinear_rows(const Size2D &ssize, const 
Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 hr, const u8** gcols, u8* gcweight, u8* buf) +{ + f32 scale_y_offset = 0.5f * hr - 0.5f; + + size_t dst_h8 = dsize.height & ~7; + size_t dst_w8 = dsize.width & ~7; + size_t src_w8 = ssize.width & ~7; + + size_t r = 0; + for (; r < dst_h8; r += 8) + { +resize8u_xystretch: + const u8* rows[16]; + u8 rweight[8]; + + for (u32 i = 0; i < 8; ++i) + { + f32 w = (i + r) * hr + scale_y_offset; + ptrdiff_t src_row = floorf(w); + ptrdiff_t src_row2 = src_row + 1; + + rweight[i] = (u8)((src_row2-w) * 128); + + if (src_row < 0) + src_row = 0; + if (src_row2 >= (ptrdiff_t)ssize.height) + src_row2 = ssize.height-1; + + rows[2 * i] = srcBase + src_row * srcStride; + rows[2 * i + 1] = srcBase + src_row2 * srcStride; + } + + uint8x8_t vr0w = vdup_n_u8(rweight[0]); + uint8x8_t vr1w = vdup_n_u8(rweight[1]); + uint8x8_t vr2w = vdup_n_u8(rweight[2]); + uint8x8_t vr3w = vdup_n_u8(rweight[3]); + uint8x8_t vr4w = vdup_n_u8(rweight[4]); + uint8x8_t vr5w = vdup_n_u8(rweight[5]); + uint8x8_t vr6w = vdup_n_u8(rweight[6]); + uint8x8_t vr7w = vdup_n_u8(rweight[7]); + + uint8x8_t vr0w2 = vdup_n_u8(128 - rweight[0]); + uint8x8_t vr1w2 = vdup_n_u8(128 - rweight[1]); + uint8x8_t vr2w2 = vdup_n_u8(128 - rweight[2]); + uint8x8_t vr3w2 = vdup_n_u8(128 - rweight[3]); + uint8x8_t vr4w2 = vdup_n_u8(128 - rweight[4]); + uint8x8_t vr5w2 = vdup_n_u8(128 - rweight[5]); + uint8x8_t vr6w2 = vdup_n_u8(128 - rweight[6]); + uint8x8_t vr7w2 = vdup_n_u8(128 - rweight[7]); + + size_t col = 0; + for(; col < src_w8; col += 8) + { + internal::prefetch(rows[3] + col); + internal::prefetch(rows[7] + col); + internal::prefetch(rows[11] + col); + internal::prefetch(rows[15] + col); +resize8u_ystretch: + uint8x8_t vsrc0l1 = vld1_u8(rows[0] + col); + uint8x8_t vsrc0l2 = vld1_u8(rows[1] + col); + uint8x8_t vsrc1l1 = vld1_u8(rows[2] + col); + uint8x8_t vsrc1l2 = vld1_u8(rows[3] + col); + + // (l1 * w + l2 * (128 - w) + 64) / 128 + uint16x8_t vdst0l = vmull_u8(vsrc0l1, vr0w); + uint16x8_t vdst1l = vmull_u8(vsrc1l1, vr1w); + + uint8x8_t vsrc2l1 = vld1_u8(rows[4] + col); + uint8x8_t vsrc2l2 = vld1_u8(rows[5] + col); + uint8x8_t vsrc3l1 = vld1_u8(rows[6] + col); + uint8x8_t vsrc3l2 = vld1_u8(rows[7] + col); + + vdst0l = vmlal_u8(vdst0l, vsrc0l2, vr0w2); + vdst1l = vmlal_u8(vdst1l, vsrc1l2, vr1w2); + uint16x8_t vdst2l = vmull_u8(vsrc2l1, vr2w); + uint16x8_t vdst3l = vmull_u8(vsrc3l1, vr3w); + + uint8x8_t vsrc4l1 = vld1_u8(rows[8] + col); + uint8x8_t vsrc4l2 = vld1_u8(rows[9] + col); + uint8x8_t vsrc5l1 = vld1_u8(rows[10] + col); + uint8x8_t vsrc5l2 = vld1_u8(rows[11] + col); + + vdst2l = vmlal_u8(vdst2l, vsrc2l2, vr2w2); + vdst3l = vmlal_u8(vdst3l, vsrc3l2, vr3w2); + uint16x8_t vdst4l = vmull_u8(vsrc4l1, vr4w); + uint16x8_t vdst5l = vmull_u8(vsrc5l1, vr5w); + + uint8x8_t vsrc6l1 = vld1_u8(rows[12] + col); + uint8x8_t vsrc6l2 = vld1_u8(rows[13] + col); + uint8x8_t vsrc7l1 = vld1_u8(rows[14] + col); + uint8x8_t vsrc7l2 = vld1_u8(rows[15] + col); + + uint8x8_t vdst0 = vrshrn_n_u16(vdst0l, 7); + uint8x8_t vdst1 = vrshrn_n_u16(vdst1l, 7); + vdst4l = vmlal_u8(vdst4l, vsrc4l2, vr4w2); + vdst5l = vmlal_u8(vdst5l, vsrc5l2, vr5w2); + uint16x8_t vdst6l = vmull_u8(vsrc6l1, vr6w); + uint16x8_t vdst7l = vmull_u8(vsrc7l1, vr7w); + + uint8x8_t vdst2 = vrshrn_n_u16(vdst2l, 7); + uint8x8_t vdst3 = vrshrn_n_u16(vdst3l, 7); + vdst6l = vmlal_u8(vdst6l, vsrc6l2, vr6w2); + vdst7l = vmlal_u8(vdst7l, vsrc7l2, vr7w2); + + uint8x8_t vdst4 = vrshrn_n_u16(vdst4l, 7); + uint8x8_t vdst5 = 
vrshrn_n_u16(vdst5l, 7); + uint8x8_t vdst6 = vrshrn_n_u16(vdst6l, 7); + uint8x8_t vdst7 = vrshrn_n_u16(vdst7l, 7); + + // == 8x8 matrix transpose == + + //00 01 02 03 04 05 06 07 d0 + //10 11 12 13 14 15 16 17 d1 + //20 21 22 23 24 25 26 27 d2 + //30 31 32 33 34 35 36 37 d3 + //40 41 42 43 44 45 46 47 d4 + //50 51 52 53 54 55 56 57 d5 + //60 61 62 63 64 65 66 67 d6 + //70 71 72 73 74 75 76 77 d7 + + uint8x8x2_t vdst10t = vtrn_u8(vdst0, vdst1); + uint8x8x2_t vdst32t = vtrn_u8(vdst2, vdst3); + uint8x8x2_t vdst54t = vtrn_u8(vdst4, vdst5); + uint8x8x2_t vdst76t = vtrn_u8(vdst6, vdst7); + + uint8x16_t vd1d0 = vcombine_u8(vdst10t.val[0], vdst10t.val[1]); + uint8x16_t vd3d2 = vcombine_u8(vdst32t.val[0], vdst32t.val[1]); + uint8x16_t vd5d4 = vcombine_u8(vdst54t.val[0], vdst54t.val[1]); + uint8x16_t vd7d6 = vcombine_u8(vdst76t.val[0], vdst76t.val[1]); + + //00 10 02 12 04 14 06 16 d0 + //01 11 03 13 05 15 07 17 d1 + //20 30 22 32 24 34 26 36 d2 + //21 31 23 33 25 35 27 37 d3 + //40 50 42 52 44 54 46 56 d4 + //41 51 43 53 45 55 47 57 d5 + //60 70 62 72 64 74 66 76 d6 + //61 71 63 73 65 75 67 77 d7 + + uint16x8x2_t vq1q0t = vtrnq_u16((uint16x8_t)vd1d0, (uint16x8_t)vd3d2); + uint16x8x2_t vq3q2t = vtrnq_u16((uint16x8_t)vd5d4, (uint16x8_t)vd7d6); + + //00 10 20 30 04 14 24 34 d0 + //01 11 21 31 05 15 25 35 d1 + //02 12 22 32 06 16 26 36 d2 + //03 13 23 33 07 17 27 37 d3 + //40 50 60 70 44 54 64 74 d4 + //41 51 61 71 45 55 65 75 d5 + //42 52 62 72 46 56 66 76 d6 + //43 53 63 73 47 57 67 77 d7 + + uint32x4x2_t vq2q0t = vtrnq_u32((uint32x4_t)vq1q0t.val[0], (uint32x4_t)vq3q2t.val[0]); + uint32x4x2_t vq3q1t = vtrnq_u32((uint32x4_t)vq1q0t.val[1], (uint32x4_t)vq3q2t.val[1]); + + //00 10 20 30 40 50 60 70 d0 + //01 11 21 31 41 51 61 71 d1 + //02 12 22 32 42 52 62 72 d2 + //03 13 23 33 43 53 63 73 d3 + //04 14 24 34 44 54 64 74 d4 + //05 15 25 35 45 55 65 75 d5 + //06 16 26 36 46 56 66 76 d6 + //07 17 27 37 47 57 67 77 d7 + + vst1q_u8(buf + col * 8 + 0, (uint8x16_t)vq2q0t.val[0]); + vst1q_u8(buf + col * 8 + 16, (uint8x16_t)vq3q1t.val[0]); + vst1q_u8(buf + col * 8 + 32, (uint8x16_t)vq2q0t.val[1]); + vst1q_u8(buf + col * 8 + 48, (uint8x16_t)vq3q1t.val[1]); + } + + if (col < ssize.width) + { + col = ssize.width - 8; + goto resize8u_ystretch; + } + + u8* dst_data = dstBase + r * dstStride; + const u8** cols = gcols; + u8* cweight = gcweight; + + size_t dcol = 0; + for (; dcol < dst_w8; dcol += 8, cols += 16, cweight += 8) + { + internal::prefetch(cols[0], 64*4); +resize8u_xstretch: + uint8x8_t vc0w = vdup_n_u8(cweight[0]); + uint8x8_t vc1w = vdup_n_u8(cweight[1]); + uint8x8_t vc2w = vdup_n_u8(cweight[2]); + uint8x8_t vc3w = vdup_n_u8(cweight[3]); + uint8x8_t vc4w = vdup_n_u8(cweight[4]); + uint8x8_t vc5w = vdup_n_u8(cweight[5]); + uint8x8_t vc6w = vdup_n_u8(cweight[6]); + uint8x8_t vc7w = vdup_n_u8(cweight[7]); + + uint8x8_t vc0w2 = vdup_n_u8(128 - cweight[0]); + uint8x8_t vc1w2 = vdup_n_u8(128 - cweight[1]); + uint8x8_t vc2w2 = vdup_n_u8(128 - cweight[2]); + uint8x8_t vc3w2 = vdup_n_u8(128 - cweight[3]); + uint8x8_t vc4w2 = vdup_n_u8(128 - cweight[4]); + uint8x8_t vc5w2 = vdup_n_u8(128 - cweight[5]); + uint8x8_t vc6w2 = vdup_n_u8(128 - cweight[6]); + uint8x8_t vc7w2 = vdup_n_u8(128 - cweight[7]); + + uint8x8_t vsrc0l1 = vld1_u8(cols[0]); + uint8x8_t vsrc0l2 = vld1_u8(cols[1]); + uint8x8_t vsrc1l1 = vld1_u8(cols[2]); + uint8x8_t vsrc1l2 = vld1_u8(cols[3]); + uint8x8_t vsrc2l1 = vld1_u8(cols[4]); + uint8x8_t vsrc2l2 = vld1_u8(cols[5]); + uint8x8_t vsrc3l1 = vld1_u8(cols[6]); + uint8x8_t vsrc3l2 = vld1_u8(cols[7]); + 
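+            // cols[2*k] and cols[2*k+1] point into the transposed row buffer at
+            // the left and right source columns for output column k, so each of
+            // these vld1_u8 loads fetches the same 8 output rows for one source column.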
uint8x8_t vsrc4l1 = vld1_u8(cols[8]); + uint8x8_t vsrc4l2 = vld1_u8(cols[9]); + uint8x8_t vsrc5l1 = vld1_u8(cols[10]); + uint8x8_t vsrc5l2 = vld1_u8(cols[11]); + uint8x8_t vsrc6l1 = vld1_u8(cols[12]); + uint8x8_t vsrc6l2 = vld1_u8(cols[13]); + uint8x8_t vsrc7l1 = vld1_u8(cols[14]); + uint8x8_t vsrc7l2 = vld1_u8(cols[15]); + + // (l1 * w + l2 * (128 - w) + 64) / 128 + uint16x8_t vdst0l = vmull_u8(vsrc0l1, vc0w); + uint16x8_t vdst1l = vmull_u8(vsrc1l1, vc1w); + uint16x8_t vdst2l = vmull_u8(vsrc2l1, vc2w); + uint16x8_t vdst3l = vmull_u8(vsrc3l1, vc3w); + uint16x8_t vdst4l = vmull_u8(vsrc4l1, vc4w); + uint16x8_t vdst5l = vmull_u8(vsrc5l1, vc5w); + uint16x8_t vdst6l = vmull_u8(vsrc6l1, vc6w); + uint16x8_t vdst7l = vmull_u8(vsrc7l1, vc7w); + + vdst0l = vmlal_u8(vdst0l, vsrc0l2, vc0w2); + vdst1l = vmlal_u8(vdst1l, vsrc1l2, vc1w2); + vdst2l = vmlal_u8(vdst2l, vsrc2l2, vc2w2); + vdst3l = vmlal_u8(vdst3l, vsrc3l2, vc3w2); + vdst4l = vmlal_u8(vdst4l, vsrc4l2, vc4w2); + vdst5l = vmlal_u8(vdst5l, vsrc5l2, vc5w2); + vdst6l = vmlal_u8(vdst6l, vsrc6l2, vc6w2); + vdst7l = vmlal_u8(vdst7l, vsrc7l2, vc7w2); + + uint8x8_t vdst0 = vrshrn_n_u16(vdst0l, 7); + uint8x8_t vdst1 = vrshrn_n_u16(vdst1l, 7); + uint8x8_t vdst2 = vrshrn_n_u16(vdst2l, 7); + uint8x8_t vdst3 = vrshrn_n_u16(vdst3l, 7); + uint8x8_t vdst4 = vrshrn_n_u16(vdst4l, 7); + uint8x8_t vdst5 = vrshrn_n_u16(vdst5l, 7); + uint8x8_t vdst6 = vrshrn_n_u16(vdst6l, 7); + uint8x8_t vdst7 = vrshrn_n_u16(vdst7l, 7); + + // == 8x8 matrix transpose == + uint8x8x2_t vdst10t = vtrn_u8(vdst0, vdst1); + uint8x8x2_t vdst32t = vtrn_u8(vdst2, vdst3); + uint8x8x2_t vdst54t = vtrn_u8(vdst4, vdst5); + uint8x8x2_t vdst76t = vtrn_u8(vdst6, vdst7); + uint8x16_t vd1d0 = vcombine_u8(vdst10t.val[0], vdst10t.val[1]); + uint8x16_t vd3d2 = vcombine_u8(vdst32t.val[0], vdst32t.val[1]); + uint8x16_t vd5d4 = vcombine_u8(vdst54t.val[0], vdst54t.val[1]); + uint8x16_t vd7d6 = vcombine_u8(vdst76t.val[0], vdst76t.val[1]); + uint16x8x2_t vq1q0t = vtrnq_u16((uint16x8_t)vd1d0, (uint16x8_t)vd3d2); + uint16x8x2_t vq3q2t = vtrnq_u16((uint16x8_t)vd5d4, (uint16x8_t)vd7d6); + uint32x4x2_t vq2q0t = vtrnq_u32((uint32x4_t)vq1q0t.val[0], (uint32x4_t)vq3q2t.val[0]); + uint32x4x2_t vq3q1t = vtrnq_u32((uint32x4_t)vq1q0t.val[1], (uint32x4_t)vq3q2t.val[1]); + + //save results + vst1_u8(dst_data + 0 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq2q0t.val[0])); + vst1_u8(dst_data + 1 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq2q0t.val[0])); + vst1_u8(dst_data + 2 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq3q1t.val[0])); + vst1_u8(dst_data + 3 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq3q1t.val[0])); + vst1_u8(dst_data + 4 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq2q0t.val[1])); + vst1_u8(dst_data + 5 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq2q0t.val[1])); + vst1_u8(dst_data + 6 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq3q1t.val[1])); + vst1_u8(dst_data + 7 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq3q1t.val[1])); + } + + if (dcol < dsize.width) + { + dcol = dsize.width - 8; + cols = gcols + dcol * 2; + cweight = gcweight + dcol; + goto resize8u_xstretch; + } + } + + if (r < dsize.height) + { + r = dsize.height - 8; + goto resize8u_xystretch; + } +} + +template struct resizeLinearInternals; +template <> struct resizeLinearInternals<1> +{ + int32x4_t vc_upd; + int32x4_t vc0; + int32x4_t vcmax; + + inline resizeLinearInternals(int32x4_t & vi, u32 srccols) + { + vc_upd = vdupq_n_s32(4); + vc0 = vdupq_n_s32(0); + vcmax = vdupq_n_s32(srccols-1); + + s32 tmp0123[] = {0, 1, 2, 3 }; + vi = 
vld1q_s32(tmp0123); + } + inline void updateIndexes(int32x4_t & vi, int32x4_t & vsrch, int32x4_t & vsrcl) + { + vsrch = vminq_s32(vsrch, vcmax); + vsrcl = vmaxq_s32(vsrcl, vc0); + vsrcl = vminq_s32(vsrcl, vcmax);//for safe tail + vsrch = vshlq_n_s32(vsrch, 3); + vsrcl = vshlq_n_s32(vsrcl, 3); + vi = vaddq_s32(vi, vc_upd); + } +}; +template <> struct resizeLinearInternals<4> +{ + int32x4_t vc_upd; + int32x4_t vc0; + int32x4_t vcmax; + int32x4_t v0123x8; + + inline resizeLinearInternals(int32x4_t & vi, u32 srccols) + { + vc_upd = vdupq_n_s32(1); + vc0 = vdupq_n_s32(0); + vcmax = vdupq_n_s32(srccols-1); + s32 tmp0123x8[] = {0, 8, 16, 24}; + v0123x8 = vld1q_s32(tmp0123x8); + + vi = vc0; + } + inline void updateIndexes(int32x4_t & vi, int32x4_t & vsrch, int32x4_t & vsrcl) + { + vsrch = vminq_s32(vsrch, vcmax); + vsrcl = vmaxq_s32(vsrcl, vc0); + vsrch = vshlq_n_s32(vsrch, 5); + vsrcl = vshlq_n_s32(vsrcl, 5); + vsrch = vaddq_s32(vsrch, v0123x8); + vsrcl = vaddq_s32(vsrcl, v0123x8); + vi = vaddq_s32(vi, vc_upd); + } +}; + +template +void resizeLinearOpenCVchan(const Size2D &_ssize, const Size2D &_dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr) +{ + float scale_x_offset = 0.5f * wr - 0.5f; + + Size2D ssize(_ssize.width*channels, _ssize.height); + Size2D dsize(_dsize.width*channels, _dsize.height); + + std::vector gcweight((dsize.width + 7) & ~7); + std::vector gcols(((dsize.width + 7) & ~7) * 2); + std::vector buf(((ssize.width + 7) & ~7) * 8); // (8 rows) x (width of src) + + float32x4_t vscale_x = vdupq_n_f32(wr); + float32x4_t vscale_x_offset = vdupq_n_f32(scale_x_offset); + int32x4_t vc1 = vdupq_n_s32(1); + float32x4_t vc128f = vdupq_n_f32(128.0f); + + int32x4_t vi; + resizeLinearInternals indexes(vi, _ssize.width);//u32 is used to store indexes + //so we could get issues on src image dimensions greater than (2^32-1) + + for (size_t dcol = 0; dcol < dsize.width; dcol += 8) + { + s32 idx[16]; + + float32x4_t vif = vcvtq_f32_s32(vi); + float32x4_t vw = vmlaq_f32(vscale_x_offset, vscale_x, vif); + int32x4_t vwi = vcvtq_s32_f32(vw); + float32x4_t vwif = vcvtq_f32_s32(vwi); + int32x4_t vmask = (int32x4_t)vcltq_f32(vwif, vw); + int32x4_t vsrch = vsubq_s32(vwi, vmask); + int32x4_t vsrcl = vsubq_s32(vsrch, vc1); + float32x4_t vsrchf = vcvtq_f32_s32(vsrch); + float32x4_t vw2 = vsubq_f32(vsrchf, vw); + + vw2 = vmulq_f32(vw2, vc128f); + uint32x4_t vw32u = vcvtq_u32_f32(vw2); + uint16x4_t vw16ul = vmovn_u32(vw32u); + indexes.updateIndexes(vi, vsrch, vsrcl); + + vst1q_s32(idx + 0, vsrcl); + vst1q_s32(idx + 8, vsrch); + + vif = vcvtq_f32_s32(vi); + vw = vmlaq_f32(vscale_x_offset, vscale_x, vif); + vwi = vcvtq_s32_f32(vw); + vwif = vcvtq_f32_s32(vwi); + vmask = (int32x4_t)vcltq_f32(vwif, vw); + vsrch = vsubq_s32(vwi, vmask); + vsrcl = vsubq_s32(vsrch, vc1); + vsrchf = vcvtq_f32_s32(vsrch); + vw2 = vsubq_f32(vsrchf, vw); + + vw2 = vmulq_f32(vw2, vc128f); + vw32u = vcvtq_u32_f32(vw2); + indexes.updateIndexes(vi, vsrch, vsrcl); + + uint16x4_t vw16uh = vmovn_u32(vw32u); + + vst1q_s32(idx + 4, vsrcl); + vst1q_s32(idx + 12, vsrch); + + uint8x8_t vw8u = vmovn_u16(vcombine_u16(vw16ul, vw16uh)); + + for (u32 i = 0; i < 8; ++i) + { + gcols[dcol * 2 + i*2] = &buf[idx[i]]; + gcols[dcol * 2 + i*2 + 1] = &buf[idx[i + 8]]; + } + + vst1_u8(&gcweight[dcol], vw8u); + } + + resize_bilinear_rows(ssize, dsize, srcBase, srcStride, dstBase, dstStride, hr, &gcols[0], &gcweight[0], &buf[0]); +} + +void downsample_bilinear_8uc1(const Size2D &ssize, const Size2D &dsize, + const 
u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr) +{ + internal::assertSupportedConfiguration(wr <= 2.f && hr <= 2.f); + + enum { SHIFT_BITS = 11 }; + + f32 scale_x_offset = 0.5f * wr - 0.5f; + f32 scale_y_offset = 0.5f * hr - 0.5f; + + std::vector _buf(dsize.height*(2*(sizeof(ptrdiff_t)/sizeof(s32))+1)+1); + ptrdiff_t* buf = (ptrdiff_t*)&_buf[0]; + s32* buf2 = (s32*)buf+2*(sizeof(ptrdiff_t)/sizeof(s32))*dsize.height; + for(size_t row = 0; row < (size_t)dsize.height; ++row) + { + f32 r = row * hr + scale_y_offset; + ptrdiff_t src_row = floorf(r); + ptrdiff_t src_row2 = src_row + 1; + + f32 rweight = src_row2 - r; + buf2[row] = floorf(rweight * (1 << SHIFT_BITS) + 0.5f); + buf[0 * dsize.height + row] = std::max(0, src_row); + buf[1 * dsize.height + row] = std::min((ptrdiff_t)ssize.height-1, src_row2); + } + +#define USE_CORRECT_VERSION 0 + + ptrdiff_t col = 0; +/***********************************************/ + for(; col <= (ptrdiff_t)dsize.width-16; col+=16) + { + ptrdiff_t col1[16]; + ptrdiff_t col2[16]; + s16 cwi[16]; + + for(s32 k = 0; k < 16; ++k) + { + f32 c = (col + k) * wr + scale_x_offset; + col1[k] = (ptrdiff_t)c; + col2[k] = col1[k] + 1; + + cwi[k] = (short)floorf((col2[k] - c) * (1 << SHIFT_BITS) + 0.5f); + + if(col1[k] < 0) col1[k] = 0; + if(col2[k] >= (ptrdiff_t)ssize.width) col2[k] = ssize.width-1; + } + + ptrdiff_t x = std::min(col1[0], (ptrdiff_t)ssize.width-16); + ptrdiff_t y = std::min(col1[8], (ptrdiff_t)ssize.width-16); + u8 lutl[16]; + u8 luth[16]; + for(s32 k = 0; k < 8; ++k) + { + lutl[k] = (u8)(col1[k] - x); + luth[k] = (u8)(col2[k] - x); + lutl[k+8] = (u8)(col1[k+8] - y); + luth[k+8] = (u8)(col2[k+8] - y); + } + + uint8x8_t vlutl = vld1_u8(lutl); + uint8x8_t vluth = vld1_u8(luth); + int16x8_t vcw = vld1q_s16(cwi); + + uint8x8_t vlutl_ = vld1_u8(lutl+8); + uint8x8_t vluth_ = vld1_u8(luth+8); + int16x8_t vcw_ = vld1q_s16(cwi+8); + + for(ptrdiff_t row = 0; row < (ptrdiff_t)dsize.height; ++row) + { +#if USE_CORRECT_VERSION + int32x4_t vrw = vdupq_n_s32(buf2[row]); +#else + int16x8_t vrw = vdupq_n_s16((int16_t)buf2[row]); + int16x8_t vrW = vdupq_n_s16((int16_t)((1 << SHIFT_BITS) - buf2[row])); +#endif + + internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 2*srcStride); + internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 3*srcStride); + + { + union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + x) }; + union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x) }; + + uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl); + uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth); + uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl); + uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth); + + uint16x8_t v1hw = vmovl_u8(vr1h); + uint16x8_t v2hw = vmovl_u8(vr2h); + + int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h)); + int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h)); + + int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS)); + int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS)); + int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS)); + int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS)); + + v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw)); + v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw)); + v2L = 
vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw)); + v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw)); + +#if USE_CORRECT_VERSION + /* correct version */ + int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS); + int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS); + int32x4_t vdiffL = vsubq_s32(v1L, v2L); + int32x4_t vdiffH = vsubq_s32(v1H, v2H); + + vL = vmlaq_s32(vL, vdiffL, vrw); + vH = vmlaq_s32(vH, vdiffH, vrw); + uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8); + uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8); + uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8); + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); +#else + /* ugly version matching to OpenCV's SSE optimization */ + int16x4_t v1Ls = vshrn_n_s32(v1L, 4); + int16x4_t v1Hs = vshrn_n_s32(v1H, 4); + int16x4_t v2Ls = vshrn_n_s32(v2L, 4); + int16x4_t v2Hs = vshrn_n_s32(v2H, 4); + + int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); + int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); + + int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1)); + uint8x8_t vres = vqrshrun_n_s16(vsum, 2); + + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); +#endif + } + + { + union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + y) }; + union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + y) }; + + uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl_); + uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth_); + uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl_); + uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth_); + + uint16x8_t v1hw = vmovl_u8(vr1h); + uint16x8_t v2hw = vmovl_u8(vr2h); + + int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h)); + int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h)); + + int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS)); + int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS)); + int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS)); + int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS)); + + v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw_)); + v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw_)); + v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw_)); + v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw_)); + +#if USE_CORRECT_VERSION + /* correct version */ + int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS); + int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS); + int32x4_t vdiffL = vsubq_s32(v1L, v2L); + int32x4_t vdiffH = vsubq_s32(v1H, v2H); + + vL = vmlaq_s32(vL, vdiffL, vrw); + vH = vmlaq_s32(vH, vdiffH, vrw); + uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8); + uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8); + uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8); + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col + 8, vres); +#else + /* ugly version matching to OpenCV's SSE optimization */ + int16x4_t v1Ls = vshrn_n_s32(v1L, 4); + int16x4_t v1Hs = vshrn_n_s32(v1H, 4); + int16x4_t v2Ls = vshrn_n_s32(v2L, 4); + int16x4_t v2Hs = vshrn_n_s32(v2H, 4); + + int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); + int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); + + int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1)); + uint8x8_t vres = vqrshrun_n_s16(vsum, 2); + + vst1_u8(internal::getRowPtr(dstBase, dstStride, 
row) + col + 8, vres); +#endif + } + } + } +/***********************************************/ + for(; col <= (ptrdiff_t)dsize.width-8; col+=8) + { +downsample_bilinear_8uc1_col_loop8: + ptrdiff_t col1[8]; + ptrdiff_t col2[8]; + s16 cwi[8]; + + for(s32 k = 0; k < 8; ++k) + { + f32 c = (col + k) * wr + scale_x_offset; + col1[k] = (ptrdiff_t)c; + col2[k] = col1[k] + 1; + + cwi[k] = (s16)floorf((col2[k] - c) * (1 << SHIFT_BITS) + 0.5f); + + if(col1[k] < 0) col1[k] = 0; + if(col2[k] >= (ptrdiff_t)ssize.width) col2[k] = (ptrdiff_t)ssize.width-1; + } + + ptrdiff_t x = std::min(col1[0], (ptrdiff_t)ssize.width-16); + u8 lutl[8]; + u8 luth[8]; + for(s32 k = 0; k < 8; ++k) + { + lutl[k] = (u8)(col1[k] - x); + luth[k] = (u8)(col2[k] - x); + } + + uint8x8_t vlutl = vld1_u8(lutl); + uint8x8_t vluth = vld1_u8(luth); + int16x8_t vcw = vld1q_s16(cwi); + + for(ptrdiff_t row = 0; row < (ptrdiff_t)dsize.height; ++row) + { +#if USE_CORRECT_VERSION + int32x4_t vrw = vdupq_n_s32(buf2[row]); +#else + int16x8_t vrw = vdupq_n_s16((int16_t)buf2[row]); + int16x8_t vrW = vdupq_n_s16((int16_t)((1 << SHIFT_BITS) - buf2[row])); +#endif + + internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 2*srcStride); + internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 3*srcStride); + + union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + x) }; + union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x) }; + + uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl); + uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth); + uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl); + uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth); + + uint16x8_t v1hw = vmovl_u8(vr1h); + uint16x8_t v2hw = vmovl_u8(vr2h); + + int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h)); + int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h)); + + int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS)); + int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS)); + int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS)); + int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS)); + + v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw)); + v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw)); + v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw)); + v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw)); + +#if USE_CORRECT_VERSION + /* correct version */ + int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS); + int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS); + int32x4_t vdiffL = vsubq_s32(v1L, v2L); + int32x4_t vdiffH = vsubq_s32(v1H, v2H); + + vL = vmlaq_s32(vL, vdiffL, vrw); + vH = vmlaq_s32(vH, vdiffH, vrw); + uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8); + uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8); + uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8); + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); +#else + /* ugly version matching to OpenCV's SSE optimization */ + int16x4_t v1Ls = vshrn_n_s32(v1L, 4); + int16x4_t v1Hs = vshrn_n_s32(v1H, 4); + int16x4_t v2Ls = vshrn_n_s32(v2L, 4); + int16x4_t v2Hs = vshrn_n_s32(v2H, 4); + + int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); + int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); + + int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), 
vshrq_n_s16(v2s,1)); + uint8x8_t vres = vqrshrun_n_s16(vsum, 2); + + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); +#endif + } + } + if (col < (ptrdiff_t)dsize.width) + { + col = dsize.width - 8; + goto downsample_bilinear_8uc1_col_loop8; + } +} + +} // namespace + +#endif + +void resizeLinearOpenCV(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + internal::assertSupportedConfiguration(wr > 0 && hr > 0 && + (dsize.width - 0.5) * wr - 0.5 < ssize.width && + (dsize.height - 0.5) * hr - 0.5 < ssize.height && // Ensure we have enough source data + (dsize.width + 0.5) * wr + 0.5 >= ssize.width && + (dsize.height + 0.5) * hr + 0.5 >= ssize.height && // Ensure source isn't too big + isResizeLinearOpenCVSupported(ssize, dsize, channels)); +#ifdef CAROTENE_NEON + if(1 == channels) + { + if (wr <= 1.f && hr <= 1.f) + resizeLinearOpenCVchan<1>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr); + else if (wr <= 2.0f && hr <= 2.0f && ssize.width >= 16) + downsample_bilinear_8uc1(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr); + else + resizeLinearOpenCVchan<1>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr); + } + else if(4 == channels) + resizeLinearOpenCVchan<4>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr); +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)wr; + (void)hr; + (void)channels; +#endif +} + +void resizeLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + internal::assertSupportedConfiguration(wr > 0 && hr > 0 && + (dsize.width - 0.5) * wr - 0.5 < ssize.width && + (dsize.height - 0.5) * hr - 0.5 < ssize.height && // Ensure we have enough source data + (dsize.width + 0.5) * wr + 0.5 >= ssize.width && + (dsize.height + 0.5) * hr + 0.5 >= ssize.height && // Ensure source isn't too big + isResizeLinearSupported(ssize, dsize, + wr, hr, channels)); +#ifdef CAROTENE_NEON + f32 scale_x = wr; + f32 scale_x_offset = 0.5f * scale_x - 0.5f; + f32 scale_y = hr; + f32 scale_y_offset = 0.5f * scale_y - 0.5f; + + std::vector _buf(dsize.height * 3 + 1); + std::vector coeff(dsize.height); + ptrdiff_t * buf = &_buf[0]; + + for (size_t row = 0; row < dsize.height; ++row) + { + f32 r = row * scale_y + scale_y_offset; + ptrdiff_t src_row = floorf(r); + ptrdiff_t src_row2 = src_row + 1; + + f32 rweight = src_row2 - r; + buf[0 * dsize.height + row] = std::max(0, src_row); + buf[1 * dsize.height + row] = std::min(ssize.height - 1, src_row2); + coeff[row] = rweight; + } + + size_t col = 0; + for ( ; col + 16 <= dsize.width; col += 16) + { + ptrdiff_t col1[16], col2[16]; + f32 cwi[16]; + + for(s32 k = 0; k < 16; ++k) + { + f32 c = (col + k) * scale_x + scale_x_offset; + col1[k] = floorf(c); + col2[k] = col1[k] + 1; + + cwi[k] = col2[k] - c; + + if (col1[k] < 0) + col1[k] = 0; + if (col2[k] >= (ptrdiff_t)ssize.width) + col2[k] = ssize.width - 1; + } + + ptrdiff_t x = std::min(col1[0], ssize.width - 16); + ptrdiff_t y = std::min(col1[8], ssize.width - 16); + u8 lutl[16], luth[16]; + + for (s32 k = 0; k < 8; ++k) + { + lutl[k] = (u8)(col1[k] - x); + luth[k] = (u8)(col2[k] - x); + lutl[k + 8] = (u8)(col1[k + 8] - y); + luth[k + 8] = (u8)(col2[k + 8] - y); + } + + uint8x8_t vlutl = vld1_u8(lutl); + uint8x8_t vluth = vld1_u8(luth); + float32x4_t vcw0 = 
vld1q_f32(cwi); + float32x4_t vcw1 = vld1q_f32(cwi + 4); + + uint8x8_t vlutl_ = vld1_u8(lutl + 8); + uint8x8_t vluth_ = vld1_u8(luth + 8); + float32x4_t vcw0_ = vld1q_f32(cwi + 8); + float32x4_t vcw1_ = vld1q_f32(cwi + 12); + + if (channels == 1) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x8_t vres0 = resizeLinearStep(vld1q_u8(srow0 + x), vld1q_u8(srow1 + x), + vlutl, vluth, + vrw, vcw0, vcw1); + + uint8x8_t vres1 = resizeLinearStep(vld1q_u8(srow0 + y), vld1q_u8(srow1 + y), + vlutl_, vluth_, + vrw, vcw0_, vcw1_); + + vst1q_u8(drow + col, vcombine_u8(vres0, vres1)); + } + } + else if (channels == 3) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x16x3_t v_src10 = vld3q_u8(srow0 + (x * 3)); + uint8x16x3_t v_src20 = vld3q_u8(srow1 + (x * 3)); + + uint8x16x3_t v_src11 = vld3q_u8(srow0 + (y * 3)); + uint8x16x3_t v_src21 = vld3q_u8(srow1 + (y * 3)); + + uint8x16x3_t v_dst; + + v_dst.val[0] = vcombine_u8(resizeLinearStep(v_src10.val[0], v_src20.val[0], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[0], v_src21.val[0], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[1] = vcombine_u8(resizeLinearStep(v_src10.val[1], v_src20.val[1], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[1], v_src21.val[1], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[2] = vcombine_u8(resizeLinearStep(v_src10.val[2], v_src20.val[2], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[2], v_src21.val[2], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + + vst3q_u8(drow + (col * 3), v_dst); + } + } + else if (channels == 4) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x16x4_t v_src10 = vld4q_u8(srow0 + (x << 2)); + uint8x16x4_t v_src20 = vld4q_u8(srow1 + (x << 2)); + + uint8x16x4_t v_src11 = vld4q_u8(srow0 + (y << 2)); + uint8x16x4_t v_src21 = vld4q_u8(srow1 + (y << 2)); + + uint8x16x4_t v_dst; + + v_dst.val[0] = vcombine_u8(resizeLinearStep(v_src10.val[0], v_src20.val[0], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[0], v_src21.val[0], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[1] = vcombine_u8(resizeLinearStep(v_src10.val[1], v_src20.val[1], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[1], v_src21.val[1], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[2] = vcombine_u8(resizeLinearStep(v_src10.val[2], v_src20.val[2], vlutl, vluth, vrw, 
vcw0, vcw1), + resizeLinearStep(v_src11.val[2], v_src21.val[2], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[3] = vcombine_u8(resizeLinearStep(v_src10.val[3], v_src20.val[3], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[3], v_src21.val[3], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + + vst4q_u8(drow + (col << 2), v_dst); + } + } + } + + for ( ; col + 8 <= dsize.width; col += 8) + { +downsample_bilinear_8uc1_col_loop8: + ptrdiff_t col1[8], col2[8]; + f32 cwi[8]; + + for (s32 k = 0; k < 8; ++k) + { + f32 c = (col + k) * scale_x + scale_x_offset; + col1[k] = floorf(c); + col2[k] = col1[k] + 1; + + cwi[k] = col2[k] - c; + + if (col1[k] < 0) + col1[k] = 0; + if (col2[k] >= (ptrdiff_t)ssize.width) + col2[k] = ssize.width - 1; + } + + ptrdiff_t x = std::min(col1[0], ssize.width - 16); + u8 lutl[8], luth[8]; + for (s32 k = 0; k < 8; ++k) + { + lutl[k] = (u8)(col1[k] - x); + luth[k] = (u8)(col2[k] - x); + } + + uint8x8_t vlutl = vld1_u8(lutl); + uint8x8_t vluth = vld1_u8(luth); + float32x4_t vcw0 = vld1q_f32(cwi); + float32x4_t vcw1 = vld1q_f32(cwi + 4); + + if (channels == 1) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x8_t vres = resizeLinearStep(vld1q_u8(srow0 + x), vld1q_u8(srow1 + x), + vlutl, vluth, + vrw, vcw0, vcw1); + vst1_u8(drow + col, vres); + } + } + else if (channels == 3) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x16x3_t v_src1 = vld3q_u8(srow0 + (x * 3)); + uint8x16x3_t v_src2 = vld3q_u8(srow1 + (x * 3)); + + uint8x8x3_t v_dst; + + v_dst.val[0] = resizeLinearStep(v_src1.val[0], v_src2.val[0], vlutl, vluth, vrw, vcw0, vcw1); + v_dst.val[1] = resizeLinearStep(v_src1.val[1], v_src2.val[1], vlutl, vluth, vrw, vcw0, vcw1); + v_dst.val[2] = resizeLinearStep(v_src1.val[2], v_src2.val[2], vlutl, vluth, vrw, vcw0, vcw1); + + vst3_u8(drow + (col * 3), v_dst); + } + } + else if (channels == 4) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x16x4_t v_src1 = vld4q_u8(srow0 + (x << 2)); + uint8x16x4_t v_src2 = vld4q_u8(srow1 + (x << 2)); + + uint8x8x4_t v_dst; + + v_dst.val[0] = resizeLinearStep(v_src1.val[0], v_src2.val[0], vlutl, vluth, vrw, vcw0, vcw1); + v_dst.val[1] = resizeLinearStep(v_src1.val[1], v_src2.val[1], vlutl, vluth, vrw, vcw0, vcw1); + v_dst.val[2] = resizeLinearStep(v_src1.val[2], v_src2.val[2], vlutl, vluth, vrw, vcw0, vcw1); + v_dst.val[3] = 
resizeLinearStep(v_src1.val[3], v_src2.val[3], vlutl, vluth, vrw, vcw0, vcw1); + + vst4_u8(drow + (col << 2), v_dst); + } + } + } + + if (col < dsize.width) + { + col = dsize.width - 8; + goto downsample_bilinear_8uc1_col_loop8; + } + +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)wr; + (void)hr; + (void)channels; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/saturate_cast.hpp b/3rdparty/carotene/src/saturate_cast.hpp new file mode 100644 index 0000000000..98f8545009 --- /dev/null +++ b/3rdparty/carotene/src/saturate_cast.hpp @@ -0,0 +1,199 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#ifndef CAROTENE_SATURATE_CAST_HPP
+#define CAROTENE_SATURATE_CAST_HPP
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+
+#if defined _MSC_VER && defined _M_ARM
+# include <intrin.h>
+#endif
+
+#include <carotene/definitions.hpp>
+#include <carotene/types.hpp>
+
+namespace CAROTENE_NS { namespace internal {
+
+#if defined _MSC_VER && defined _M_ARM
+
+__declspec(naked) static void vcvtr_s32_f64_imp(f64 d)
+{
+    (void)d;
+    __emit(0xEEBD); // vcvtr.s32.f64 s0, d0
+    __emit(0x0B40);
+    __emit(0xEE10); // vmov r0, s0
+    __emit(0x0A10);
+    __emit(0x4770); // bx lr
+}
+
+# define CAROTENE_ROUND_FLT(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)((f64)x);
+# define CAROTENE_ROUND_DBL(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)(x);
+
+#elif defined CV_ICC || defined __GNUC__
+
+# if defined(__VFP_FP__) && !defined(__SOFTFP__) && !(defined _DEBUG || defined DEBUG) && !defined(__CUDACC__)
+#  define CAROTENE_ROUND_FLT(value) { \
+        register union { f32 f; s32 i; } result; \
+        asm ("ftosis %0, %1 \n" : "=w" (result.f) : "w" (value) ); \
+        return result.i; }
+#  define CAROTENE_ROUND_DBL(value) { \
+        register union { f32 f; s32 i; } __tegra_result; \
+        asm ( \
+            "ftosid %0, %P1\n" \
+            : "=w" (__tegra_result.f) \
+            : "w" (value) \
+        ); \
+        return __tegra_result.i; \
+    }
+# else
+#  define CAROTENE_ROUND_FLT(value) return (s32)lrintf(value);
+#  define CAROTENE_ROUND_DBL(value) return (s32)lrint(value);
+# endif
+
+#endif
+
+inline s32 round(f32 value)
+{
+#ifdef CAROTENE_ROUND_FLT
+    CAROTENE_ROUND_FLT(value)
+#else
+    s32 intpart = (s32)(value);
+    f32 fractpart = value - intpart;
+    if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
+        return (s32)(value + (value >= 0 ? 0.5 : -0.5));
+    else
+        return intpart;
+#endif
+}
+
+inline s32 round(f64 value)
+{
+#ifdef CAROTENE_ROUND_DBL
+    CAROTENE_ROUND_DBL(value)
+#else
+    s32 intpart = (s32)(value);
+    f64 fractpart = value - intpart;
+    if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
+        return (s32)(value + (value >= 0 ? 0.5 : -0.5));
+    else
+        return intpart;
+#endif
+}
+/////////////// saturate_cast (used in image & signal processing) ///////////////////
+
+template<typename _Tp> inline _Tp saturate_cast(u8 v)  { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s8 v)  { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(u16 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s16 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(u32 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s32 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s64 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(u64 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(f32 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(f64 v) { return _Tp(v); }
+
+template<> inline u8 saturate_cast<u8>(s8 v)  { return (u8)std::max((s32)v, 0); }
+template<> inline u8 saturate_cast<u8>(u16 v) { return (u8)std::min((u32)v, (u32)UCHAR_MAX); }
+template<> inline u8 saturate_cast<u8>(s32 v) { return (u8)((u32)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline u8 saturate_cast<u8>(s16 v) { return saturate_cast<u8>((s32)v); }
+template<> inline u8 saturate_cast<u8>(u32 v) { return (u8)std::min(v, (u32)UCHAR_MAX); }
+template<> inline u8 saturate_cast<u8>(s64 v) { return (u8)((u64)v <= UCHAR_MAX ? v : v > 0 ?
UCHAR_MAX : 0); }
+template<> inline u8 saturate_cast<u8>(u64 v) { return (u8)std::min(v, (u64)UCHAR_MAX); }
+template<> inline u8 saturate_cast<u8>(f32 v) { return saturate_cast<u8>(round(v)); }
+template<> inline u8 saturate_cast<u8>(f64 v) { return saturate_cast<u8>(round(v)); }
+
+template<> inline s8 saturate_cast<s8>(u8 v)  { return (s8)std::min((s32)v, SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(u16 v) { return (s8)std::min((u32)v, (u32)SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(s32 v) { return (s8)((u32)(v-SCHAR_MIN) <= (u32)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline s8 saturate_cast<s8>(s16 v) { return saturate_cast<s8>((s32)v); }
+template<> inline s8 saturate_cast<s8>(u32 v) { return (s8)std::min(v, (u32)SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(s64 v) { return (s8)((u64)(v-SCHAR_MIN) <= (u64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline s8 saturate_cast<s8>(u64 v) { return (s8)std::min(v, (u64)SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(f32 v) { return saturate_cast<s8>(round(v)); }
+template<> inline s8 saturate_cast<s8>(f64 v) { return saturate_cast<s8>(round(v)); }
+
+template<> inline u16 saturate_cast<u16>(s8 v)  { return (u16)std::max((s32)v, 0); }
+template<> inline u16 saturate_cast<u16>(s16 v) { return (u16)std::max((s32)v, 0); }
+template<> inline u16 saturate_cast<u16>(s32 v) { return (u16)((u32)v <= (u32)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline u16 saturate_cast<u16>(u32 v) { return (u16)std::min(v, (u32)USHRT_MAX); }
+template<> inline u16 saturate_cast<u16>(s64 v) { return (u16)((u64)v <= (u64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline u16 saturate_cast<u16>(u64 v) { return (u16)std::min(v, (u64)USHRT_MAX); }
+template<> inline u16 saturate_cast<u16>(f32 v) { return saturate_cast<u16>(round(v)); }
+template<> inline u16 saturate_cast<u16>(f64 v) { return saturate_cast<u16>(round(v)); }
+
+template<> inline s16 saturate_cast<s16>(u16 v) { return (s16)std::min((s32)v, SHRT_MAX); }
+template<> inline s16 saturate_cast<s16>(s32 v) { return (s16)((u32)(v - SHRT_MIN) <= (u32)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline s16 saturate_cast<s16>(u32 v) { return (s16)std::min(v, (u32)SHRT_MAX); }
+template<> inline s16 saturate_cast<s16>(s64 v) { return (s16)((u64)(v - SHRT_MIN) <= (u64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline s16 saturate_cast<s16>(u64 v) { return (s16)std::min(v, (u64)SHRT_MAX); }
+template<> inline s16 saturate_cast<s16>(f32 v) { return saturate_cast<s16>(round(v)); }
+template<> inline s16 saturate_cast<s16>(f64 v) { return saturate_cast<s16>(round(v)); }
+
+template<> inline u32 saturate_cast<u32>(s8 v)  { return (u32)std::max(v, (s8)0); }
+template<> inline u32 saturate_cast<u32>(s16 v) { return (u32)std::max(v, (s16)0); }
+template<> inline u32 saturate_cast<u32>(s32 v) { return (u32)std::max(v, (s32)0); }
+template<> inline u32 saturate_cast<u32>(s64 v) { return (u32)((u64)v <= (u64)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); }
+template<> inline u32 saturate_cast<u32>(u64 v) { return (u32)std::min(v, (u64)UINT_MAX); }
+//OpenCV like f32/f64 -> u32 conversion
+//we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
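+//For example (behaviour implied by the two overloads below):
+//  saturate_cast<u32>( 3.7f) == 4           (round to nearest)
+//  saturate_cast<u32>(-1.0f) == 0xffffffff  (the negative value wraps instead of clamping to 0)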
+template<> inline u32 saturate_cast<u32>(f32 v) { return round(v); }
+template<> inline u32 saturate_cast<u32>(f64 v) { return round(v); }
+//Negative clipping implementation:
+//template<> inline u32 saturate_cast<u32>(f32 v) { return saturate_cast<u32>(round(v)); }
+//template<> inline u32 saturate_cast<u32>(f64 v) { return saturate_cast<u32>(round(v)); }
+
+template<> inline s32 saturate_cast<s32>(u32 v) { return (s32)std::min(v, (u32)INT_MAX); }
+template<> inline s32 saturate_cast<s32>(s64 v) { return (s32)((u64)(v - INT_MIN) <= (u64)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); }
+template<> inline s32 saturate_cast<s32>(u64 v) { return (s32)std::min(v, (u64)INT_MAX); }
+template<> inline s32 saturate_cast<s32>(f32 v) { return round(v); }
+template<> inline s32 saturate_cast<s32>(f64 v) { return round(v); }
+
+template<> inline u64 saturate_cast<u64>(s8 v)  { return (u64)std::max(v, (s8)0); }
+template<> inline u64 saturate_cast<u64>(s16 v) { return (u64)std::max(v, (s16)0); }
+template<> inline u64 saturate_cast<u64>(s32 v) { return (u64)std::max(v, (s32)0); }
+template<> inline u64 saturate_cast<u64>(s64 v) { return (u64)std::max(v, (s64)0); }
+
+template<> inline s64 saturate_cast<s64>(u64 v) { return (s64)std::min(v, (u64)LLONG_MAX); }
+
+} }
+
+#endif
diff --git a/3rdparty/carotene/src/scharr.cpp b/3rdparty/carotene/src/scharr.cpp
new file mode 100644
index 0000000000..2c4ba29742
--- /dev/null
+++ b/3rdparty/carotene/src/scharr.cpp
@@ -0,0 +1,219 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <vector>
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
+{
+    return (dx == 0 && dy == 1 &&
+            isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin)) ||
+           (dx == 1 && dy == 0 &&
+            isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin));
+}
+
+void Scharr3x3(const Size2D &size,
+               const u8 * srcBase, ptrdiff_t srcStride,
+               s16 * dstBase, ptrdiff_t dstStride,
+               s32 dx, s32 dy,
+               BORDER_MODE border, u8 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
+#ifdef CAROTENE_NEON
+    // Scharr is separable: [3 10 3] smoothing along one axis, [-1 0 1]
+    // derivative along the other.
+    static s16 dw[] = {3, 10, 3};
+
+    if (dy == 1)
+        SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
+                           3, 1, dw, 0,
+                           border, borderValue, borderMargin);
+    else
+        SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
+                           1, 3, 0, dw,
+                           border, borderValue, borderMargin);
+#else
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderValue;
+#endif
+}
+
+void ScharrDeriv(const Size2D &size, s32 cn,
+                 const u8 * srcBase, ptrdiff_t srcStride,
+                 s16 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t colsn = size.width*cn;
+    size_t roiw8 = colsn > 7 ? colsn - 7 : 0;
+
+    ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
+    std::vector<s16> _tempBuf((delta << 1) + 64);
+    s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);
+
+    int16x8_t vc3 = vmovq_n_s16(3);
+    int16x8_t vc10 = vmovq_n_s16(10);
+    uint8x8_t v8c10 = vmov_n_u8(10);
+
+    for(size_t y = 0; y < size.height; y++ )
+    {
+        const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
+        const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
+        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        // do vertical convolution
+        size_t x = 0;
+        for( ; x < roiw8; x += 8 )
+        {
+            internal::prefetch(srow0 + x);
+            internal::prefetch(srow1 + x);
+            internal::prefetch(srow2 + x);
+#if __GNUC_MINOR__ < 7
+            __asm__ (
+                "vld1.8 {d0}, [%[src0]]                \n\t"
+                "vld1.8 {d2}, [%[src2]]                \n\t"
+                "vld1.8 {d1}, [%[src1]]                \n\t"
+                "vaddl.u8 q2, d2, d0                   \n\t"
+                "vmull.u8 q3, d1, %[vc10]              \n\t"
+                "vsubl.u8 q4, d2, d0                   \n\t"
+                "vmla.s16 q3, q2, %q[vc3]              \n\t"
+                "vst1.16 {d8-d9}, [%[out1],:128]       \n\t"
+                "vst1.16 {d6-d7}, [%[out0],:128]       \n\t"
+                :
+                : [out0] "r" (trow0 + x),
+                  [out1] "r" (trow1 + x),
+                  [src0] "r" (srow0 + x),
+                  [src1] "r" (srow1 + x),
+                  [src2] "r" (srow2 + x),
+                  [vc10] "w" (v8c10), [vc3] "w" (vc3)
+                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
+            );
+#else
+            uint8x8_t s0 = vld1_u8(srow0 + x);
+            uint8x8_t s1 = vld1_u8(srow1 + x);
+            uint8x8_t s2 = vld1_u8(srow2 + x);
+
+            int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
+            int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
+            int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
+            int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);
+
+            vst1q_s16(trow1 + x, t1);
+            vst1q_s16(trow0 + x, t0);
+#endif
+        }
+        for( ; x < colsn; x++ )
+        {
+            trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
+            trow1[x] = (s16)(srow2[x] - srow0[x]);
+        }
+
+        // make border
+        size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ?
(size.width-2)*cn : 0); + for( s32 k = 0; k < cn; k++ ) + { + trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; + trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; + } + + // do horizontal convolution, interleave the results and store them to dst + x = 0; + for( ; x < roiw8; x += 8 ) + { +#if __GNUC_MINOR__ < 6 + __asm__ ( + "vld1.16 {d4-d5}, [%[s2ptr]] \n\t" + "vld1.16 {d8-d9}, [%[s4ptr]] \n\t" + "vld1.16 {d6-d7}, [%[s3ptr],:128] \n\t" + "vld1.16 {d0-d1}, [%[s0ptr]] \n\t" + "vld1.16 {d2-d3}, [%[s1ptr]] \n\t" + "vadd.i16 q7, q2, q4 \n\t" + "vmul.s16 q6, q3, %q[vc10] \n\t" + "vsub.s16 q5, q1, q0 \n\t" + "vmla.s16 q6, q7, %q[vc3] \n\t" + "vst2.16 {d10-d13}, [%[out]] \n\t" + : + : [out] "r" (drow + x * 2), + [s0ptr] "r" (trow0 + x - cn), + [s1ptr] "r" (trow0 + x + cn), + [s2ptr] "r" (trow1 + x - cn), + [s3ptr] "r" (trow1 + x), + [s4ptr] "r" (trow1 + x + cn), + [vc10] "w" (vc10), [vc3] "w" (vc3) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15" + ); +#else + int16x8_t s0 = vld1q_s16(trow0 + x - cn); + int16x8_t s1 = vld1q_s16(trow0 + x + cn); + int16x8_t s2 = vld1q_s16(trow1 + x - cn); + int16x8_t s3 = vld1q_s16(trow1 + x); + int16x8_t s4 = vld1q_s16(trow1 + x + cn); + + int16x8_t s3x10 = vmulq_s16(s3, vc10); + int16x8_t s24 = vaddq_s16(s2, s4); + + int16x8x2_t vr; + vr.val[0] = vsubq_s16(s1, s0); + vr.val[1] = vmlaq_s16(s3x10, s24, vc3); + + vst2q_s16(drow + x*2, vr); +#endif //__GNUC_MINOR__ < 6 + } + for( ; x < colsn; x++ ) + { + drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]); + drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10); + } + } +#else + (void)size; + (void)cn; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/separable_filter.cpp b/3rdparty/carotene/src/separable_filter.cpp new file mode 100644 index 0000000000..a06172c4e6 --- /dev/null +++ b/3rdparty/carotene/src/separable_filter.cpp @@ -0,0 +1,109 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include "separable_filter.hpp"
+
+namespace CAROTENE_NS {
+
+bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
+{
+    return isSupportedConfiguration() &&
+           size.width >= 9 && size.height >= 1 &&
+           (size.height + borderMargin.top + borderMargin.bottom) >= 2 &&
+           (dx >= 0) && (dx < 4) && (dy >= 0) && (dy < 4) &&
+           (border == BORDER_MODE_CONSTANT   ||
+            border == BORDER_MODE_REFLECT    ||
+            border == BORDER_MODE_REFLECT101 ||
+            border == BORDER_MODE_REPLICATE );
+}
+
+void SeparableFilter3x3(const Size2D &size,
+                        const u8 * srcBase, ptrdiff_t srcStride,
+                        s16 * dstBase, ptrdiff_t dstStride,
+                        const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw,
+                        BORDER_MODE border, u8 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isSeparableFilter3x3Supported(size, border, rowFilter, colFilter, borderMargin));
+#ifdef CAROTENE_NEON
+    if(!((xw || rowFilter < 3) && (yw || colFilter < 3)))
+        std::abort(); // can't use the generic filter without provided weights
+
+    typedef void (*sepFilter3x3_8u16s_func)(const Size2D&, const u8*, ptrdiff_t, s16*, ptrdiff_t,
+                                            const s16*, const s16*, BORDER_MODE, u8, Margin);
+
+    // Indexed as quickFilters[colFilter][rowFilter]: 0 selects the [1 2 1]
+    // smoothing kernel, 1 the [-1 0 1] derivative, 2 the [1 -2 1] second
+    // derivative, and 3 the generic filter driven by the xw/yw weights.
+    static sepFilter3x3_8u16s_func quickFilters[4][4]=
+    {
+        /*d0y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_121>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_121>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_121>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_121>::process},
+
+        /*dy */{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_m101>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_m101>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_m101>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_m101>::process},
+
+        /*d2y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_1m21>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_1m21>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_1m21>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_1m21>::process},
+
+        /*dNy*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16Generic>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16Generic>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16Generic>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16Generic>::process}
+    };
+
+    quickFilters[colFilter][rowFilter](size, srcBase, srcStride, dstBase, dstStride,
+                                       xw, yw, border, borderValue, borderMargin);
+#else
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)xw;
+    (void)yw;
+    (void)borderValue;
+#endif
+}
+
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/separable_filter.hpp b/3rdparty/carotene/src/separable_filter.hpp
new file mode 100644
index 0000000000..b0f7307fa0
--- /dev/null
+++ b/3rdparty/carotene/src/separable_filter.hpp
@@ -0,0 +1,1161 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_SRC_SEPARABLE_FILTER_HPP
+#define CAROTENE_SRC_SEPARABLE_FILTER_HPP
+
+#include "common.hpp"
+
+#include <vector>
+
+#include <algorithm>
+
+#ifdef CAROTENE_NEON
+
+namespace CAROTENE_NS {
+
+namespace internal {
+
+struct RowFilter3x3S16Base
+{
+    typedef u8 srcType;
+    /*
+       Various border types, image boundaries are denoted with '|'
+
+       * BORDER_REPLICATE:   aaaaaa|abcdefgh|hhhhhhh
+       * BORDER_REFLECT:     fedcba|abcdefgh|hgfedcb
+       * BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
+       * BORDER_WRAP:        cdefgh|abcdefgh|abcdefg
+       * BORDER_CONSTANT:    iiiiii|abcdefgh|iiiiiii with some specified 'i'
+    */
+    inline RowFilter3x3S16Base(const BORDER_MODE _borderType, const srcType _borderValue, const ptrdiff_t borderxl, const ptrdiff_t borderxr):
+        borderType(_borderType),borderValue(_borderValue)
+    {
+        // vfmask/vtmask are vtbl1_u8 index vectors that assemble the leading and
+        // trailing border pixels of a row; 0xFF entries select a zero lane, which
+        // is later overwritten with borderValue for BORDER_MODE_CONSTANT.
+        if (borderType == BORDER_MODE_CONSTANT)
+        {
+            vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x00ffFFffFFffFFffULL : 0x0100FFffFFffFFffULL));
+            vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0xFF07060504030201ULL : 0x0706050403020100ULL));
+        }
+        else if (borderType == BORDER_MODE_REFLECT101)
+        {
+            vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0001FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
+            vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0607060504030201ULL : 0x0706050403020100ULL));
+        }
+        else //if (borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REPLICATE)
+        {
+            vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
+            vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ?
0x0707060504030201ULL : 0x0706050403020100ULL)); + } + lookLeft = offsetk - borderxl; + lookRight = offsetk - borderxr; + } + + uint8x8_t vfmask; + uint8x8_t vtmask; + enum { offsetk = 1}; + ptrdiff_t lookLeft; + ptrdiff_t lookRight; + const BORDER_MODE borderType; + const srcType borderValue; +}; + +struct ColFilter3x3S16Base +{ + typedef s16 srcType; + + inline ColFilter3x3S16Base(const BORDER_MODE _borderType, const srcType _borderValue): + borderType(_borderType),borderValue(_borderValue) {} + + enum { offsetk = 1}; + const BORDER_MODE borderType; + const srcType borderValue; +}; + +struct RowFilter3x3S16Generic : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16Generic(BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16 *w): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter( (w[0]+w[1]+w[2]) * borderValue ) + { + vw0 = vdupq_n_s16(w[0]); + vw1 = vdupq_n_s16(w[1]); + vw2 = vdupq_n_s16(w[2]); + } + + int16x8_t vw0; + int16x8_t vw1; + int16x8_t vw2; + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + uint8x8_t l18u = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 6))), vw0), + vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 7))), vw1), + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l18u)), vw2))); + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l18u, l, 6))), vw0), + vreinterpretq_s16_u16(vmovl_u8(vext_u8(l18u, l, 7))), vw1), + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l)), vw2))); + } + if (i < width - 8 + lookRight) + { + uint8x8_t l18u = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 6))), vw0), + vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 7))), vw1), + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l18u)), vw2))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); + + int16x8_t l0 = vreinterpretq_s16_u16(vmovl_u8(tail0)); + int16x8_t l1 = vreinterpretq_s16_u16(vmovl_u8(tail1)); + int16x8_t l2 = vreinterpretq_s16_u16(vmovl_u8(tail2)); + + int16x8_t l0w = vmulq_s16(l0, vw0); + int16x8_t l2w = vmulq_s16(l2, vw2); + int16x8_t ls = vaddq_s16(vmlaq_s16(l0w, l1, vw1), l2w); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct RowFilter3x3S16_m101 : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16_m101(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(0) {} + + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = 
vtbl1_u8(vld1_u8(src - lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vreinterpretq_s16_u16(vsubl_u8(l2, vext_u8(l, l2, 6)))); + + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vreinterpretq_s16_u16(vsubl_u8(l, vext_u8(l2, l, 6)))); + } + + if (i < width - 8 + lookRight) + { + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vreinterpretq_s16_u16(vsubl_u8(l2, vext_u8(l, l2, 6)))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + + int16x8_t ls = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0)); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct RowFilter3x3S16_121 : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16_121(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(borderValue << 2) {} + + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l2, l, 6), l)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l2, l, 7), 1)))); + } + + if (i < width - 8 + lookRight) + { + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); + + int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail0, tail2)); + int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1)); + + int16x8_t ls = vqaddq_s16(tail02, tail1x2); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct RowFilter3x3S16_1m21 : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16_1m21(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(0) {} + + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = vtbl1_u8(vld1_u8(src - 
lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l2, l, 6), l)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l2, l, 7), 1)))); + } + + if (i < width - 8 + lookRight) + { + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); + + int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail0, tail2)); + int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1)); + + int16x8_t ls = vqsubq_s16(tail02, tail1x2); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct ColFilter3x3S16Generic : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16Generic(const BORDER_MODE _borderType, const srcType _borderValue, const s16 *w): + ColFilter3x3S16Base(_borderType, _borderValue) + { + vw0 = vdupq_n_s16(w[0]); + vw1 = vdupq_n_s16(w[1]); + vw2 = vdupq_n_s16(w[2]); + } + + int16x8_t vw0; + int16x8_t vw1; + int16x8_t vw2; + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + vst1q_s16(dst0 + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j + 8), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j + 8), vw2), line1, vw0), line2, vw1)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + int16x8_t vwl1 = vw0; + int16x8_t vwl2 = 
vw2; + if (src2 == 0) + { + src2 = src0; + vwl1 = vw2; + vwl2 = vw0; + } + + int16x8_t v_border = vdupq_n_s16(0); + if (borderType == BORDER_MODE_CONSTANT) + { + v_border = vmulq_s16(vdupq_n_s16(borderValue), vwl1); + vwl1 = vw1; + } + else if (borderType == BORDER_MODE_REFLECT101) + { + vwl1 = vw1; + vwl2 = vaddq_s16(vw0, vw2); + } + else //replicate\reflect + vwl1 = vaddq_s16(vwl1, vw1); + + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), + vmulq_s16(vld1q_s16(src2 + j), vwl2))); + vst1q_s16(dst + j + 8, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j + 8), vwl1), + vmulq_s16(vld1q_s16(src2 + j + 8), vwl2))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), + vmulq_s16(vld1q_s16(src2 + j), vwl2))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), + vmulq_s16(vld1q_s16(src2 + j), vwl2))); + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), + vld1q_s16(src1 + j), vw1), + vld1q_s16(src2 + j), vw2)); + vst1q_s16(dst + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j + 8), vw0), + vld1q_s16(src1 + j + 8), vw1), + vld1q_s16(src2 + j + 8), vw2)); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), + vld1q_s16(src1 + j), vw1), + vld1q_s16(src2 + j), vw2)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), + vld1q_s16(src1 + j), vw1), + vld1q_s16(src2 + j), vw2)); + } + } + } +}; + +struct ColFilter3x3S16_m101 : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16_m101(const BORDER_MODE _borderType, const srcType _borderValue, const s16 *): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); + vst1q_s16(dst0 + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); + vst1q_s16(dst1 + j + 8, vqsubq_s16(vld1q_s16(src3 + j + 8), vld1q_s16(src1 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (borderType == BORDER_MODE_CONSTANT) + { + int16x8_t v_border = vdupq_n_s16(borderValue); + if (src0 == 0) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border)); + vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), v_border)); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border)); + j += 8; + } + if (j != width) + { + j = width - 8; + 
vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border)); + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); + vst1q_s16(dst + j + 8, vqsubq_s16(v_border, vld1q_s16(src0 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); + } + } + } + else if (borderType == BORDER_MODE_REFLECT101) + { + int16x8_t vzero = vmovq_n_s16(0); + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vzero); + vst1q_s16(dst + j + 8, vzero); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vzero); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vzero); + } + } + else //replicate\reflect + { + if (src0 == 0) src0 = src1; else src2 = src1; + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + } + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + } + } + } +}; + +struct ColFilter3x3S16_121 : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16_121(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + //int16x8_t line0 = vld1q_s16(src0 + j);//1 + //int16x8_t line1 = vld1q_s16(src1 + j);//11 + //int16x8_t line2 = vld1q_s16(src2 + j);// 11 + //int16x8_t line3 = vld1q_s16(src3 + j);// 1 + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + + l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j + 8, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j + 8), line1), l12)); + vst1q_s16(dst1 + j + 8, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j + 8)))); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + 
int16x8_t l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (src2 == 0) + src2 = src0; + + if (borderType == BORDER_MODE_CONSTANT) + { + int16x8_t v_border = vdupq_n_s16(borderValue); + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j)))); + vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j + 8)))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j)))); + } + } + else if (borderType == BORDER_MODE_REFLECT101) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1)); + vst1q_s16(dst + j + 8, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j + 8), + vld1q_s16(src2 + j + 8)), 1)); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1)); + } + } + else //replicate\reflect + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j)))); + + line1 = vld1q_s16(src1 + j + 8); + vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j + 8)))); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j)))); + } + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); + + vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vqaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8)))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); + } + } + } +}; + +struct ColFilter3x3U8_121 : public ColFilter3x3S16Base +{ + typedef u8 dstType; + + inline ColFilter3x3U8_121(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const srcType* src0, const srcType* src1, const srcType* src2, const srcType* src3, dstType* dst0, dstType* dst1, ptrdiff_t width) + 
{ + ptrdiff_t j = 0; + //int16x8_t line0 = vld1q_s16(src0 + j);//1 + //int16x8_t line1 = vld1q_s16(src1 + j);//11 + //int16x8_t line2 = vld1q_s16(src2 + j);// 11 + //int16x8_t line3 = vld1q_s16(src3 + j);// 1 + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); + vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + + l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j + 8, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j + 8), line1), l12), 4)); + vst1_u8(dst1 + j + 8, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j + 8))), 4)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); + vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); + vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); + } + } + + inline void operator()(const srcType* src0, const srcType* src1, const srcType* src2, dstType* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (src2 == 0) + src2 = src0; + + if (borderType == BORDER_MODE_CONSTANT) + { + ptrdiff_t j = 0; + int16x8_t v_border = vdupq_n_s16(borderValue); + for (; j <= width - 16; j += 16) + { + //Store normalized result, essential for gaussianBlur + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); + + vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j + 8))), 4)); + } + if (j <= width - 8) + { + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); + } + } + else if (borderType == BORDER_MODE_REFLECT101) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1), 4)); + vst1_u8(dst + j + 8, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j + 8), + vld1q_s16(src2 + j + 8)), 1), 4)); + } + if (j <= width - 8) + { + vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1), 4)); + } + } + else //replicate\reflect + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); + + line1 = 
vld1q_s16(src1 + j + 8); + vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j + 8))), 4)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); + } + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); + vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8))), 4)); + } + if (j <= width - 8) + { + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); + } + } + } +}; + +struct ColFilter3x3S16_1m21 : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16_1m21(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + //int16x8_t line0 = vld1q_s16(src0 + j);// 1 + //int16x8_t line1 = vld1q_s16(src1 + j);//-1 1 + //int16x8_t line2 = vld1q_s16(src2 + j);// -1 -1 + //int16x8_t line3 = vld1q_s16(src3 + j);// 1 + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + + l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j + 8, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j + 8), line1), l12)); + vst1q_s16(dst1 + j + 8, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j + 8), line2), l12)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (src2 == 0) + src2 = src0; + + if (borderType == BORDER_MODE_CONSTANT) + { + ptrdiff_t j = 0; + int16x8_t v_border = vdupq_n_s16(borderValue); + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, 
vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                    vst1q_s16(dst + j + 8, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j + 8)), vshlq_n_s16(vld1q_s16(src1 + j + 8), 1)));
+                }
+                if (j <= width - 8)
+                {
+                    vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                    j += 8;
+                }
+                if (j != width)
+                {
+                    j = width - 8;
+                    vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                }
+            }
+            else if (borderType == BORDER_MODE_REFLECT101)
+            {
+                ptrdiff_t j = 0;
+                for (; j <= width - 16; j += 16)
+                {
+                    vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
+                    vst1q_s16(dst + j + 8, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src1 + j + 8)), 1));
+                }
+                if (j <= width - 8)
+                {
+                    vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
+                    j += 8;
+                }
+                if (j != width)
+                {
+                    j = width - 8;
+                    vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
+                }
+            }
+            else //replicate\reflect
+            {
+                ptrdiff_t j = 0;
+                for (; j <= width - 16; j += 16)
+                {
+                    vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
+                    vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src1 + j + 8)));
+                }
+                if (j <= width - 8)
+                {
+                    vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
+                    j += 8;
+                }
+                if (j != width)
+                {
+                    j = width - 8;
+                    vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
+                }
+            }
+        }
+        else
+        {
+            ptrdiff_t j = 0;
+            for (; j <= width - 16; j += 16)
+            {
+                vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
+                                              vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                vst1q_s16(dst + j + 8, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8)),
+                                                  vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1)));
+            }
+            if (j <= width - 8)
+            {
+                vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
+                                              vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                j += 8;
+            }
+            if (j != width)
+            {
+                j = width - 8;
+                vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
+                                              vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
+            }
+        }
+    }
+};
+
+template <typename RowFilter, typename ColFilter>
+struct sepFilter3x3
+{
+    typedef typename RowFilter::srcType srcType;
+    typedef typename RowFilter::dstType tmpType;
+    typedef typename ColFilter::dstType dstType;
+
+    static void process(const Size2D &ssize,
+                        const srcType * srcBase, ptrdiff_t srcStride,
+                        dstType * dstBase, ptrdiff_t dstStride,
+                        const s16 *xw, const s16 *yw,
+                        BORDER_MODE borderType, srcType borderValue, Margin borderMargin)
+    {
+        const ptrdiff_t offsetk = 1;
+        ptrdiff_t borderxl, borderxr, borderyt, borderyb;
+        borderxl = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left);
+        borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top);
+        borderxr = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right);
+        borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom);
+
+        std::vector<tmpType> _buf(ssize.width << 2);
+        tmpType * buf = &_buf[0];
+
+        RowFilter filterX(borderType, borderValue, borderxl, borderxr, xw);
+        ColFilter filterY(borderType, filterX.borderFilter, yw);
+        const ptrdiff_t lookTop = offsetk - borderyt;
+        const ptrdiff_t lookBottom = offsetk - borderyb;
+
+        const srcType* src = srcBase - lookTop * srcStride / sizeof(srcType);
+        dstType* dst = dstBase;
+
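+        // buf holds a ring of four row-filtered lines: source row r lands in
+        // buf + ssize.width * (r % 4) (the "+ 4" terms below keep the modulus
+        // non-negative for the top-border rows). Each step of the main loop
+        // row-filters rows ridx and ridx+1, then column-filters the buffered
+        // rows ridx-2 .. ridx+1 into the output rows ridx-1 and ridx.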
+        ptrdiff_t ridx = -lookTop;
+        for (; ridx <= (ptrdiff_t)ssize.height + lookBottom - 2; ridx += 2)
+        {
+            for (ptrdiff_t bidx = 0; bidx < 2; ++bidx, src += srcStride / sizeof(srcType))
+                filterX(src, buf + ssize.width * ((4 + ridx + bidx) % 4), ssize.width);
+
+            if (ridx <= 0)
+            {
+                if (ridx == 0) //first row
+                {
+                    filterY(0, buf + ssize.width * ((ridx + 4) % 4), buf + ssize.width * ((ridx + 1) % 4), dst, ssize.width);
+                    dst += dstStride / sizeof(dstType);
+                }
+                continue;
+            }
+
+            filterY(buf + ssize.width * ((ridx + 2) % 4),
+                    buf + ssize.width * ((ridx + 3) % 4),
+                    buf + ssize.width * ((ridx + 4) % 4),
+                    buf + ssize.width * ((ridx + 1) % 4),
+                    dst, dst + dstStride / sizeof(dstType), ssize.width);
+
+            dst += dstStride * 2 / sizeof(dstType);
+        }
+
+        if (ridx < (ptrdiff_t)ssize.height + lookBottom)
+        {
+            filterX(src, buf + ssize.width * ((4 + ridx) % 4), ssize.width);
+            filterY(buf + ssize.width * ((2 + ridx) % 4),
+                    buf + ssize.width * ((3 + ridx) % 4),
+                    buf + ssize.width * ((4 + ridx) % 4), dst, ssize.width);
+            dst += dstStride / sizeof(dstType);
+            ridx++;
+        }
+        if (lookBottom == 0)
+            filterY(buf + ssize.width * ((ridx + 2) % 4), buf + ssize.width * ((ridx + 3) % 4), 0, dst, ssize.width);
+    }
+};
+
+} //namespace internal
+
+} //namespace CAROTENE_NS
+
+#endif // CAROTENE_NEON
+
+#endif // CAROTENE_SRC_SEPARABLE_FILTER_HPP
diff --git a/3rdparty/carotene/src/sobel.cpp b/3rdparty/carotene/src/sobel.cpp
new file mode 100644
index 0000000000..5d46045d9f
--- /dev/null
+++ b/3rdparty/carotene/src/sobel.cpp
@@ -0,0 +1,317 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
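+
+// Sobel 3x3 is separable: depending on dx/dy each axis applies [1 2 1]
+// (smoothing), [-1 0 1] (first derivative) or [1 -2 1] (second derivative);
+// e.g. dx = 1, dy = 0 uses [1 2 1]^T x [-1 0 1] = [-1 0 1; -2 0 2; -1 0 1].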
+
+#include <vector>
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border,
+                         s32 dx, s32 dy, Margin borderMargin)
+{
+    return dx < 3 && dx >= 0 &&
+           dy < 3 && dy >= 0 &&
+           (dx + dy) > 0 &&
+           isSeparableFilter3x3Supported(size, border, dx, dy, borderMargin);
+}
+
+void Sobel3x3(const Size2D &size,
+              const u8 * srcBase, ptrdiff_t srcStride,
+              s16 * dstBase, ptrdiff_t dstStride,
+              s32 dx, s32 dy,
+              BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isSobel3x3Supported(size, borderType, dx, dy, borderMargin));
+#ifdef CAROTENE_NEON
+    SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
+                       dx, dy, 0, 0,
+                       borderType, borderValue, borderMargin);
+#else
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderValue;
+#endif
+}
+
+bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border,
+                            s32 dx, s32 dy)
+{
+    return isSupportedConfiguration() &&
+           dx < 3 && dx >= 0 &&
+           dy < 3 && dy >= 0 &&
+           (dx + dy) > 0 &&
+           size.width >= 4 && size.height >= 2 &&
+           (border == BORDER_MODE_CONSTANT   ||
+            border == BORDER_MODE_REFLECT    ||
+            border == BORDER_MODE_REFLECT101 ||
+            border == BORDER_MODE_REPLICATE );
+}
+
+void Sobel3x3(const Size2D &size,
+              const f32 * srcBase, ptrdiff_t srcStride,
+              f32 * dstBase, ptrdiff_t dstStride,
+              s32 dx, s32 dy,
+              BORDER_MODE borderType, f32 borderValue)
+{
+    internal::assertSupportedConfiguration(isSobel3x3f32Supported(size, borderType, dx, dy));
+#ifdef CAROTENE_NEON
+    std::vector<f32> _tmp;
+    f32 *tmp = 0;
+    if (borderType == BORDER_MODE_CONSTANT)
+    {
+        _tmp.assign(size.width + 2, borderValue);
+        tmp = &_tmp[1];
+    }
+
+    ptrdiff_t delta = (ptrdiff_t)((size.width + 2 + 31) & -32);//align size
+    std::vector<f32> _tempBuf((delta << 1) + 64);
+    f32 *trow0 = internal::alignPtr(&_tempBuf[1], 32), *trow1 = internal::alignPtr(trow0 + delta, 32);
+
+    for( size_t y = 0; y < size.height; y++ )
+    {
+        const f32* srow0;
+        const f32* srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const f32* srow2;
+        f32* drow = internal::getRowPtr(dstBase, dstStride, y > 0 ? y-1 : 0);
+        f32* drow1 = internal::getRowPtr(dstBase, dstStride, y);
+        if (borderType == BORDER_MODE_REFLECT101) {
+            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
+            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
+        } else if (borderType == BORDER_MODE_CONSTANT) {
+            srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+            srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+        } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
+            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
+        }
+
+        float32x4_t tprev = vmovq_n_f32(0.f);
+        float32x4_t tcurr = vmovq_n_f32(0.f);
+        float32x4_t tnext = vmovq_n_f32(0.f);
+        float32x4_t t0, t1, t2;
+        // do vertical convolution
+        size_t x = 0, bcolsn = y + 2 < size.height ?
size.width : (size.width - 4); + for( ; x <= bcolsn; x += 4 ) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + float32x4_t x0 = vld1q_f32(srow0 + x); + float32x4_t x1 = vld1q_f32(srow1 + x); + float32x4_t x2 = vld1q_f32(srow2 + x); + + tprev = tcurr; + tcurr = tnext; + if(!dy) + { + tnext = vaddq_f32(vaddq_f32(vaddq_f32(x1, x1), x2), x0); + } + else if(dy == 2) + { + tnext = vsubq_f32(vsubq_f32(x2, x1), vsubq_f32(x1, x0)); + } + else + { + tnext = vsubq_f32(x2, x0); + } + + if(!x) { + tcurr = tnext; + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_f32(borderValue,tcurr, 3); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 1),tcurr, 3); + } + else // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE + { + tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 0),tcurr, 3); + } + continue; + } + + internal::prefetch(trow0 + x); + internal::prefetch(trow1 + x); + + t0 = vextq_f32(tprev, tcurr, 3); + t1 = tcurr; + t2 = vextq_f32(tcurr, tnext, 1); + if(!dx) + { + t0 = vaddq_f32(t0, vaddq_f32(vaddq_f32(t1, t1), t2)); + } + else if(dx == 2) + { + t0 = vsubq_f32(vsubq_f32(t2, t1), vsubq_f32(t1, t0)); + } + else + { + t0 = vsubq_f32(t2, t0); + } + + if(!(y%2)) + { + vst1q_f32(trow0 + x - 4, t0); + } + else + { + vst1q_f32(trow1 + x - 4, t0); + } + } + x -= 4; + if(x == size.width){ + x--; + } + f32 prevx = 0, rowx = 0, nextx = 0; + if(!dy) + { + prevx = x > 0 ? srow2[x-1] + 2*srow1[x-1] + srow0[x-1] : + (borderType == BORDER_MODE_REFLECT101 ? srow2[1] + 2*srow1[1] + srow0[1] : + (borderType == BORDER_MODE_CONSTANT ? 4*borderValue : + srow2[0] + 2*srow1[0] + srow0[0]) ); + rowx = srow2[x] + 2*srow1[x] + srow0[x]; + } + else if(dy == 2) + { + prevx = x > 0 ? srow2[x-1] - 2*srow1[x-1] + srow0[x-1] : + (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - 2*srow1[1] + srow0[1] : + (borderType == BORDER_MODE_CONSTANT ? 0.f : + srow2[0] - 2*srow1[0] + srow0[0]) ); + rowx = srow2[x] - 2*srow1[x] + srow0[x]; + } + else + { + prevx = x > 0 ? srow2[x-1] - srow0[x-1] : + (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - srow0[1] : + (borderType == BORDER_MODE_CONSTANT ? 
0.f : + srow2[0] - srow0[0]) ); + rowx = srow2[x] - srow0[x]; + } + + for( ; x < size.width; x++ ) + { + if(x+1 == size.width) { + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + if(!dy) { + nextx = 4*borderValue; + } else { + nextx = 0.f; + } + } else if (borderType == BORDER_MODE_REFLECT101) + { + if(!dy) { + nextx = srow2[x-1] + 2*srow1[x-1] + srow0[x-1]; + } else if(dy == 2) { + nextx = srow2[x-1] - 2*srow1[x-1] + srow0[x-1]; + } else { + nextx = srow2[x-1] - srow0[x-1]; + } + } else { + if(!dy) { + nextx = srow2[x] + 2*srow1[x] + srow0[x]; + } else if(dy == 2) { + nextx = srow2[x] - 2*srow1[x] + srow0[x]; + } else { + nextx = srow2[x] - srow0[x]; + } + } + } else { + if(!dy) { + nextx = srow2[x+1] + 2*srow1[x+1] + srow0[x+1]; + } else if(dy == 2) { + nextx = srow2[x+1] - 2*srow1[x+1] + srow0[x+1]; + } else { + nextx = srow2[x+1] - srow0[x+1]; + } + } + f32 res; + if(dx==1) { + res = nextx - prevx; + } else if(!dx) { + res = prevx + 2*rowx + nextx; + } else { + res = prevx - 2*rowx + nextx; + } + if(!(y%2)) { + *(trow0+x) = res; + } else { + *(trow1+x) = res; + } + prevx = rowx; + rowx = nextx; + } + + if(y>0) { + for(size_t x1 = 0; x1 < size.width; x1++ ) + { + if(y%2) + *(drow + x1) = trow0[x1]; + else + *(drow + x1) = trow1[x1]; + } + } + if(y == size.height-1) { + for(size_t x1 = 0; x1 < size.width; x1++ ) + { + if(!(y%2)) + *(drow1 + x1) = trow0[x1]; + else + *(drow1 + x1) = trow1[x1]; + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/sub.cpp b/3rdparty/carotene/src/sub.cpp new file mode 100644 index 0000000000..38853895e7 --- /dev/null +++ b/3rdparty/carotene/src/sub.cpp @@ -0,0 +1,621 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+template <typename T, typename WT = T>
+struct SubWrap
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vsubq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vsub(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = (T)((WT)src0[0] - (WT)src1[0]);
+    }
+};
+
+template <typename T, typename WT = T>
+struct SubSaturate
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vqsubq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vqsub(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = internal::saturate_cast<T>((WT)src0[0] - (WT)src1[0]);
+    }
+};
+
+} // namespace
+
+#endif
+
+void sub(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         u8 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<u8, s16>()); // s16 covers the full u8 - u8 range before clamping
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<u8, s16>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
+    size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u16 * dstu16 = internal::getRowPtr((u16 *)dstBase, dstStride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16); + uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16); + vst1q_u16(dstu16 + j, vsubl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10))); + vst1q_u16(dstu16 + j + 8, vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10))); + vst1q_u16(dstu16 + j + 16, vsubl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11))); + vst1q_u16(dstu16 + j + 24, vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11))); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src0 = vld1_u8(src0 + j); + uint8x8_t v_src1 = vld1_u8(src1 + j); + vst1q_u16(dstu16 + j, vsubl_u8(v_src0, v_src1)); + } + + for (; j < size.width; j++) + dst[j] = (s16)src0[j] - (s16)src1[j]; + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void sub(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + f32 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16); + uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16); + int16x8_t vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src00), vget_low_u8(v_src10))); + int16x8_t vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10))); + + vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) ))); + vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) ))); + vst1q_f32(dst + j + 8, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) ))); + vst1q_f32(dst + j + 12, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) ))); + + vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src01), vget_low_u8(v_src11))); + vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11))); + + vst1q_f32(dst + j + 16, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) ))); + vst1q_f32(dst + j + 20, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) ))); + vst1q_f32(dst + j + 24, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) ))); + vst1q_f32(dst + j + 28, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) ))); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src0 = vld1_u8(src0 + j); + uint8x8_t v_src1 = vld1_u8(src1 + j); + + int16x8_t vs = vreinterpretq_s16_u16(vsubl_u8(v_src0, v_src1)); + vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vs) ))); + vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vs) ))); + } + for(; j < size.width; j++) + dst[j] = (f32)src0[j] - (f32)src1[j]; + } +#else 
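+    // a note on the fallback branches in this file: when CAROTENE_NEON is not
+    // defined, assertSupportedConfiguration() above is expected to reject the
+    // call, so the casts below only keep unused-parameter warnings quiet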
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+void sub(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
+        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (policy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                uint8x16_t v_src0 = vld1q_u8(src0 + j);
+                int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
+                int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
+                int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
+                int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
+                int16x8_t v_src1 = vld1q_s16(src1 + j);
+                int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            for (; j < size.width; j++)
+                dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
+        }
+        else
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                uint8x16_t v_src0 = vld1q_u8(src0 + j);
+                int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
+                int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
+                int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
+                int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
+                int16x8_t v_src1 = vld1q_s16(src1 + j);
+                int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            for (; j < size.width; j++)
+                dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const s16 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
+        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (policy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
+                uint8x16_t v_src1 = vld1q_u8(src1 + j);
+                int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
+                int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
+                int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vld1q_s16(src0 + j);
+                int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
+                int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            for (; j < size.width; j++)
+                dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
+        }
+        else
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
+                uint8x16_t v_src1 = vld1q_u8(src1 + j);
+                int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
+                int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
+                int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vld1q_s16(src0 + j);
+                int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
+                int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            for (; j < size.width; j++)
+                dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const s8 * src0Base, ptrdiff_t src0Stride,
+         const s8 * src1Base, ptrdiff_t src1Stride,
+         s8 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<s8, s16>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<s8, s16>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const s16 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<s16, s32>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<s16, s32>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const u16 * src0Base, ptrdiff_t src0Stride,
+         const u16 * src1Base, ptrdiff_t src1Stride,
+         u16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<u16, s32>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<u16, s32>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const s32 * src0Base, ptrdiff_t src0Stride,
+         const s32 * src1Base, ptrdiff_t src1Stride,
+         s32 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<s32, s64>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<s32, s64>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const u32 * src0Base, ptrdiff_t src0Stride,
+         const u32 * src1Base, ptrdiff_t src1Stride,
+         u32 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<u32, s64>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<u32, s64>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const f32 * src0Base, ptrdiff_t src0Stride,
+         const f32 * src1Base, ptrdiff_t src1Stride,
+         f32 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         SubWrap<f32>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/sum.cpp b/3rdparty/carotene/src/sum.cpp
new file mode 100644
index 0000000000..812e7fca67
--- /dev/null
+++ b/3rdparty/carotene/src/sum.cpp
@@ -0,0 +1,385 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                          License Agreement
+ *               For Open Source Computer Vision Library
+ *                       (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +bool isSumSupported(u32 channels) +{ + return (channels && channels < 5); +} + +void sum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumdst, u32 channels) +{ + internal::assertSupportedConfiguration(isSumSupported(channels)); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + const ptrdiff_t width = size.width * channels; + + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + ptrdiff_t i = 0; + + if (channels == 3) + { + uint32x4_t vs1231 = vdupq_n_u32(0); + uint32x4_t vs3123 = vdupq_n_u32(0); + uint32x4_t vs2312 = vdupq_n_u32(0); + for (; i <= width - 257*8*3; i += 257*8*3, src += 257*8*3) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0)); + uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8)); + uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16)); + + for (ptrdiff_t j = 8*3; j < 257*8*3; j+= 8*3) + { + internal::prefetch(src + j + 24); + s1 = vaddw_u8(s1, vld1_u8(src + j + 0)); + s2 = vaddw_u8(s2, vld1_u8(src + j + 8)); + s3 = vaddw_u8(s3, vld1_u8(src + j + 16)); + } + + vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2))); + vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3))); + vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1))); + } + if (i <= width - 8*3) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0)); + uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8)); + uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16)); + + for (i += 8*3, src += 8*3; i <= width - 8*3; i += 8*3, src += 8*3) + { + internal::prefetch(src + 24); + s1 = vaddw_u8(s1, vld1_u8(src + 0)); + s2 = vaddw_u8(s2, vld1_u8(src + 8)); + s3 = vaddw_u8(s3, vld1_u8(src + 16)); + } + + vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2))); + vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3))); 
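+                // lane bookkeeping: with 3 interleaved channels and 8-wide
+                // vectors the channel pattern rotates, so these accumulators
+                // hold per-lane sums in the orders (c1,c2,c3,c1), (c3,c1,c2,c3)
+                // and (c2,c3,c1,c2); the scalar epilogue below rebuilds the
+                // per-channel totals by striding through the spilled
+                // 12-element array in steps of 3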
+ vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1))); + } + + u32 sum[12]; + vst1q_u32(sum+0, vs1231); + vst1q_u32(sum+4, vs2312); + vst1q_u32(sum+8, vs3123); + + for (; i < width; i += 3, src += 3) + { + sumdst[0] += src[0]; + sumdst[1] += src[1]; + sumdst[2] += src[2]; + } + + sumdst[0] += sum[0] + sum[3] + sum[6] + sum[9]; + sumdst[1] += sum[1] + sum[4] + sum[7] + sum[10]; + sumdst[2] += sum[2] + sum[5] + sum[8] + sum[11]; + } + else + { + uint32x4_t vs = vdupq_n_u32(0); + for (; i <= width - 257*8; i += 257*8, src += 257 * 8) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src)); + + for (int j = 8; j < 257 * 8; j += 8) + { + internal::prefetch(src + j); + s1 = vaddw_u8(s1, vld1_u8(src + j)); + } + + vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1))); + } + if (i < width - 7) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src)); + + for(i+=8,src+=8; i < width-7; i+=8,src+=8) + { + internal::prefetch(src); + s1 = vaddw_u8(s1, vld1_u8(src)); + } + vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1))); + } + + if (channels == 1) + { + uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + uint32x2_t vs1 = vreinterpret_u32_u64(vpaddl_u32(vs2)); + + u32 s0 = vget_lane_u32(vs1, 0); + for(; i < width; ++i,++src) + s0 += src[0]; + sumdst[0] += s0; + } + else if (channels == 4) + { + vst1q_u32(sumdst, vqaddq_u32(vs, vld1q_u32(sumdst))); + + for(; i < width; i+=4,src+=4) + { + sumdst[0] += src[0]; + sumdst[1] += src[1]; + sumdst[2] += src[2]; + sumdst[3] += src[3]; + } + } + else//if (channels == 2) + { + uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + vst1_u32(sumdst, vqadd_u32(vs2, vld1_u32(sumdst))); + + for(; i < width; i+=2,src+=2) + { + sumdst[0] += src[0]; + sumdst[1] += src[1]; + } + } + }//channels != 3 + } +#else + (void)_size; + (void)srcBase; + (void)srcStride; + (void)sumdst; + (void)channels; +#endif +} + +void sum(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, u32 channels) +{ + internal::assertSupportedConfiguration(isSumSupported(channels)); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + const ptrdiff_t width = size.width * channels; + + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + ptrdiff_t i = 0; + + if (channels == 3) + { + float32x4_t vs1231 = vdupq_n_f32(0); + float32x4_t vs2312 = vdupq_n_f32(0); + float32x4_t vs3123 = vdupq_n_f32(0); + for(; i <= width-12; i += 12) + { + internal::prefetch(src + i + 12); + vs1231 = vaddq_f32(vs1231, vld1q_f32(src + i + 0)); + vs2312 = vaddq_f32(vs2312, vld1q_f32(src + i + 4)); + vs3123 = vaddq_f32(vs3123, vld1q_f32(src + i + 8)); + } + + f32 s[12]; + vst1q_f32(s + 0, vs1231); + vst1q_f32(s + 4, vs2312); + vst1q_f32(s + 8, vs3123); + + sumdst[0] += s[0] + s[3] + s[6] + s[9]; + sumdst[1] += s[1] + s[4] + s[7] + s[10]; + sumdst[2] += s[2] + s[5] + s[8] + s[11]; + for( ; i < width; i+=3) + { + sumdst[0] += src[i]; + sumdst[1] += src[i+1]; + sumdst[2] += src[i+2]; + } + } + else + { + float32x4_t vs = vdupq_n_f32(0); + for(; i <= width-4; i += 4) + { + internal::prefetch(src + i); + vs = vaddq_f32(vs, vld1q_f32(src+i)); + } + + if (channels == 1) + { + float32x2_t vs2 = vpadd_f32(vget_low_f32(vs), vget_high_f32(vs)); + f32 s[2]; + vst1_f32(s, vs2); + + sumdst[0] += s[0] + s[1]; + for( ; i < width; i++) + sumdst[0] += src[i]; + } + else if (channels == 4) + { + f32 s[4]; + 
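+                // four channels: the row width is a multiple of 4 here, so the
+                // vector loop consumes every element and each accumulator lane
+                // maps onto exactly one channel; spill and add directly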
vst1q_f32(s, vs); + + sumdst[0] += s[0]; + sumdst[1] += s[1]; + sumdst[2] += s[2]; + sumdst[3] += s[3]; + } + else//if (channels == 2) + { + float32x2_t vs2 = vadd_f32(vget_low_f32(vs), vget_high_f32(vs)); + f32 s[2]; + vst1_f32(s, vs2); + + sumdst[0] += s[0]; + sumdst[1] += s[1]; + + if(i < width) + { + sumdst[0] += src[i]; + sumdst[1] += src[i+1]; + } + } + }//channels != 3 + } +#else + (void)_size; + (void)srcBase; + (void)srcStride; + (void)sumdst; + (void)channels; +#endif +} + +bool isSqsumSupported(u32 channels) +{ + return (channels && ((4/channels)*channels == 4)); +} + +void sqsum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, f64 * sqsumdst, u32 channels) +{ + internal::assertSupportedConfiguration(isSqsumSupported(channels)); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width*channels)) + { + size.width *= size.height; + size.height = 1; + } + const size_t width = size.width * channels; + + size_t blockSize0 = 1 << 23; + size_t roiw8 = width & ~7; + + uint32x4_t v_zero = vdupq_n_u32(0u); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0u; + + while (j < roiw8) + { + size_t blockSize = std::min(roiw8 - j, blockSize0) + j; + uint32x4_t v_sum = v_zero; + uint32x4_t v_sqsum = v_zero; + + for ( ; j < blockSize ; j += 8, src += 8) + { + internal::prefetch(src); + uint8x8_t v_src0 = vld1_u8(src); + + uint16x8_t v_src = vmovl_u8(v_src0); + uint16x4_t v_srclo = vget_low_u16(v_src), v_srchi = vget_high_u16(v_src); + v_sum = vaddq_u32(v_sum, vaddl_u16(v_srclo, v_srchi)); + v_sqsum = vmlal_u16(v_sqsum, v_srclo, v_srclo); + v_sqsum = vmlal_u16(v_sqsum, v_srchi, v_srchi); + } + + u32 arsum[8]; + vst1q_u32(arsum, v_sum); + vst1q_u32(arsum + 4, v_sqsum); + + sumdst[0] += (f64)arsum[0]; + sumdst[1 % channels] += (f64)arsum[1]; + sumdst[2 % channels] += (f64)arsum[2]; + sumdst[3 % channels] += (f64)arsum[3]; + sqsumdst[0] += (f64)arsum[4]; + sqsumdst[1 % channels] += (f64)arsum[5]; + sqsumdst[2 % channels] += (f64)arsum[6]; + sqsumdst[3 % channels] += (f64)arsum[7]; + } + // collect a few last elements in the current row + // it's ok to process channels elements per step + // since we could handle 1,2 or 4 channels + // we always have channels-fold amount of elements remaining + for ( ; j < width; j+=channels, src+=channels) + { + for (u32 kk = 0; kk < channels; kk++) + { + u32 srcval = src[kk]; + sumdst[kk] += srcval; + sqsumdst[kk] += srcval * srcval; + } + } + } +#else + (void)_size; + (void)srcBase; + (void)srcStride; + (void)sumdst; + (void)sqsumdst; + (void)channels; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/template_matching.cpp b/3rdparty/carotene/src/template_matching.cpp new file mode 100644 index 0000000000..ad87085188 --- /dev/null +++ b/3rdparty/carotene/src/template_matching.cpp @@ -0,0 +1,241 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2013-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include <cmath>
+#include <cstring>
+#include <vector>
+
+namespace CAROTENE_NS {
+
+#define ENABLE4LINESMATCHING false // disabled since the overall time for simultaneous 4-line matching
+                                   // is greater than that for simultaneous 2-line matching on the same
+                                   // amount of data
+
+bool isMatchTemplateSupported(const Size2D &tmplSize)
+{
+    return isSupportedConfiguration() &&
+           tmplSize.width >= 8 &&  // actually the function could process even shorter templates,
+                                   // but there would be no NEON optimization in that case
+           (tmplSize.width * tmplSize.height) <= 256;  // also keeps the raw u8 dot product below 2^24
+                                                       // (255*255*256 < 2^24), so it is exactly representable in f32
+}
+
+void matchTemplate(const Size2D &srcSize,
+                   const u8 * srcBase, ptrdiff_t srcStride,
+                   const Size2D &tmplSize,
+                   const u8 * tmplBase, ptrdiff_t tmplStride,
+                   f32 * dstBase, ptrdiff_t dstStride,
+                   bool normalize)
+{
+    internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize));
+#ifdef CAROTENE_NEON
+    const size_t tmplW = tmplSize.width;
+    const size_t tmplH = tmplSize.height;
+    const size_t dstW = srcSize.width - tmplSize.width + 1;
+    const size_t dstH = srcSize.height - tmplSize.height + 1;
+
+    // template correlation part
+    {
+#if ENABLE4LINESMATCHING
+        const size_t dstroiw4 = dstW & ~3u;
+#endif
+        const size_t dstroiw2 = dstW & ~1u;
+        const size_t tmplroiw = tmplW & ~7u;
+        const size_t dstride = dstStride >> 2; // dstStride is in bytes; dstride indexes f32 elements
+
+        f32 *corr = dstBase;
+        const u8 *imgrrow = srcBase;
+        for(size_t r = 0; r < dstH; ++r, corr+=dstride, imgrrow+=srcStride)
+        {
+            size_t c = 0;
+#if ENABLE4LINESMATCHING
+            for(; c < dstroiw4; c+=4)
+            {
+                u32 dot[4] = {0, 0, 0, 0};
+                uint32x4_t vdot0 = vmovq_n_u32(0);
+                uint32x4_t vdot1 = vmovq_n_u32(0);
+                uint32x4_t vdot2 = vmovq_n_u32(0);
+                uint32x4_t vdot3 = vmovq_n_u32(0);
+
+                const u8 *img = imgrrow;
+                const u8 *tmpl = tmplBase;
+                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
+                {
+                    size_t j = 0;
+                    for(; j < tmplroiw; j+=8)
+                    {
+                        uint8x8_t vtmpl = vld1_u8(tmpl + j);
+
+                        uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
+                        uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
+                        uint8x8_t vimg2 = vld1_u8(img + j + c + 2);
+                        uint8x8_t vimg3 = vld1_u8(img + j + c + 3);
+
+                        uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
+                        uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
+                        uint16x8_t vd2 = vmull_u8(vtmpl, vimg2);
+                        uint16x8_t vd3 = vmull_u8(vtmpl, vimg3);
+
+                        vdot0 = vpadalq_u16(vdot0, vd0);
+                        vdot1 = vpadalq_u16(vdot1, vd1);
+                        vdot2 = vpadalq_u16(vdot2, vd2);
+                        vdot3 = vpadalq_u16(vdot3, vd3);
+                    }
+                    for(; j < tmplW; ++j)
+                    {
+                        dot[0] += tmpl[j] * img[j + c + 0];
+                        dot[1] += tmpl[j] * img[j + c + 1];
+                        dot[2] += tmpl[j] * img[j + c + 2];
+                        dot[3] += tmpl[j] * img[j + c + 3];
+                    }
+                }
+                uint32x4_t vdotx = vld1q_u32(dot);
+                uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
+                uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
+                uint32x2_t vdot_2 = vpadd_u32(vget_low_u32(vdot2), vget_high_u32(vdot2));
+                uint32x2_t vdot_3 = vpadd_u32(vget_low_u32(vdot3), vget_high_u32(vdot3));
+                uint32x2_t vdot_01 = vpadd_u32(vdot_0, vdot_1);
+                uint32x2_t vdot_23 = vpadd_u32(vdot_2, vdot_3);
+
+                vst1q_f32(corr + c, vcvtq_f32_u32(vaddq_u32(vdotx, vcombine_u32(vdot_01, vdot_23))));
+            }
+#endif
+
+            for(; c < dstroiw2; c+=2)
+            {
+                u32 dot[2] = {0, 0};
+                uint32x4_t vdot0 = vmovq_n_u32(0);
+                uint32x4_t vdot1 = vmovq_n_u32(0);
+                const u8 *img = imgrrow;
+                const u8 *tmpl = tmplBase;
+                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
+                {
+                    size_t j = 0;
+                    for(; j < tmplroiw; j+=8)
+                    {
+                        uint8x8_t vtmpl = vld1_u8(tmpl + j);
+
+                        uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
+                        uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
+
+                        uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
+                        uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
+
+                        vdot0 = vpadalq_u16(vdot0, vd0);
+                        vdot1 = vpadalq_u16(vdot1, vd1);
+                    }
+                    for(; j < tmplW; ++j)
+                    {
+                        dot[0] += tmpl[j] * img[j + c + 0];
+                        dot[1] += tmpl[j] * img[j + c + 1];
+                    }
+                }
+                uint32x2_t vdotx = vld1_u32(dot);
+                uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
+                uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
+                uint32x2_t vdot_ = vpadd_u32(vdot_0, vdot_1);
+                vst1_f32(corr + c, vcvt_f32_u32(vadd_u32(vdotx, vdot_)));
+            }
+
+            for(; c < dstW; ++c)
+            {
+                u32 dot = 0;
+                uint32x4_t vdot = vmovq_n_u32(0);
+                const u8 *img = imgrrow;
+                const u8 *tmpl = tmplBase;
+                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
+                {
+                    size_t j = 0;
+                    for(; j < tmplroiw; j+=8)
+                    {
+                        uint8x8_t vtmpl = vld1_u8(tmpl + j);
+                        uint8x8_t vimg = vld1_u8(img + j + c);
+                        uint16x8_t vd = vmull_u8(vtmpl, vimg);
+                        vdot = vpadalq_u16(vdot, vd);
+                    }
+                    for(; j < tmplW; ++j)
+                        dot += tmpl[j] * img[j + c];
+                }
+                u32 wdot[2];
+                vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot)));
+                dot += wdot[0] + wdot[1];
+                corr[c] = (f32)dot;
+            }
+        }
+    }
+
+    if(normalize)
+    {
+        f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride));
+
+        // build a (w+1) x (h+1) integral image of squared pixels; the zero top
+        // row and left column simplify the window-sum lookups below
+        size_t iw = srcSize.width+1;
+        size_t ih = srcSize.height+1;
+        std::vector<f64> _sqsum(iw*ih);
+        f64 *sqsum = &_sqsum[0];
+        memset(sqsum, 0, iw*sizeof(f64));
+        for(size_t i = 1; i < ih; ++i)
+            sqsum[iw*i] = 0.;
+        sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw*sizeof(f64));
+
+        for(size_t i = 0; i < dstH; ++i)
+        {
+            f32 *result = internal::getRowPtr(dstBase, dstStride, i);
+            for(size_t j = 0; j < dstW; ++j)
+            {
+                double s2 = sqsum[iw*i + j] +
+                            sqsum[iw*(i + tmplSize.height) + j + tmplSize.width] -
+                            sqsum[iw*(i + tmplSize.height) + j] -
+                            sqsum[iw*i + j + tmplSize.width];
+
+                result[j] /= tn * std::sqrt(s2);
+            }
+        }
+    }
+#else
+    (void)srcSize;
+    (void)srcBase;
+    (void)srcStride;
+    (void)tmplBase;
(void)tmplStride; + (void)dstBase; + (void)dstStride; + (void)normalize; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/threshold.cpp b/3rdparty/carotene/src/threshold.cpp new file mode 100644 index 0000000000..8e03798b02 --- /dev/null +++ b/3rdparty/carotene/src/threshold.cpp @@ -0,0 +1,1627 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 trueValue, u8 falseValue) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + if(trueValue == 255 && falseValue == 0) + { + for (size_t i = 0; i < size.height; ++i) { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + vst1q_u8(dst + j, r0); + vst1q_u8(dst + j + 16, r1); + } + for (; j < roiw8; j += 8) { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + vst1_u8(dst + j, r0); + } + + for (; j < size.width; j++) { + *(dst + j) = *(src + j) > threshold ? 
255 : 0; + } + } + } + else + { + uint8x16_t vtrue_value = vdupq_n_u8(trueValue); + uint8x8_t vtrue_value8 = vdup_n_u8(trueValue); + uint8x16_t vfalse_value = vdupq_n_u8(falseValue); + uint8x8_t vfalse_value8 = vdup_n_u8(falseValue); + + for (size_t i = 0; i < size.height; ++i) { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vbslq_u8(r0, vtrue_value, vfalse_value); + uint8x16_t r1a = vbslq_u8(r1, vtrue_value, vfalse_value); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vbsl_u8(r0, vtrue_value8, vfalse_value8); + vst1_u8(dst + j, r0a); + } + + for (; j < size.width; j++) { + *(dst + j) = *(src + j) > threshold ? trueValue : falseValue; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)trueValue; + (void)falseValue; +#endif +} + +void thresholdRange(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 lowerThreshold, u8 upperThreshold, + u8 trueValue, u8 falseValue) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + uint8x16_t v_lower = vdupq_n_u8(lowerThreshold), v_upper = vdupq_n_u8(upperThreshold); + uint8x8_t v_lower8 = vdup_n_u8(lowerThreshold), v_upper8 = vdup_n_u8(upperThreshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + if(trueValue == 255 && falseValue == 0) + { + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16); + uint8x16_t v_dst0 = vandq_u8(vcgeq_u8(v_src0, v_lower), vcleq_u8(v_src0, v_upper)); + uint8x16_t v_dst1 = vandq_u8(vcgeq_u8(v_src1, v_lower), vcleq_u8(v_src1, v_upper)); + vst1q_u8(dst + j, v_dst0); + vst1q_u8(dst + j + 16, v_dst1); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + uint8x8_t v_dst = vand_u8(vcge_u8(v_src, v_lower8), vcle_u8(v_src, v_upper8)); + vst1_u8(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + u8 srcVal = src[j]; + dst[j] = lowerThreshold <= srcVal && srcVal <= upperThreshold ? 
255 : 0; + } + } + } + else + { + uint8x16_t vtrue_value = vdupq_n_u8(trueValue); + uint8x8_t vtrue_value8 = vdup_n_u8(trueValue); + uint8x16_t vfalse_value = vdupq_n_u8(falseValue); + uint8x8_t vfalse_value8 = vdup_n_u8(falseValue); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16); + uint8x16_t v_dst0 = vandq_u8(vcgeq_u8(v_src0, v_lower), vcleq_u8(v_src0, v_upper)); + uint8x16_t v_dst1 = vandq_u8(vcgeq_u8(v_src1, v_lower), vcleq_u8(v_src1, v_upper)); + v_dst0 = vbslq_u8(v_dst0, vtrue_value, vfalse_value); + v_dst1 = vbslq_u8(v_dst1, vtrue_value, vfalse_value); + vst1q_u8(dst + j, v_dst0); + vst1q_u8(dst + j + 16, v_dst1); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + uint8x8_t v_dst = vand_u8(vcge_u8(v_src, v_lower8), vcle_u8(v_src, v_upper8)); + v_dst = vbsl_u8(v_dst, vtrue_value8, vfalse_value8); + vst1_u8(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + u8 srcVal = src[j]; + dst[j] = lowerThreshold <= srcVal && srcVal <= upperThreshold ? trueValue : falseValue; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)lowerThreshold; + (void)upperThreshold; + (void)trueValue; + (void)falseValue; +#endif +} + +void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x16_t vvalue = vdupq_n_u8(value); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + uint8x8_t vvalue8 = vdup_n_u8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vandq_u8(r0, vvalue); + uint8x16_t r1a = vandq_u8(r1, vvalue); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vand_u8(r0, vvalue8); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x16_t vvalue = vdupq_n_u8(value); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + uint8x8_t vvalue8 = vdup_n_u8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcleq_u8(v0, vthreshold); + uint8x16_t r1 = vcleq_u8(v1, vthreshold); + uint8x16_t r0a = vandq_u8(r0, vvalue); + uint8x16_t r1a = vandq_u8(r1, vvalue); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcle_u8(v0, vthreshold8); + uint8x8_t r0a = vand_u8(r0, vvalue8); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vqsubq_u8(v0, vthreshold); + uint8x16_t r1 = vqsubq_u8(v1, vthreshold); + uint8x16_t r0a = vqsubq_u8(v0, r0); + uint8x16_t r1a = vqsubq_u8(v1, r1); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vqsub_u8(v0, vthreshold8); + uint8x8_t r0a = vqsub_u8(v0, r0); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vandq_u8(v0, r0); + uint8x16_t r1a = vandq_u8(v1, r1); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vand_u8(v0, r0); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vbicq_u8(v0, r0); + uint8x16_t r1a = vbicq_u8(v1, r1); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vbic_u8(v0, r0); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x16_t vvalue = vdupq_n_s8(value); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + int8x8_t vvalue8 = vdup_n_s8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vreinterpretq_s8_u8(vcgtq_s8(v0, vthreshold)); + int8x16_t r1 = vreinterpretq_s8_u8(vcgtq_s8(v1, vthreshold)); + int8x16_t r0a = vandq_s8(r0, vvalue); + int8x16_t r1a = vandq_s8(r1, vvalue); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vreinterpret_s8_u8(vcgt_s8(v0, vthreshold8)); + int8x8_t r0a = vand_s8(r0, vvalue8); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x16_t vvalue = vdupq_n_s8(value); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + int8x8_t vvalue8 = vdup_n_s8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vreinterpretq_s8_u8(vcleq_s8(v0, vthreshold)); + int8x16_t r1 = vreinterpretq_s8_u8(vcleq_s8(v1, vthreshold)); + int8x16_t r0a = vandq_s8(r0, vvalue); + int8x16_t r1a = vandq_s8(r1, vvalue); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vreinterpret_s8_u8(vcle_s8(v0, vthreshold8)); + int8x8_t r0a = vand_s8(r0, vvalue8); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vqsubq_s8(v0, vthreshold); + int8x16_t r1 = vqsubq_s8(v1, vthreshold); + int8x16_t r0a = vqsubq_s8(v0, r0); + int8x16_t r1a = vqsubq_s8(v1, r1); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vqsub_s8(v0, vthreshold8); + int8x8_t r0a = vqsub_s8(v0, r0); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vreinterpretq_s8_u8(vcgtq_s8(v0, vthreshold)); + int8x16_t r1 = vreinterpretq_s8_u8(vcgtq_s8(v1, vthreshold)); + int8x16_t r0a = vandq_s8(v0, r0); + int8x16_t r1a = vandq_s8(v1, r1); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vreinterpret_s8_u8(vcgt_s8(v0, vthreshold8)); + int8x8_t r0a = vand_s8(v0, r0); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vreinterpretq_s8_u8(vcgtq_s8(v0, vthreshold)); + int8x16_t r1 = vreinterpretq_s8_u8(vcgtq_s8(v1, vthreshold)); + int8x16_t r0a = vbicq_s8(v0, r0); + int8x16_t r1a = vbicq_s8(v1, r1); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vreinterpret_s8_u8(vcgt_s8(v0, vthreshold8)); + int8x8_t r0a = vbic_s8(v0, r0); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + int16x8_t vvalue16 = vdupq_n_s16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcgtq_s16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_s16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vreinterpretq_u16_s16(vvalue16)); + uint16x8_t r1a = vandq_u16(r1, vreinterpretq_u16_s16(vvalue16)); + vst1q_u16((u16*)dst + j, r0a); + vst1q_u16((u16*)dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + int16x8_t vvalue16 = vdupq_n_s16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcleq_s16(v0, vthreshold16); + uint16x8_t r1 = vcleq_s16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vreinterpretq_u16_s16(vvalue16)); + uint16x8_t r1a = vandq_u16(r1, vreinterpretq_u16_s16(vvalue16)); + vst1q_s16(dst + j, vreinterpretq_s16_u16(r0a)); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(r1a)); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
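+            // same test as the vcle-based vector path above, with the branches swapped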
0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + int16x8_t r0 = vminq_s16(v0, vthreshold16); + int16x8_t r1 = vminq_s16(v1, vthreshold16); + vst1q_s16(dst + j, r0); + vst1q_s16(dst + j + 8, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcgtq_s16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_s16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(vreinterpretq_u16_s16(v0), r0); + uint16x8_t r1a = vandq_u16(vreinterpretq_u16_s16(v1), r1); + vst1q_u16((u16*)dst + j, r0a); + vst1q_u16((u16*)dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcgtq_s16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_s16(v1, vthreshold16); + uint16x8_t r0a = vbicq_u16(vreinterpretq_u16_s16(v0), r0); + uint16x8_t r1a = vbicq_u16(vreinterpretq_u16_s16(v1), r1); + vst1q_u16((u16*)dst + j, r0a); + vst1q_u16((u16*)dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
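+            // scalar tail of the vbic path: values above the threshold become 0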
0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + uint16x8_t vvalue16 = vdupq_n_u16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcgtq_u16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_u16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vvalue16); + uint16x8_t r1a = vandq_u16(r1, vvalue16); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + uint16x8_t vvalue16 = vdupq_n_u16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcleq_u16(v0, vthreshold16); + uint16x8_t r1 = vcleq_u16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vvalue16); + uint16x8_t r1a = vandq_u16(r1, vvalue16); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vminq_u16(v0, vthreshold16); + uint16x8_t r1 = vminq_u16(v1, vthreshold16); + vst1q_u16(dst + j, r0); + vst1q_u16(dst + j + 8, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
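+            // truncate is min(src, threshold), matching the vminq_u16 vector path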
threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcgtq_u16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_u16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(v0, r0); + uint16x8_t r1a = vandq_u16(v1, r1); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcgtq_u16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_u16(v1, vthreshold16); + uint16x8_t r0a = vbicq_u16(v0, r0); + uint16x8_t r1a = vbicq_u16(v1, r1); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + int32x4_t vvalue8 = vdupq_n_s32(value); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcgtq_s32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_s32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_s32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_s32(vvalue8)); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
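+            // scalar tail of the mask-and-value idiom: the vector path ANDs an all-ones lane mask with value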
value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + int32x4_t vvalue8 = vdupq_n_s32(value); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcleq_s32(v0, vthreshold8); + uint32x4_t r1 = vcleq_s32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_s32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_s32(vvalue8)); + vst1q_s32(dst + j, vreinterpretq_s32_u32(r0a)); + vst1q_s32(dst + j + 4, vreinterpretq_s32_u32(r1a)); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + int32x4_t r0 = vminq_s32(v0, vthreshold8); + int32x4_t r1 = vminq_s32(v1, vthreshold8); + vst1q_s32(dst + j, r0); + vst1q_s32(dst + j + 4, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcgtq_s32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_s32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(vreinterpretq_u32_s32(v0), r0); + uint32x4_t r1a = vandq_u32(vreinterpretq_u32_s32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
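+            // keep src only where src > threshold, as the vand mask does above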
*(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcgtq_s32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_s32(v1, vthreshold8); + uint32x4_t r0a = vbicq_u32(vreinterpretq_u32_s32(v0), r0); + uint32x4_t r1a = vbicq_u32(vreinterpretq_u32_s32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + float32x4_t vvalue8 = vdupq_n_f32(value); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcgtq_f32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_f32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_f32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_f32(vvalue8)); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + float32x4_t vvalue8 = vdupq_n_f32(value); + size_t roiw8 = size.width >= 7 ? 
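+    // f32 comparisons yield u32 lane masks, hence the reinterpret casts when masking the value bits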
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcleq_f32(v0, vthreshold8); + uint32x4_t r1 = vcleq_f32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_f32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_f32(vvalue8)); + vst1q_f32(dst + j, vreinterpretq_f32_u32(r0a)); + vst1q_f32(dst + j + 4, vreinterpretq_f32_u32(r1a)); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + float32x4_t r0 = vminq_f32(v0, vthreshold8); + float32x4_t r1 = vminq_f32(v1, vthreshold8); + vst1q_f32(dst + j, r0); + vst1q_f32(dst + j + 4, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcgtq_f32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_f32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(vreinterpretq_u32_f32(v0), r0); + uint32x4_t r1a = vandq_u32(vreinterpretq_u32_f32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + size_t roiw8 = size.width >= 7 ? 
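+    // vbic keeps the original float bits only in lanes where src <= threshold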
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcgtq_f32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_f32(v1, vthreshold8); + uint32x4_t r0a = vbicq_u32(vreinterpretq_u32_f32(v0), r0); + uint32x4_t r1a = vbicq_u32(vreinterpretq_u32_f32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/vtransform.hpp b/3rdparty/carotene/src/vtransform.hpp new file mode 100644 index 0000000000..08841a2263 --- /dev/null +++ b/3rdparty/carotene/src/vtransform.hpp @@ -0,0 +1,689 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#ifndef CAROTENE_SRC_VTRANSFORM_HPP
+#define CAROTENE_SRC_VTRANSFORM_HPP
+
+#include "common.hpp"
+
+#include <arm_neon.h>
+
+#ifdef CAROTENE_NEON
+
+namespace CAROTENE_NS { namespace internal {
+
+////////////////////////////// Type Traits ///////////////////////
+
+template <typename T, int cn = 1>
+struct VecTraits;
+
+template <> struct VecTraits< u8, 1> { typedef uint8x16_t vec128; typedef uint8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
+template <> struct VecTraits< s8, 1> { typedef int8x16_t vec128; typedef int8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
+template <> struct VecTraits<u16, 1> { typedef uint16x8_t vec128; typedef uint16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
+template <> struct VecTraits<s16, 1> { typedef int16x8_t vec128; typedef int16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
+template <> struct VecTraits<s32, 1> { typedef int32x4_t vec128; typedef int32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
+template <> struct VecTraits<u32, 1> { typedef uint32x4_t vec128; typedef uint32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
+template <> struct VecTraits<s64, 1> { typedef int64x2_t vec128; typedef int64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
+template <> struct VecTraits<u64, 1> { typedef uint64x2_t vec128; typedef uint64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
+template <> struct VecTraits<f32, 1> { typedef float32x4_t vec128; typedef float32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
+
+template <> struct VecTraits< u8, 2> { typedef uint8x16x2_t vec128; typedef uint8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
+template <> struct VecTraits< s8, 2> { typedef int8x16x2_t vec128; typedef int8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
+template <> struct VecTraits<u16, 2> { typedef uint16x8x2_t vec128; typedef uint16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
+template <> struct VecTraits<s16, 2> { typedef int16x8x2_t vec128; typedef int16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
+template <> struct VecTraits<s32, 2> { typedef int32x4x2_t vec128; typedef int32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
+template <> struct VecTraits<u32, 2> { typedef uint32x4x2_t vec128; typedef uint32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
+template <> struct VecTraits<s64, 2> { typedef int64x2x2_t vec128; typedef int64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<u64, 2> { typedef uint64x2x2_t vec128; typedef uint64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<f32, 2> { typedef float32x4x2_t vec128; typedef float32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
+
+template <> struct VecTraits< u8, 3> { typedef uint8x16x3_t vec128; typedef uint8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
+template <> struct VecTraits< s8, 3> { typedef int8x16x3_t vec128; typedef int8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
+template <> struct VecTraits<u16, 3> { typedef uint16x8x3_t vec128; typedef uint16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
+template <> struct VecTraits<s16, 3> { typedef int16x8x3_t vec128; typedef int16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
+template <> struct VecTraits<s32, 3> { typedef int32x4x3_t vec128; typedef int32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
+template <> struct VecTraits<u32, 3> { typedef uint32x4x3_t vec128; typedef uint32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
+template <> struct VecTraits<s64, 3> { typedef int64x2x3_t vec128; typedef int64x1x3_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<u64, 3> { typedef uint64x2x3_t vec128; typedef uint64x1x3_t vec64; typedef
VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<f32, 3> { typedef float32x4x3_t vec128; typedef float32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
+
+template <> struct VecTraits< u8, 4> { typedef uint8x16x4_t vec128; typedef uint8x8x4_t vec64; typedef VecTraits< u8, 3> unsign; };
+template <> struct VecTraits< s8, 4> { typedef int8x16x4_t vec128; typedef int8x8x4_t vec64; typedef VecTraits< u8, 3> unsign; };
+template <> struct VecTraits<u16, 4> { typedef uint16x8x4_t vec128; typedef uint16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
+template <> struct VecTraits<s16, 4> { typedef int16x8x4_t vec128; typedef int16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
+template <> struct VecTraits<s32, 4> { typedef int32x4x4_t vec128; typedef int32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
+template <> struct VecTraits<u32, 4> { typedef uint32x4x4_t vec128; typedef uint32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
+template <> struct VecTraits<s64, 4> { typedef int64x2x4_t vec128; typedef int64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<u64, 4> { typedef uint64x2x4_t vec128; typedef uint64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<f32, 4> { typedef float32x4x4_t vec128; typedef float32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
+
+////////////////////////////// vld1q ///////////////////////
+
+inline uint8x16_t vld1q(const u8 * ptr) { return vld1q_u8(ptr); }
+inline int8x16_t vld1q(const s8 * ptr) { return vld1q_s8(ptr); }
+inline uint16x8_t vld1q(const u16 * ptr) { return vld1q_u16(ptr); }
+inline int16x8_t vld1q(const s16 * ptr) { return vld1q_s16(ptr); }
+inline uint32x4_t vld1q(const u32 * ptr) { return vld1q_u32(ptr); }
+inline int32x4_t vld1q(const s32 * ptr) { return vld1q_s32(ptr); }
+inline float32x4_t vld1q(const f32 * ptr) { return vld1q_f32(ptr); }
+
+////////////////////////////// vld1 ///////////////////////
+
+inline uint8x8_t vld1(const u8 * ptr) { return vld1_u8(ptr); }
+inline int8x8_t vld1(const s8 * ptr) { return vld1_s8(ptr); }
+inline uint16x4_t vld1(const u16 * ptr) { return vld1_u16(ptr); }
+inline int16x4_t vld1(const s16 * ptr) { return vld1_s16(ptr); }
+inline uint32x2_t vld1(const u32 * ptr) { return vld1_u32(ptr); }
+inline int32x2_t vld1(const s32 * ptr) { return vld1_s32(ptr); }
+inline float32x2_t vld1(const f32 * ptr) { return vld1_f32(ptr); }
+
+////////////////////////////// vld2q ///////////////////////
+
+inline uint8x16x2_t vld2q(const u8 * ptr) { return vld2q_u8(ptr); }
+inline int8x16x2_t vld2q(const s8 * ptr) { return vld2q_s8(ptr); }
+inline uint16x8x2_t vld2q(const u16 * ptr) { return vld2q_u16(ptr); }
+inline int16x8x2_t vld2q(const s16 * ptr) { return vld2q_s16(ptr); }
+inline uint32x4x2_t vld2q(const u32 * ptr) { return vld2q_u32(ptr); }
+inline int32x4x2_t vld2q(const s32 * ptr) { return vld2q_s32(ptr); }
+inline float32x4x2_t vld2q(const f32 * ptr) { return vld2q_f32(ptr); }
+
+////////////////////////////// vld2 ///////////////////////
+
+inline uint8x8x2_t vld2(const u8 * ptr) { return vld2_u8(ptr); }
+inline int8x8x2_t vld2(const s8 * ptr) { return vld2_s8(ptr); }
+inline uint16x4x2_t vld2(const u16 * ptr) { return vld2_u16(ptr); }
+inline int16x4x2_t vld2(const s16 * ptr) { return vld2_s16(ptr); }
+inline uint32x2x2_t vld2(const u32 * ptr) { return vld2_u32(ptr); }
+inline int32x2x2_t vld2(const s32 * ptr) { return vld2_s32(ptr); }
+inline float32x2x2_t vld2(const f32 * ptr) { return vld2_f32(ptr); }
+
+////////////////////////////// vld3q ///////////////////////
+
+inline
uint8x16x3_t vld3q(const u8 * ptr) { return vld3q_u8(ptr); } +inline int8x16x3_t vld3q(const s8 * ptr) { return vld3q_s8(ptr); } +inline uint16x8x3_t vld3q(const u16 * ptr) { return vld3q_u16(ptr); } +inline int16x8x3_t vld3q(const s16 * ptr) { return vld3q_s16(ptr); } +inline uint32x4x3_t vld3q(const u32 * ptr) { return vld3q_u32(ptr); } +inline int32x4x3_t vld3q(const s32 * ptr) { return vld3q_s32(ptr); } +inline float32x4x3_t vld3q(const f32 * ptr) { return vld3q_f32(ptr); } + +////////////////////////////// vld3 /////////////////////// + +inline uint8x8x3_t vld3(const u8 * ptr) { return vld3_u8(ptr); } +inline int8x8x3_t vld3(const s8 * ptr) { return vld3_s8(ptr); } +inline uint16x4x3_t vld3(const u16 * ptr) { return vld3_u16(ptr); } +inline int16x4x3_t vld3(const s16 * ptr) { return vld3_s16(ptr); } +inline uint32x2x3_t vld3(const u32 * ptr) { return vld3_u32(ptr); } +inline int32x2x3_t vld3(const s32 * ptr) { return vld3_s32(ptr); } +inline float32x2x3_t vld3(const f32 * ptr) { return vld3_f32(ptr); } + +////////////////////////////// vld4q /////////////////////// + +inline uint8x16x4_t vld4q(const u8 * ptr) { return vld4q_u8(ptr); } +inline int8x16x4_t vld4q(const s8 * ptr) { return vld4q_s8(ptr); } +inline uint16x8x4_t vld4q(const u16 * ptr) { return vld4q_u16(ptr); } +inline int16x8x4_t vld4q(const s16 * ptr) { return vld4q_s16(ptr); } +inline uint32x4x4_t vld4q(const u32 * ptr) { return vld4q_u32(ptr); } +inline int32x4x4_t vld4q(const s32 * ptr) { return vld4q_s32(ptr); } +inline float32x4x4_t vld4q(const f32 * ptr) { return vld4q_f32(ptr); } + +////////////////////////////// vld4 /////////////////////// + +inline uint8x8x4_t vld4(const u8 * ptr) { return vld4_u8(ptr); } +inline int8x8x4_t vld4(const s8 * ptr) { return vld4_s8(ptr); } +inline uint16x4x4_t vld4(const u16 * ptr) { return vld4_u16(ptr); } +inline int16x4x4_t vld4(const s16 * ptr) { return vld4_s16(ptr); } +inline uint32x2x4_t vld4(const u32 * ptr) { return vld4_u32(ptr); } +inline int32x2x4_t vld4(const s32 * ptr) { return vld4_s32(ptr); } +inline float32x2x4_t vld4(const f32 * ptr) { return vld4_f32(ptr); } + +////////////////////////////// vst1q /////////////////////// + +inline void vst1q(u8 * ptr, const uint8x16_t & v) { return vst1q_u8(ptr, v); } +inline void vst1q(s8 * ptr, const int8x16_t & v) { return vst1q_s8(ptr, v); } +inline void vst1q(u16 * ptr, const uint16x8_t & v) { return vst1q_u16(ptr, v); } +inline void vst1q(s16 * ptr, const int16x8_t & v) { return vst1q_s16(ptr, v); } +inline void vst1q(u32 * ptr, const uint32x4_t & v) { return vst1q_u32(ptr, v); } +inline void vst1q(s32 * ptr, const int32x4_t & v) { return vst1q_s32(ptr, v); } +inline void vst1q(f32 * ptr, const float32x4_t & v) { return vst1q_f32(ptr, v); } + +////////////////////////////// vst1 /////////////////////// + +inline void vst1(u8 * ptr, const uint8x8_t & v) { return vst1_u8(ptr, v); } +inline void vst1(s8 * ptr, const int8x8_t & v) { return vst1_s8(ptr, v); } +inline void vst1(u16 * ptr, const uint16x4_t & v) { return vst1_u16(ptr, v); } +inline void vst1(s16 * ptr, const int16x4_t & v) { return vst1_s16(ptr, v); } +inline void vst1(u32 * ptr, const uint32x2_t & v) { return vst1_u32(ptr, v); } +inline void vst1(s32 * ptr, const int32x2_t & v) { return vst1_s32(ptr, v); } +inline void vst1(f32 * ptr, const float32x2_t & v) { return vst1_f32(ptr, v); } + +////////////////////////////// vst2q /////////////////////// + +inline void vst2q(u8 * ptr, const uint8x16x2_t & v) { return vst2q_u8(ptr, v); } +inline void vst2q(s8 * ptr, const 
int8x16x2_t & v) { return vst2q_s8(ptr, v); } +inline void vst2q(u16 * ptr, const uint16x8x2_t & v) { return vst2q_u16(ptr, v); } +inline void vst2q(s16 * ptr, const int16x8x2_t & v) { return vst2q_s16(ptr, v); } +inline void vst2q(u32 * ptr, const uint32x4x2_t & v) { return vst2q_u32(ptr, v); } +inline void vst2q(s32 * ptr, const int32x4x2_t & v) { return vst2q_s32(ptr, v); } +inline void vst2q(f32 * ptr, const float32x4x2_t & v) { return vst2q_f32(ptr, v); } + +////////////////////////////// vst2 /////////////////////// + +inline void vst2(u8 * ptr, const uint8x8x2_t & v) { return vst2_u8(ptr, v); } +inline void vst2(s8 * ptr, const int8x8x2_t & v) { return vst2_s8(ptr, v); } +inline void vst2(u16 * ptr, const uint16x4x2_t & v) { return vst2_u16(ptr, v); } +inline void vst2(s16 * ptr, const int16x4x2_t & v) { return vst2_s16(ptr, v); } +inline void vst2(u32 * ptr, const uint32x2x2_t & v) { return vst2_u32(ptr, v); } +inline void vst2(s32 * ptr, const int32x2x2_t & v) { return vst2_s32(ptr, v); } +inline void vst2(f32 * ptr, const float32x2x2_t & v) { return vst2_f32(ptr, v); } + +////////////////////////////// vst3q /////////////////////// + +inline void vst3q(u8 * ptr, const uint8x16x3_t & v) { return vst3q_u8(ptr, v); } +inline void vst3q(s8 * ptr, const int8x16x3_t & v) { return vst3q_s8(ptr, v); } +inline void vst3q(u16 * ptr, const uint16x8x3_t & v) { return vst3q_u16(ptr, v); } +inline void vst3q(s16 * ptr, const int16x8x3_t & v) { return vst3q_s16(ptr, v); } +inline void vst3q(u32 * ptr, const uint32x4x3_t & v) { return vst3q_u32(ptr, v); } +inline void vst3q(s32 * ptr, const int32x4x3_t & v) { return vst3q_s32(ptr, v); } +inline void vst3q(f32 * ptr, const float32x4x3_t & v) { return vst3q_f32(ptr, v); } + +////////////////////////////// vst3 /////////////////////// + +inline void vst3(u8 * ptr, const uint8x8x3_t & v) { return vst3_u8(ptr, v); } +inline void vst3(s8 * ptr, const int8x8x3_t & v) { return vst3_s8(ptr, v); } +inline void vst3(u16 * ptr, const uint16x4x3_t & v) { return vst3_u16(ptr, v); } +inline void vst3(s16 * ptr, const int16x4x3_t & v) { return vst3_s16(ptr, v); } +inline void vst3(u32 * ptr, const uint32x2x3_t & v) { return vst3_u32(ptr, v); } +inline void vst3(s32 * ptr, const int32x2x3_t & v) { return vst3_s32(ptr, v); } +inline void vst3(f32 * ptr, const float32x2x3_t & v) { return vst3_f32(ptr, v); } + +////////////////////////////// vst4q /////////////////////// + +inline void vst4q(u8 * ptr, const uint8x16x4_t & v) { return vst4q_u8(ptr, v); } +inline void vst4q(s8 * ptr, const int8x16x4_t & v) { return vst4q_s8(ptr, v); } +inline void vst4q(u16 * ptr, const uint16x8x4_t & v) { return vst4q_u16(ptr, v); } +inline void vst4q(s16 * ptr, const int16x8x4_t & v) { return vst4q_s16(ptr, v); } +inline void vst4q(u32 * ptr, const uint32x4x4_t & v) { return vst4q_u32(ptr, v); } +inline void vst4q(s32 * ptr, const int32x4x4_t & v) { return vst4q_s32(ptr, v); } +inline void vst4q(f32 * ptr, const float32x4x4_t & v) { return vst4q_f32(ptr, v); } + +////////////////////////////// vst4 /////////////////////// + +inline void vst4(u8 * ptr, const uint8x8x4_t & v) { return vst4_u8(ptr, v); } +inline void vst4(s8 * ptr, const int8x8x4_t & v) { return vst4_s8(ptr, v); } +inline void vst4(u16 * ptr, const uint16x4x4_t & v) { return vst4_u16(ptr, v); } +inline void vst4(s16 * ptr, const int16x4x4_t & v) { return vst4_s16(ptr, v); } +inline void vst4(u32 * ptr, const uint32x2x4_t & v) { return vst4_u32(ptr, v); } +inline void vst4(s32 * ptr, const int32x2x4_t & v) { return 
vst4_s32(ptr, v); } +inline void vst4(f32 * ptr, const float32x2x4_t & v) { return vst4_f32(ptr, v); } + +////////////////////////////// vabdq /////////////////////// + +inline uint8x16_t vabdq(const uint8x16_t & v0, const uint8x16_t & v1) { return vabdq_u8 (v0, v1); } +inline int8x16_t vabdq(const int8x16_t & v0, const int8x16_t & v1) { return vabdq_s8 (v0, v1); } +inline uint16x8_t vabdq(const uint16x8_t & v0, const uint16x8_t & v1) { return vabdq_u16(v0, v1); } +inline int16x8_t vabdq(const int16x8_t & v0, const int16x8_t & v1) { return vabdq_s16(v0, v1); } +inline uint32x4_t vabdq(const uint32x4_t & v0, const uint32x4_t & v1) { return vabdq_u32(v0, v1); } +inline int32x4_t vabdq(const int32x4_t & v0, const int32x4_t & v1) { return vabdq_s32(v0, v1); } +inline float32x4_t vabdq(const float32x4_t & v0, const float32x4_t & v1) { return vabdq_f32(v0, v1); } + +////////////////////////////// vabd /////////////////////// + +inline uint8x8_t vabd(const uint8x8_t & v0, const uint8x8_t & v1) { return vabd_u8 (v0, v1); } +inline int8x8_t vabd(const int8x8_t & v0, const int8x8_t & v1) { return vabd_s8 (v0, v1); } +inline uint16x4_t vabd(const uint16x4_t & v0, const uint16x4_t & v1) { return vabd_u16(v0, v1); } +inline int16x4_t vabd(const int16x4_t & v0, const int16x4_t & v1) { return vabd_s16(v0, v1); } +inline uint32x2_t vabd(const uint32x2_t & v0, const uint32x2_t & v1) { return vabd_u32(v0, v1); } +inline int32x2_t vabd(const int32x2_t & v0, const int32x2_t & v1) { return vabd_s32(v0, v1); } +inline float32x2_t vabd(const float32x2_t & v0, const float32x2_t & v1) { return vabd_f32(v0, v1); } + +////////////////////////////// vminq /////////////////////// + +inline uint8x16_t vminq(const uint8x16_t & v0, const uint8x16_t & v1) { return vminq_u8 (v0, v1); } +inline int8x16_t vminq(const int8x16_t & v0, const int8x16_t & v1) { return vminq_s8 (v0, v1); } +inline uint16x8_t vminq(const uint16x8_t & v0, const uint16x8_t & v1) { return vminq_u16(v0, v1); } +inline int16x8_t vminq(const int16x8_t & v0, const int16x8_t & v1) { return vminq_s16(v0, v1); } +inline uint32x4_t vminq(const uint32x4_t & v0, const uint32x4_t & v1) { return vminq_u32(v0, v1); } +inline int32x4_t vminq(const int32x4_t & v0, const int32x4_t & v1) { return vminq_s32(v0, v1); } +inline float32x4_t vminq(const float32x4_t & v0, const float32x4_t & v1) { return vminq_f32(v0, v1); } + +////////////////////////////// vmin /////////////////////// + +inline uint8x8_t vmin(const uint8x8_t & v0, const uint8x8_t & v1) { return vmin_u8 (v0, v1); } +inline int8x8_t vmin(const int8x8_t & v0, const int8x8_t & v1) { return vmin_s8 (v0, v1); } +inline uint16x4_t vmin(const uint16x4_t & v0, const uint16x4_t & v1) { return vmin_u16(v0, v1); } +inline int16x4_t vmin(const int16x4_t & v0, const int16x4_t & v1) { return vmin_s16(v0, v1); } +inline uint32x2_t vmin(const uint32x2_t & v0, const uint32x2_t & v1) { return vmin_u32(v0, v1); } +inline int32x2_t vmin(const int32x2_t & v0, const int32x2_t & v1) { return vmin_s32(v0, v1); } +inline float32x2_t vmin(const float32x2_t & v0, const float32x2_t & v1) { return vmin_f32(v0, v1); } + +////////////////////////////// vmaxq /////////////////////// + +inline uint8x16_t vmaxq(const uint8x16_t & v0, const uint8x16_t & v1) { return vmaxq_u8 (v0, v1); } +inline int8x16_t vmaxq(const int8x16_t & v0, const int8x16_t & v1) { return vmaxq_s8 (v0, v1); } +inline uint16x8_t vmaxq(const uint16x8_t & v0, const uint16x8_t & v1) { return vmaxq_u16(v0, v1); } +inline int16x8_t vmaxq(const int16x8_t & v0, const 
int16x8_t & v1) { return vmaxq_s16(v0, v1); } +inline uint32x4_t vmaxq(const uint32x4_t & v0, const uint32x4_t & v1) { return vmaxq_u32(v0, v1); } +inline int32x4_t vmaxq(const int32x4_t & v0, const int32x4_t & v1) { return vmaxq_s32(v0, v1); } +inline float32x4_t vmaxq(const float32x4_t & v0, const float32x4_t & v1) { return vmaxq_f32(v0, v1); } + +////////////////////////////// vmax /////////////////////// + +inline uint8x8_t vmax(const uint8x8_t & v0, const uint8x8_t & v1) { return vmax_u8 (v0, v1); } +inline int8x8_t vmax(const int8x8_t & v0, const int8x8_t & v1) { return vmax_s8 (v0, v1); } +inline uint16x4_t vmax(const uint16x4_t & v0, const uint16x4_t & v1) { return vmax_u16(v0, v1); } +inline int16x4_t vmax(const int16x4_t & v0, const int16x4_t & v1) { return vmax_s16(v0, v1); } +inline uint32x2_t vmax(const uint32x2_t & v0, const uint32x2_t & v1) { return vmax_u32(v0, v1); } +inline int32x2_t vmax(const int32x2_t & v0, const int32x2_t & v1) { return vmax_s32(v0, v1); } +inline float32x2_t vmax(const float32x2_t & v0, const float32x2_t & v1) { return vmax_f32(v0, v1); } + +////////////////////////////// vdupq_n /////////////////////// + +inline uint8x16_t vdupq_n(const u8 & val) { return vdupq_n_u8(val); } +inline int8x16_t vdupq_n(const s8 & val) { return vdupq_n_s8(val); } +inline uint16x8_t vdupq_n(const u16 & val) { return vdupq_n_u16(val); } +inline int16x8_t vdupq_n(const s16 & val) { return vdupq_n_s16(val); } +inline uint32x4_t vdupq_n(const u32 & val) { return vdupq_n_u32(val); } +inline int32x4_t vdupq_n(const s32 & val) { return vdupq_n_s32(val); } +inline uint64x2_t vdupq_n(const u64 & val) { return vdupq_n_u64(val); } +inline int64x2_t vdupq_n(const s64 & val) { return vdupq_n_s64(val); } +inline float32x4_t vdupq_n(const f32 & val) { return vdupq_n_f32(val); } + +////////////////////////////// vdup_n /////////////////////// + +inline uint8x8_t vdup_n(const u8 & val) { return vdup_n_u8(val); } +inline int8x8_t vdup_n(const s8 & val) { return vdup_n_s8(val); } +inline uint16x4_t vdup_n(const u16 & val) { return vdup_n_u16(val); } +inline int16x4_t vdup_n(const s16 & val) { return vdup_n_s16(val); } +inline uint32x2_t vdup_n(const u32 & val) { return vdup_n_u32(val); } +inline int32x2_t vdup_n(const s32 & val) { return vdup_n_s32(val); } +inline uint64x1_t vdup_n(const u64 & val) { return vdup_n_u64(val); } +inline int64x1_t vdup_n(const s64 & val) { return vdup_n_s64(val); } +inline float32x2_t vdup_n(const f32 & val) { return vdup_n_f32(val); } + +////////////////////////////// vget_low /////////////////////// + +inline uint8x8_t vget_low(const uint8x16_t & v) { return vget_low_u8 (v); } +inline int8x8_t vget_low(const int8x16_t & v) { return vget_low_s8 (v); } +inline uint16x4_t vget_low(const uint16x8_t & v) { return vget_low_u16(v); } +inline int16x4_t vget_low(const int16x8_t & v) { return vget_low_s16(v); } +inline uint32x2_t vget_low(const uint32x4_t & v) { return vget_low_u32(v); } +inline int32x2_t vget_low(const int32x4_t & v) { return vget_low_s32(v); } +inline float32x2_t vget_low(const float32x4_t & v) { return vget_low_f32(v); } + +////////////////////////////// vget_high /////////////////////// + +inline uint8x8_t vget_high(const uint8x16_t & v) { return vget_high_u8 (v); } +inline int8x8_t vget_high(const int8x16_t & v) { return vget_high_s8 (v); } +inline uint16x4_t vget_high(const uint16x8_t & v) { return vget_high_u16(v); } +inline int16x4_t vget_high(const int16x8_t & v) { return vget_high_s16(v); } +inline uint32x2_t vget_high(const uint32x4_t & v) 
{ return vget_high_u32(v); } +inline int32x2_t vget_high(const int32x4_t & v) { return vget_high_s32(v); } +inline float32x2_t vget_high(const float32x4_t & v) { return vget_high_f32(v); } + +////////////////////////////// vcombine /////////////////////// + +inline uint8x16_t vcombine(const uint8x8_t & v0, const uint8x8_t & v1) { return vcombine_u8 (v0, v1); } +inline int8x16_t vcombine(const int8x8_t & v0, const int8x8_t & v1) { return vcombine_s8 (v0, v1); } +inline uint16x8_t vcombine(const uint16x4_t & v0, const uint16x4_t & v1) { return vcombine_u16(v0, v1); } +inline int16x8_t vcombine(const int16x4_t & v0, const int16x4_t & v1) { return vcombine_s16(v0, v1); } +inline uint32x4_t vcombine(const uint32x2_t & v0, const uint32x2_t & v1) { return vcombine_u32(v0, v1); } +inline int32x4_t vcombine(const int32x2_t & v0, const int32x2_t & v1) { return vcombine_s32(v0, v1); } +inline float32x4_t vcombine(const float32x2_t & v0, const float32x2_t & v1) { return vcombine_f32(v0, v1); } + +////////////////////////////// vaddq /////////////////////// + +inline uint8x16_t vaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vaddq_u8 (v0, v1); } +inline int8x16_t vaddq(const int8x16_t & v0, const int8x16_t & v1) { return vaddq_s8 (v0, v1); } +inline uint16x8_t vaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vaddq_u16(v0, v1); } +inline int16x8_t vaddq(const int16x8_t & v0, const int16x8_t & v1) { return vaddq_s16(v0, v1); } +inline uint32x4_t vaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vaddq_u32(v0, v1); } +inline int32x4_t vaddq(const int32x4_t & v0, const int32x4_t & v1) { return vaddq_s32(v0, v1); } +inline float32x4_t vaddq(const float32x4_t & v0, const float32x4_t & v1) { return vaddq_f32(v0, v1); } + +////////////////////////////// vadd /////////////////////// + +inline uint8x8_t vadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vadd_u8 (v0, v1); } +inline int8x8_t vadd(const int8x8_t & v0, const int8x8_t & v1) { return vadd_s8 (v0, v1); } +inline uint16x4_t vadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vadd_u16(v0, v1); } +inline int16x4_t vadd(const int16x4_t & v0, const int16x4_t & v1) { return vadd_s16(v0, v1); } +inline uint32x2_t vadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vadd_u32(v0, v1); } +inline int32x2_t vadd(const int32x2_t & v0, const int32x2_t & v1) { return vadd_s32(v0, v1); } +inline float32x2_t vadd(const float32x2_t & v0, const float32x2_t & v1) { return vadd_f32(v0, v1); } + +////////////////////////////// vqaddq /////////////////////// + +inline uint8x16_t vqaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqaddq_u8 (v0, v1); } +inline int8x16_t vqaddq(const int8x16_t & v0, const int8x16_t & v1) { return vqaddq_s8 (v0, v1); } +inline uint16x8_t vqaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqaddq_u16(v0, v1); } +inline int16x8_t vqaddq(const int16x8_t & v0, const int16x8_t & v1) { return vqaddq_s16(v0, v1); } +inline uint32x4_t vqaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqaddq_u32(v0, v1); } +inline int32x4_t vqaddq(const int32x4_t & v0, const int32x4_t & v1) { return vqaddq_s32(v0, v1); } + +////////////////////////////// vqadd /////////////////////// + +inline uint8x8_t vqadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vqadd_u8 (v0, v1); } +inline int8x8_t vqadd(const int8x8_t & v0, const int8x8_t & v1) { return vqadd_s8 (v0, v1); } +inline uint16x4_t vqadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vqadd_u16(v0, v1); } +inline 
int16x4_t vqadd(const int16x4_t & v0, const int16x4_t & v1) { return vqadd_s16(v0, v1); } +inline uint32x2_t vqadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vqadd_u32(v0, v1); } +inline int32x2_t vqadd(const int32x2_t & v0, const int32x2_t & v1) { return vqadd_s32(v0, v1); } + +////////////////////////////// vsubq /////////////////////// + +inline uint8x16_t vsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vsubq_u8 (v0, v1); } +inline int8x16_t vsubq(const int8x16_t & v0, const int8x16_t & v1) { return vsubq_s8 (v0, v1); } +inline uint16x8_t vsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vsubq_u16(v0, v1); } +inline int16x8_t vsubq(const int16x8_t & v0, const int16x8_t & v1) { return vsubq_s16(v0, v1); } +inline uint32x4_t vsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vsubq_u32(v0, v1); } +inline int32x4_t vsubq(const int32x4_t & v0, const int32x4_t & v1) { return vsubq_s32(v0, v1); } +inline float32x4_t vsubq(const float32x4_t & v0, const float32x4_t & v1) { return vsubq_f32(v0, v1); } + +////////////////////////////// vsub /////////////////////// + +inline uint8x8_t vsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vsub_u8 (v0, v1); } +inline int8x8_t vsub(const int8x8_t & v0, const int8x8_t & v1) { return vsub_s8 (v0, v1); } +inline uint16x4_t vsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vsub_u16(v0, v1); } +inline int16x4_t vsub(const int16x4_t & v0, const int16x4_t & v1) { return vsub_s16(v0, v1); } +inline uint32x2_t vsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vsub_u32(v0, v1); } +inline int32x2_t vsub(const int32x2_t & v0, const int32x2_t & v1) { return vsub_s32(v0, v1); } +inline float32x2_t vsub(const float32x2_t & v0, const float32x2_t & v1) { return vsub_f32(v0, v1); } + +////////////////////////////// vqsubq /////////////////////// + +inline uint8x16_t vqsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqsubq_u8 (v0, v1); } +inline int8x16_t vqsubq(const int8x16_t & v0, const int8x16_t & v1) { return vqsubq_s8 (v0, v1); } +inline uint16x8_t vqsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqsubq_u16(v0, v1); } +inline int16x8_t vqsubq(const int16x8_t & v0, const int16x8_t & v1) { return vqsubq_s16(v0, v1); } +inline uint32x4_t vqsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqsubq_u32(v0, v1); } +inline int32x4_t vqsubq(const int32x4_t & v0, const int32x4_t & v1) { return vqsubq_s32(v0, v1); } +inline uint64x2_t vqsubq(const uint64x2_t & v0, const uint64x2_t & v1) { return vqsubq_u64(v0, v1); } +inline int64x2_t vqsubq(const int64x2_t & v0, const int64x2_t & v1) { return vqsubq_s64(v0, v1); } + +////////////////////////////// vqsub /////////////////////// + +inline uint8x8_t vqsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vqsub_u8 (v0, v1); } +inline int8x8_t vqsub(const int8x8_t & v0, const int8x8_t & v1) { return vqsub_s8 (v0, v1); } +inline uint16x4_t vqsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vqsub_u16(v0, v1); } +inline int16x4_t vqsub(const int16x4_t & v0, const int16x4_t & v1) { return vqsub_s16(v0, v1); } +inline uint32x2_t vqsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vqsub_u32(v0, v1); } +inline int32x2_t vqsub(const int32x2_t & v0, const int32x2_t & v1) { return vqsub_s32(v0, v1); } +inline uint64x1_t vqsub(const uint64x1_t & v0, const uint64x1_t & v1) { return vqsub_u64(v0, v1); } +inline int64x1_t vqsub(const int64x1_t & v0, const int64x1_t & v1) { return vqsub_s64(v0, v1); } + 
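+// Design note: the wrappers above and below are plain overload sets over the NEON
+// intrinsics. Type-generic transform code can call vld1q/vst1q/vqsub/vmull/...
+// uniformly and let C++ overload resolution pick the intrinsic matching the
+// element type, instead of dispatching on the type by hand.
+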
+////////////////////////////// vmull /////////////////////// + +inline uint16x8_t vmull(const uint8x8_t & v0, const uint8x8_t & v1) { return vmull_u8 (v0, v1); } +inline int16x8_t vmull(const int8x8_t & v0, const int8x8_t & v1) { return vmull_s8 (v0, v1); } +inline uint32x4_t vmull(const uint16x4_t & v0, const uint16x4_t & v1) { return vmull_u16(v0, v1); } +inline int32x4_t vmull(const int16x4_t & v0, const int16x4_t & v1) { return vmull_s16(v0, v1); } +inline uint64x2_t vmull(const uint32x2_t & v0, const uint32x2_t & v1) { return vmull_u32(v0, v1); } +inline int64x2_t vmull(const int32x2_t & v0, const int32x2_t & v1) { return vmull_s32(v0, v1); } + +////////////////////////////// vrev64q /////////////////////// + +inline uint8x16_t vrev64q(const uint8x16_t & v) { return vrev64q_u8 (v); } +inline int8x16_t vrev64q(const int8x16_t & v) { return vrev64q_s8 (v); } +inline uint16x8_t vrev64q(const uint16x8_t & v) { return vrev64q_u16(v); } +inline int16x8_t vrev64q(const int16x8_t & v) { return vrev64q_s16(v); } +inline uint32x4_t vrev64q(const uint32x4_t & v) { return vrev64q_u32(v); } +inline int32x4_t vrev64q(const int32x4_t & v) { return vrev64q_s32(v); } +inline float32x4_t vrev64q(const float32x4_t & v) { return vrev64q_f32(v); } + +////////////////////////////// vrev64 /////////////////////// + +inline uint8x8_t vrev64(const uint8x8_t & v) { return vrev64_u8 (v); } +inline int8x8_t vrev64(const int8x8_t & v) { return vrev64_s8 (v); } +inline uint16x4_t vrev64(const uint16x4_t & v) { return vrev64_u16(v); } +inline int16x4_t vrev64(const int16x4_t & v) { return vrev64_s16(v); } +inline uint32x2_t vrev64(const uint32x2_t & v) { return vrev64_u32(v); } +inline int32x2_t vrev64(const int32x2_t & v) { return vrev64_s32(v); } +inline float32x2_t vrev64(const float32x2_t & v) { return vrev64_f32(v); } + +////////////////////////////// vceqq /////////////////////// + +inline uint8x16_t vceqq(const uint8x16_t & v0, const uint8x16_t & v1) { return vceqq_u8 (v0, v1); } +inline uint8x16_t vceqq(const int8x16_t & v0, const int8x16_t & v1) { return vceqq_s8 (v0, v1); } +inline uint16x8_t vceqq(const uint16x8_t & v0, const uint16x8_t & v1) { return vceqq_u16(v0, v1); } +inline uint16x8_t vceqq(const int16x8_t & v0, const int16x8_t & v1) { return vceqq_s16(v0, v1); } +inline uint32x4_t vceqq(const uint32x4_t & v0, const uint32x4_t & v1) { return vceqq_u32(v0, v1); } +inline uint32x4_t vceqq(const int32x4_t & v0, const int32x4_t & v1) { return vceqq_s32(v0, v1); } +inline uint32x4_t vceqq(const float32x4_t & v0, const float32x4_t & v1) { return vceqq_f32(v0, v1); } + +////////////////////////////// vceq /////////////////////// + +inline uint8x8_t vceq(const uint8x8_t & v0, const uint8x8_t & v1) { return vceq_u8 (v0, v1); } +inline uint8x8_t vceq(const int8x8_t & v0, const int8x8_t & v1) { return vceq_s8 (v0, v1); } +inline uint16x4_t vceq(const uint16x4_t & v0, const uint16x4_t & v1) { return vceq_u16(v0, v1); } +inline uint16x4_t vceq(const int16x4_t & v0, const int16x4_t & v1) { return vceq_s16(v0, v1); } +inline uint32x2_t vceq(const uint32x2_t & v0, const uint32x2_t & v1) { return vceq_u32(v0, v1); } +inline uint32x2_t vceq(const int32x2_t & v0, const int32x2_t & v1) { return vceq_s32(v0, v1); } +inline uint32x2_t vceq(const float32x2_t & v0, const float32x2_t & v1) { return vceq_f32(v0, v1); } + +////////////////////////////// vcgtq /////////////////////// + +inline uint8x16_t vcgtq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgtq_u8 (v0, v1); } +inline uint8x16_t vcgtq(const int8x16_t 
& v0, const int8x16_t & v1) { return vcgtq_s8 (v0, v1); } +inline uint16x8_t vcgtq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgtq_u16(v0, v1); } +inline uint16x8_t vcgtq(const int16x8_t & v0, const int16x8_t & v1) { return vcgtq_s16(v0, v1); } +inline uint32x4_t vcgtq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgtq_u32(v0, v1); } +inline uint32x4_t vcgtq(const int32x4_t & v0, const int32x4_t & v1) { return vcgtq_s32(v0, v1); } +inline uint32x4_t vcgtq(const float32x4_t & v0, const float32x4_t & v1) { return vcgtq_f32(v0, v1); } + +////////////////////////////// vcgt /////////////////////// + +inline uint8x8_t vcgt(const uint8x8_t & v0, const uint8x8_t & v1) { return vcgt_u8 (v0, v1); } +inline uint8x8_t vcgt(const int8x8_t & v0, const int8x8_t & v1) { return vcgt_s8 (v0, v1); } +inline uint16x4_t vcgt(const uint16x4_t & v0, const uint16x4_t & v1) { return vcgt_u16(v0, v1); } +inline uint16x4_t vcgt(const int16x4_t & v0, const int16x4_t & v1) { return vcgt_s16(v0, v1); } +inline uint32x2_t vcgt(const uint32x2_t & v0, const uint32x2_t & v1) { return vcgt_u32(v0, v1); } +inline uint32x2_t vcgt(const int32x2_t & v0, const int32x2_t & v1) { return vcgt_s32(v0, v1); } +inline uint32x2_t vcgt(const float32x2_t & v0, const float32x2_t & v1) { return vcgt_f32(v0, v1); } + +////////////////////////////// vcgeq /////////////////////// + +inline uint8x16_t vcgeq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgeq_u8 (v0, v1); } +inline uint8x16_t vcgeq(const int8x16_t & v0, const int8x16_t & v1) { return vcgeq_s8 (v0, v1); } +inline uint16x8_t vcgeq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgeq_u16(v0, v1); } +inline uint16x8_t vcgeq(const int16x8_t & v0, const int16x8_t & v1) { return vcgeq_s16(v0, v1); } +inline uint32x4_t vcgeq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgeq_u32(v0, v1); } +inline uint32x4_t vcgeq(const int32x4_t & v0, const int32x4_t & v1) { return vcgeq_s32(v0, v1); } +inline uint32x4_t vcgeq(const float32x4_t & v0, const float32x4_t & v1) { return vcgeq_f32(v0, v1); } + +////////////////////////////// vcge /////////////////////// + +inline uint8x8_t vcge(const uint8x8_t & v0, const uint8x8_t & v1) { return vcge_u8 (v0, v1); } +inline uint8x8_t vcge(const int8x8_t & v0, const int8x8_t & v1) { return vcge_s8 (v0, v1); } +inline uint16x4_t vcge(const uint16x4_t & v0, const uint16x4_t & v1) { return vcge_u16(v0, v1); } +inline uint16x4_t vcge(const int16x4_t & v0, const int16x4_t & v1) { return vcge_s16(v0, v1); } +inline uint32x2_t vcge(const uint32x2_t & v0, const uint32x2_t & v1) { return vcge_u32(v0, v1); } +inline uint32x2_t vcge(const int32x2_t & v0, const int32x2_t & v1) { return vcge_s32(v0, v1); } +inline uint32x2_t vcge(const float32x2_t & v0, const float32x2_t & v1) { return vcge_f32(v0, v1); } + +////////////////////////////// vandq /////////////////////// + +inline uint8x16_t vandq(const uint8x16_t & v0, const uint8x16_t & v1) { return vandq_u8 (v0, v1); } +inline int8x16_t vandq(const int8x16_t & v0, const int8x16_t & v1) { return vandq_s8 (v0, v1); } +inline uint16x8_t vandq(const uint16x8_t & v0, const uint16x8_t & v1) { return vandq_u16(v0, v1); } +inline int16x8_t vandq(const int16x8_t & v0, const int16x8_t & v1) { return vandq_s16(v0, v1); } +inline uint32x4_t vandq(const uint32x4_t & v0, const uint32x4_t & v1) { return vandq_u32(v0, v1); } +inline int32x4_t vandq(const int32x4_t & v0, const int32x4_t & v1) { return vandq_s32(v0, v1); } + +////////////////////////////// vand /////////////////////// + 
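Each of these families follows the same idiom: one overloaded name per operation, dispatching to the width- and signedness-suffixed NEON intrinsic, so that templated kernels never have to spell out the _u8/_s16/... variants themselves. The q-register vandq overloads above and the d-register vand overloads just below, for example, are exactly what a generic element-wise functor needs in order to feed the vtransform driver defined at the end of this header. A minimal sketch of such a functor (illustrative only, not part of the patch; it assumes the VecTraits type-traits helper that vtransform itself relies on):

    // Hypothetical bitwise-AND functor for vtransform: the overloaded
    // vandq/vand wrappers select the correct intrinsic for any integer T.
    template <typename T>
    struct OpAnd
    {
        typedef T type;

        // 128-bit path: consumed by vtransform's 32-byte main loop
        void operator()(const typename VecTraits<T>::vec128 & v_src0,
                        const typename VecTraits<T>::vec128 & v_src1,
                        typename VecTraits<T>::vec128 & v_dst) const
        { v_dst = vandq(v_src0, v_src1); }

        // 64-bit path: consumed by the 8-byte tail loop
        void operator()(const typename VecTraits<T>::vec64 & v_src0,
                        const typename VecTraits<T>::vec64 & v_src1,
                        typename VecTraits<T>::vec64 & v_dst) const
        { v_dst = vand(v_src0, v_src1); }

        // scalar path: consumed for the final leftover elements
        void operator()(const T * src0, const T * src1, T * dst) const
        { *dst = (T)(*src0 & *src1); }
    };

Instantiated as, say, OpAnd<u8> and passed as the op argument of vtransform, the three overloads serve the main vector loop, the tail loop, and the scalar remainder respectively, while overload resolution on vandq/vand keeps the functor independent of the concrete element type.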
+inline uint8x8_t vand(const uint8x8_t & v0, const uint8x8_t & v1) { return vand_u8 (v0, v1); } +inline int8x8_t vand(const int8x8_t & v0, const int8x8_t & v1) { return vand_s8 (v0, v1); } +inline uint16x4_t vand(const uint16x4_t & v0, const uint16x4_t & v1) { return vand_u16(v0, v1); } +inline int16x4_t vand(const int16x4_t & v0, const int16x4_t & v1) { return vand_s16(v0, v1); } +inline uint32x2_t vand(const uint32x2_t & v0, const uint32x2_t & v1) { return vand_u32(v0, v1); } +inline int32x2_t vand(const int32x2_t & v0, const int32x2_t & v1) { return vand_s32(v0, v1); } + +////////////////////////////// vmovn /////////////////////// + +inline uint8x8_t vmovn(const uint16x8_t & v) { return vmovn_u16(v); } +inline int8x8_t vmovn(const int16x8_t & v) { return vmovn_s16(v); } +inline uint16x4_t vmovn(const uint32x4_t & v) { return vmovn_u32(v); } +inline int16x4_t vmovn(const int32x4_t & v) { return vmovn_s32(v); } +inline uint32x2_t vmovn(const uint64x2_t & v) { return vmovn_u64(v); } +inline int32x2_t vmovn(const int64x2_t & v) { return vmovn_s64(v); } + +////////////////////////////// vqmovn /////////////////////// + +inline uint8x8_t vqmovn(const uint16x8_t & v) { return vqmovn_u16(v); } +inline int8x8_t vqmovn(const int16x8_t & v) { return vqmovn_s16(v); } +inline uint16x4_t vqmovn(const uint32x4_t & v) { return vqmovn_u32(v); } +inline int16x4_t vqmovn(const int32x4_t & v) { return vqmovn_s32(v); } +inline uint32x2_t vqmovn(const uint64x2_t & v) { return vqmovn_u64(v); } +inline int32x2_t vqmovn(const int64x2_t & v) { return vqmovn_s64(v); } + +////////////////////////////// vmovl /////////////////////// + +inline uint16x8_t vmovl(const uint8x8_t & v) { return vmovl_u8(v); } +inline int16x8_t vmovl(const int8x8_t & v) { return vmovl_s8(v); } +inline uint32x4_t vmovl(const uint16x4_t & v) { return vmovl_u16(v); } +inline int32x4_t vmovl(const int16x4_t & v) { return vmovl_s16(v); } + +////////////////////////////// vmvnq /////////////////////// + +inline uint8x16_t vmvnq(const uint8x16_t & v) { return vmvnq_u8 (v); } +inline int8x16_t vmvnq(const int8x16_t & v) { return vmvnq_s8 (v); } +inline uint16x8_t vmvnq(const uint16x8_t & v) { return vmvnq_u16(v); } +inline int16x8_t vmvnq(const int16x8_t & v) { return vmvnq_s16(v); } +inline uint32x4_t vmvnq(const uint32x4_t & v) { return vmvnq_u32(v); } +inline int32x4_t vmvnq(const int32x4_t & v) { return vmvnq_s32(v); } + +////////////////////////////// vmvn /////////////////////// + +inline uint8x8_t vmvn(const uint8x8_t & v) { return vmvn_u8 (v); } +inline int8x8_t vmvn(const int8x8_t & v) { return vmvn_s8 (v); } +inline uint16x4_t vmvn(const uint16x4_t & v) { return vmvn_u16(v); } +inline int16x4_t vmvn(const int16x4_t & v) { return vmvn_s16(v); } +inline uint32x2_t vmvn(const uint32x2_t & v) { return vmvn_u32(v); } +inline int32x2_t vmvn(const int32x2_t & v) { return vmvn_s32(v); } + +////////////////////////////// vbicq /////////////////////// + +inline uint8x16_t vbicq(const uint8x16_t & v0, const uint8x16_t & v1) { return vbicq_u8 (v0, v1); } +inline int8x16_t vbicq(const int8x16_t & v0, const int8x16_t & v1) { return vbicq_s8 (v0, v1); } +inline uint16x8_t vbicq(const uint16x8_t & v0, const uint16x8_t & v1) { return vbicq_u16(v0, v1); } +inline int16x8_t vbicq(const int16x8_t & v0, const int16x8_t & v1) { return vbicq_s16(v0, v1); } +inline uint32x4_t vbicq(const uint32x4_t & v0, const uint32x4_t & v1) { return vbicq_u32(v0, v1); } +inline int32x4_t vbicq(const int32x4_t & v0, const int32x4_t & v1) { return vbicq_s32(v0, v1); } 
+inline uint64x2_t vbicq(const uint64x2_t & v0, const uint64x2_t & v1) { return vbicq_u64(v0, v1); }
+inline  int64x2_t vbicq(const  int64x2_t & v0, const  int64x2_t & v1) { return vbicq_s64(v0, v1); }
+
+////////////////////////////// vbic ///////////////////////
+
+inline uint8x8_t  vbic(const uint8x8_t  & v0, const uint8x8_t  & v1) { return vbic_u8 (v0, v1); }
+inline  int8x8_t  vbic(const  int8x8_t  & v0, const  int8x8_t  & v1) { return vbic_s8 (v0, v1); }
+inline uint16x4_t vbic(const uint16x4_t & v0, const uint16x4_t & v1) { return vbic_u16(v0, v1); }
+inline  int16x4_t vbic(const  int16x4_t & v0, const  int16x4_t & v1) { return vbic_s16(v0, v1); }
+inline uint32x2_t vbic(const uint32x2_t & v0, const uint32x2_t & v1) { return vbic_u32(v0, v1); }
+inline  int32x2_t vbic(const  int32x2_t & v0, const  int32x2_t & v1) { return vbic_s32(v0, v1); }
+inline uint64x1_t vbic(const uint64x1_t & v0, const uint64x1_t & v1) { return vbic_u64(v0, v1); }
+inline  int64x1_t vbic(const  int64x1_t & v0, const  int64x1_t & v1) { return vbic_s64(v0, v1); }
+
+////////////////////////////// vtransform ///////////////////////
+
+template <typename Op>
+void vtransform(Size2D size,
+                const typename Op::type * src0Base, ptrdiff_t src0Stride,
+                const typename Op::type * src1Base, ptrdiff_t src1Stride,
+                typename Op::type * dstBase, ptrdiff_t dstStride, const Op & op)
+{
+    typedef typename Op::type type;
+    typedef typename VecTraits<type>::vec128 vec128;
+    typedef typename VecTraits<type>::vec64 vec64;
+
+    if (src0Stride == src1Stride && src0Stride == dstStride &&
+        src0Stride == (ptrdiff_t)(size.width * sizeof(type)))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+
+    const size_t step_base = 32 / sizeof(type);
+    size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
+    const size_t step_tail = 8 / sizeof(type);
+    size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
+
+    for (size_t y = 0; y < size.height; ++y)
+    {
+        const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
+        const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
+        typename Op::type * dst = internal::getRowPtr(dstBase, dstStride, y);
+        size_t x = 0;
+
+        for( ; x < roiw_base; x += step_base )
+        {
+            internal::prefetch(src0 + x);
+            internal::prefetch(src1 + x);
+
+            vec128 v_src00 = vld1q(src0 + x), v_src01 = vld1q(src0 + x + 16 / sizeof(type));
+            vec128 v_src10 = vld1q(src1 + x), v_src11 = vld1q(src1 + x + 16 / sizeof(type));
+            vec128 v_dst;
+
+            op(v_src00, v_src10, v_dst);
+            vst1q(dst + x, v_dst);
+
+            op(v_src01, v_src11, v_dst);
+            vst1q(dst + x + 16 / sizeof(type), v_dst);
+        }
+        for( ; x < roiw_tail; x += step_tail )
+        {
+            vec64 v_src0 = vld1(src0 + x);
+            vec64 v_src1 = vld1(src1 + x);
+            vec64 v_dst;
+
+            op(v_src0, v_src1, v_dst);
+            vst1(dst + x, v_dst);
+        }
+
+        for (; x < size.width; ++x)
+        {
+            op(src0 + x, src1 + x, dst + x);
+        }
+    }
+}
+
+} }
+
+#endif // CAROTENE_NEON
+
+#endif
diff --git a/3rdparty/carotene/src/warp_affine.cpp b/3rdparty/carotene/src/warp_affine.cpp
new file mode 100644
index 0000000000..d546efbc10
--- /dev/null
+++ b/3rdparty/carotene/src/warp_affine.cpp
@@ -0,0 +1,434 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "remap.hpp" + +namespace CAROTENE_NS { + +bool isWarpAffineNearestNeighborSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +bool isWarpAffineLinearSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpAffineNearestNeighborSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + s32 * map = alignPtr(_map, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t 
y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf))); + int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf))); + int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + src_x = std::max(0, std::min(ssize.width - 1, src_x)); + src_y = std::max(0, std::min(ssize.height - 1, src_y)); + map_row[x] = src_y * srcStride + src_x; + } + } + + // make remap + remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + int32x4_t v_m1_4 = vdupq_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)), + vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4))); + int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) && + (src_y >= 0) && (src_y < (s32)ssize.height) ? 
src_y * srcStride + src_x : -1; + } + } + + // make remap + remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +void warpAffineLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpAffineLinearSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + s32 * map = alignPtr(_map, 16); + f32 * coeffs = alignPtr(_coeffs, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x); + v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y); + + int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x)); + int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y)); + int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x))); + int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y))); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_dst0_x, 
v_dst1_y, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + + s32 src0_x = (s32)floorf(src_x_f); + s32 src0_y = (s32)floorf(src_y_f); + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + s32 src1_y = std::max(0, std::min(ssize.height - 1, src0_y + 1)); + src0_y = std::max(0, std::min(ssize.height - 1, src0_y)); + s32 src1_x = std::max(0, std::min(ssize.width - 1, src0_x + 1)); + src0_x = std::max(0, std::min(ssize.width - 1, src0_x)); + + map_row[(x << 2) + 0] = src0_y * srcStride + src0_x; + map_row[(x << 2) + 1] = src0_y * srcStride + src1_x; + map_row[(x << 2) + 2] = src1_y * srcStride + src0_x; + map_row[(x << 2) + 3] = src1_y * srcStride + src1_x; + } + } + + remapLinearReplicate(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), 
vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + + s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1; + s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1; + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1; + map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1; + map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1; + map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1; + } + } + + remapLinearConst(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/warp_perspective.cpp b/3rdparty/carotene/src/warp_perspective.cpp new file mode 100644 index 0000000000..4437661413 --- /dev/null +++ b/3rdparty/carotene/src/warp_perspective.cpp @@ -0,0 +1,464 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + + + +#include "remap.hpp" + +namespace CAROTENE_NS { + +bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +bool isWarpPerspectiveLinearSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpPerspectiveNearestNeighborSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + s32 * map = alignPtr(_map, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + float32x4_t v_m6 = vdupq_n_f32(m[6]); + float32x4_t v_m7 = vdupq_n_f32(m[7]); + float32x4_t v_m8 = vdupq_n_f32(m[8]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = 
vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf))); + int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf))); + int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + src_x = std::max(0, std::min(ssize.width - 1, src_x)); + src_y = std::max(0, std::min(ssize.height - 1, src_y)); + map_row[x] = src_y * srcStride + src_x; + } + } + + // make remap + remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + int32x4_t v_m1_4 = vdupq_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)), + vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4))); + int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) && + (src_y >= 0) && (src_y < (s32)ssize.height) ? 
src_y * srcStride + src_x : -1; + } + } + + // make remap + remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpPerspectiveLinearSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + s32 * map = alignPtr(_map, 16); + f32 * coeffs = alignPtr(_coeffs, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + float32x4_t v_m6 = vdupq_n_f32(m[6]); + float32x4_t v_m7 = vdupq_n_f32(m[7]); + float32x4_t v_m8 = vdupq_n_f32(m[8]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x); + v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y); + + int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x)); + int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y)); + int32x4_t v_dst1_x = vmaxq_s32(v_zero4, 
vminq_s32(v_width4, vaddq_s32(v_1, v_src_x))); + int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y))); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + + s32 src0_x = (s32)floorf(src_x_f); + s32 src0_y = (s32)floorf(src_y_f); + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + s32 src1_y = std::max(0, std::min(ssize.height - 1, src0_y + 1)); + src0_y = std::max(0, std::min(ssize.height - 1, src0_y)); + s32 src1_x = std::max(0, std::min(ssize.width - 1, src0_x + 1)); + src0_x = std::max(0, std::min(ssize.width - 1, src0_x)); + + map_row[(x << 2) + 0] = src0_y * srcStride + src0_x; + map_row[(x << 2) + 1] = src0_y * srcStride + src1_x; + map_row[(x << 2) + 2] = src1_y * srcStride + src0_x; + map_row[(x << 2) + 3] = src1_y * srcStride + src1_x; + } + } + + remapLinearReplicate(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, 
v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + + s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1; + s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1; + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1; + map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1; + map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1; + map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? 
src1_y * srcStride + src1_x : -1; + } + } + + remapLinearConst(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/tbb/CMakeLists.txt b/3rdparty/tbb/CMakeLists.txt index a76854d4a3..eddeaef56a 100644 --- a/3rdparty/tbb/CMakeLists.txt +++ b/3rdparty/tbb/CMakeLists.txt @@ -5,9 +5,9 @@ if (WIN32 AND NOT ARM) message(FATAL_ERROR "BUILD_TBB option supports Windows on ARM only!\nUse regular official TBB build instead of the BUILD_TBB option!") endif() -set(tbb_ver "tbb43_20141204oss") -set(tbb_url "http://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb43_20141204oss_src.tgz") -set(tbb_md5 "e903dd92d9433701f097fa7ca29a3c1f") +set(tbb_ver "tbb44_20160128oss") +set(tbb_url "http://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb44_20160128oss_src_0.tgz") +set(tbb_md5 "9d8a4cdf43496f1b3f7c473a5248e5cc") set(tbb_version_file "version_string.ver") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4702) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow) diff --git a/CMakeLists.txt b/CMakeLists.txt index f043acd614..7ea42680b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,13 +81,14 @@ if(POLICY CMP0026) cmake_policy(SET CMP0026 OLD) endif() -if (POLICY CMP0042) - # silence cmake 3.0+ warnings about MACOSX_RPATH - cmake_policy(SET CMP0042 OLD) +if(POLICY CMP0042) + cmake_policy(SET CMP0042 NEW) endif() +include(cmake/OpenCVUtils.cmake) + # must go before the project command -set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Configs" FORCE) +ocv_update(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Configs" FORCE) if(DEFINED CMAKE_BUILD_TYPE) set_property( CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES} ) endif() @@ -100,8 +101,6 @@ if(MSVC) set(CMAKE_USE_RELATIVE_PATHS ON CACHE INTERNAL "" FORCE) endif() -include(cmake/OpenCVUtils.cmake) - ocv_cmake_eval(DEBUG_PRE ONCE) ocv_clear_vars(OpenCVModules_TARGETS) @@ -170,6 +169,7 @@ endif() OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT) ) OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS) OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE ) +OCV_OPTION(WITH_CAROTENE "Use NVidia carotene acceleration library for ARM platform" ON IF (ARM OR AARCH64) AND NOT IOS AND NOT (CMAKE_VERSION VERSION_LESS "2.8.11")) OCV_OPTION(WITH_VTK "Include VTK library support (and build opencv_viz module eiher)" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT AND NOT CMAKE_CROSSCOMPILING) ) OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (NOT IOS AND NOT WINRT) ) OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (NOT IOS AND NOT WINRT) ) @@ -304,50 +304,50 @@ include(cmake/OpenCVVersion.cmake) # ---------------------------------------------------------------------------- # Save libs and executables in the same place -set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications" ) +set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications") -if (ANDROID) - if (ANDROID_ABI MATCHES "NEON") +if(ANDROID) + if(ANDROID_ABI MATCHES "NEON") set(ENABLE_NEON ON) 
endif() - if (ANDROID_ABI MATCHES "VFPV3") + if(ANDROID_ABI MATCHES "VFPV3") set(ENABLE_VFPV3 ON) endif() endif() if(ANDROID OR WIN32) - set(OPENCV_DOC_INSTALL_PATH doc) + ocv_update(OPENCV_DOC_INSTALL_PATH doc) else() - set(OPENCV_DOC_INSTALL_PATH share/OpenCV/doc) + ocv_update(OPENCV_DOC_INSTALL_PATH share/OpenCV/doc) endif() if(WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows) if(DEFINED OpenCV_RUNTIME AND DEFINED OpenCV_ARCH) - set(OpenCV_INSTALL_BINARIES_PREFIX "${OpenCV_ARCH}/${OpenCV_RUNTIME}/") + ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "${OpenCV_ARCH}/${OpenCV_RUNTIME}/") else() message(STATUS "Can't detect runtime and/or arch") - set(OpenCV_INSTALL_BINARIES_PREFIX "") + ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "") endif() elseif(ANDROID) - set(OpenCV_INSTALL_BINARIES_PREFIX "sdk/native/") + ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "sdk/native/") else() - set(OpenCV_INSTALL_BINARIES_PREFIX "") + ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "") endif() if(ANDROID) - set(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples/${ANDROID_NDK_ABI_NAME}") + ocv_update(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples/${ANDROID_NDK_ABI_NAME}") else() - set(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples") + ocv_update(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples") endif() if(ANDROID) - set(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin/${ANDROID_NDK_ABI_NAME}") + ocv_update(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin/${ANDROID_NDK_ABI_NAME}") else() - set(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin") + ocv_update(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin") endif() if(NOT OPENCV_TEST_INSTALL_PATH) - set(OPENCV_TEST_INSTALL_PATH "${OPENCV_BIN_INSTALL_PATH}") + ocv_update(OPENCV_TEST_INSTALL_PATH "${OPENCV_BIN_INSTALL_PATH}") endif() if (OPENCV_TEST_DATA_PATH) @@ -356,66 +356,74 @@ endif() if(OPENCV_TEST_DATA_PATH AND NOT OPENCV_TEST_DATA_INSTALL_PATH) if(ANDROID) - set(OPENCV_TEST_DATA_INSTALL_PATH "sdk/etc/testdata") + ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "sdk/etc/testdata") elseif(WIN32) - set(OPENCV_TEST_DATA_INSTALL_PATH "testdata") + ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "testdata") else() - set(OPENCV_TEST_DATA_INSTALL_PATH "share/OpenCV/testdata") + ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "share/OpenCV/testdata") endif() endif() if(ANDROID) - set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib/${ANDROID_NDK_ABI_NAME}") - set(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib/${ANDROID_NDK_ABI_NAME}") - set(OPENCV_LIB_INSTALL_PATH sdk/native/libs/${ANDROID_NDK_ABI_NAME}) - set(OPENCV_3P_LIB_INSTALL_PATH sdk/native/3rdparty/libs/${ANDROID_NDK_ABI_NAME}) - set(OPENCV_CONFIG_INSTALL_PATH sdk/native/jni) - set(OPENCV_INCLUDE_INSTALL_PATH sdk/native/jni/include) - set(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native) - set(OPENCV_OTHER_INSTALL_PATH sdk/etc) + set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib/${ANDROID_NDK_ABI_NAME}") + ocv_update(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib/${ANDROID_NDK_ABI_NAME}") + ocv_update(OPENCV_LIB_INSTALL_PATH sdk/native/libs/${ANDROID_NDK_ABI_NAME}) + ocv_update(OPENCV_3P_LIB_INSTALL_PATH sdk/native/3rdparty/libs/${ANDROID_NDK_ABI_NAME}) + ocv_update(OPENCV_CONFIG_INSTALL_PATH sdk/native/jni) + ocv_update(OPENCV_INCLUDE_INSTALL_PATH sdk/native/jni/include) + ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native) + ocv_update(OPENCV_OTHER_INSTALL_PATH 
sdk/etc) else() - set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib") - set(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib${LIB_SUFFIX}") + set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib") + ocv_update(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib${LIB_SUFFIX}") if(WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows) if(OpenCV_STATIC) - set(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}") + ocv_update(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}") else() - set(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}lib${LIB_SUFFIX}") + ocv_update(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}lib${LIB_SUFFIX}") endif() - set(OPENCV_3P_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}") - set(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native) - set(OPENCV_JAR_INSTALL_PATH java) - set(OPENCV_OTHER_INSTALL_PATH etc) + ocv_update(OPENCV_3P_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}") + ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native) + ocv_update(OPENCV_JAR_INSTALL_PATH java) + ocv_update(OPENCV_OTHER_INSTALL_PATH etc) + ocv_update(OPENCV_CONFIG_INSTALL_PATH ".") else() - set(OPENCV_LIB_INSTALL_PATH lib${LIB_SUFFIX}) - set(OPENCV_3P_LIB_INSTALL_PATH share/OpenCV/3rdparty/${OPENCV_LIB_INSTALL_PATH}) - set(OPENCV_SAMPLES_SRC_INSTALL_PATH share/OpenCV/samples) - set(OPENCV_JAR_INSTALL_PATH share/OpenCV/java) - set(OPENCV_OTHER_INSTALL_PATH share/OpenCV) - endif() - set(OPENCV_INCLUDE_INSTALL_PATH "include") - - math(EXPR SIZEOF_VOID_P_BITS "8 * ${CMAKE_SIZEOF_VOID_P}") - if(LIB_SUFFIX AND NOT SIZEOF_VOID_P_BITS EQUAL LIB_SUFFIX) - set(OPENCV_CONFIG_INSTALL_PATH lib${LIB_SUFFIX}/cmake/opencv) - else() - set(OPENCV_CONFIG_INSTALL_PATH share/OpenCV) + ocv_update(OPENCV_LIB_INSTALL_PATH lib${LIB_SUFFIX}) + ocv_update(OPENCV_3P_LIB_INSTALL_PATH share/OpenCV/3rdparty/${OPENCV_LIB_INSTALL_PATH}) + ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH share/OpenCV/samples) + ocv_update(OPENCV_JAR_INSTALL_PATH share/OpenCV/java) + ocv_update(OPENCV_OTHER_INSTALL_PATH share/OpenCV) + + if(NOT DEFINED OPENCV_CONFIG_INSTALL_PATH) + math(EXPR SIZEOF_VOID_P_BITS "8 * ${CMAKE_SIZEOF_VOID_P}") + if(LIB_SUFFIX AND NOT SIZEOF_VOID_P_BITS EQUAL LIB_SUFFIX) + ocv_update(OPENCV_CONFIG_INSTALL_PATH lib${LIB_SUFFIX}/cmake/opencv) + else() + ocv_update(OPENCV_CONFIG_INSTALL_PATH share/OpenCV) + endif() + endif() endif() + ocv_update(OPENCV_INCLUDE_INSTALL_PATH "include") endif() -set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_LIB_INSTALL_PATH}") +ocv_update(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_LIB_INSTALL_PATH}") set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) if(INSTALL_TO_MANGLED_PATHS) set(OPENCV_INCLUDE_INSTALL_PATH ${OPENCV_INCLUDE_INSTALL_PATH}/opencv-${OPENCV_VERSION}) - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_3P_LIB_INSTALL_PATH "${OPENCV_3P_LIB_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_SAMPLES_SRC_INSTALL_PATH "${OPENCV_SAMPLES_SRC_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_CONFIG_INSTALL_PATH "${OPENCV_CONFIG_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_DOC_INSTALL_PATH "${OPENCV_DOC_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_JAR_INSTALL_PATH "${OPENCV_JAR_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_TEST_DATA_INSTALL_PATH 
"${OPENCV_TEST_DATA_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_OTHER_INSTALL_PATH "${OPENCV_OTHER_INSTALL_PATH}") + foreach(v + OPENCV_3P_LIB_INSTALL_PATH + OPENCV_SAMPLES_SRC_INSTALL_PATH + OPENCV_CONFIG_INSTALL_PATH + OPENCV_DOC_INSTALL_PATH + OPENCV_JAR_INSTALL_PATH + OPENCV_TEST_DATA_INSTALL_PATH + OPENCV_OTHER_INSTALL_PATH + ) + string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" ${v} "${${v}}") + string(REPLACE "opencv" "opencv-${OPENCV_VERSION}" ${v} "${${v}}") + endforeach() endif() @@ -440,7 +448,7 @@ endif() # ---------------------------------------------------------------------------- # Path for build/platform -specific headers # ---------------------------------------------------------------------------- -set(OPENCV_CONFIG_FILE_INCLUDE_DIR "${CMAKE_BINARY_DIR}/" CACHE PATH "Where to create the platform-dependant cvconfig.h") +ocv_update(OPENCV_CONFIG_FILE_INCLUDE_DIR "${CMAKE_BINARY_DIR}/" CACHE PATH "Where to create the platform-dependant cvconfig.h") ocv_include_directories(${OPENCV_CONFIG_FILE_INCLUDE_DIR}) # ---------------------------------------------------------------------------- @@ -453,7 +461,7 @@ set(OPENCV_EXTRA_MODULES_PATH "" CACHE PATH "Where to look for additional OpenCV # ---------------------------------------------------------------------------- find_host_package(Git QUIET) -if(GIT_FOUND) +if(NOT DEFINED OPENCV_VCSVERSION AND GIT_FOUND) execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty --match "[0-9].[0-9].[0-9]*" WORKING_DIRECTORY "${OpenCV_SOURCE_DIR}" OUTPUT_VARIABLE OPENCV_VCSVERSION @@ -464,7 +472,7 @@ if(GIT_FOUND) if(NOT GIT_RESULT EQUAL 0) set(OPENCV_VCSVERSION "unknown") endif() -else() +elseif(NOT DEFINED OPENCV_VCSVERSION) # We don't have git: set(OPENCV_VCSVERSION "unknown") endif() @@ -627,7 +635,20 @@ endmacro() if(NOT DEFINED OpenCV_HAL) set(OpenCV_HAL "OpenCV_HAL") endif() + +if(WITH_CAROTENE) + ocv_debug_message(STATUS "Enable carotene acceleration") + if(NOT ";${OpenCV_HAL};" MATCHES ";carotene;") + set(OpenCV_HAL "carotene;${OpenCV_HAL}") + endif() +endif() + foreach(hal ${OpenCV_HAL}) + if(hal STREQUAL "carotene") + add_subdirectory(3rdparty/carotene/hal) + ocv_hal_register(CAROTENE_HAL_LIBRARIES CAROTENE_HAL_HEADERS CAROTENE_HAL_INCLUDE_DIRS) + list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION})") + else() ocv_debug_message(STATUS "OpenCV HAL: ${hal} ...") ocv_clear_vars(OpenCV_HAL_LIBRARIES OpenCV_HAL_HEADERS OpenCV_HAL_INCLUDE_DIRS) find_package(${hal} NO_MODULE QUIET) @@ -635,6 +656,7 @@ foreach(hal ${OpenCV_HAL}) ocv_hal_register(OpenCV_HAL_LIBRARIES OpenCV_HAL_HEADERS OpenCV_HAL_INCLUDE_DIRS) list(APPEND OpenCV_USED_HAL "${hal} (ver ${${hal}_VERSION})") endif() + endif() endforeach() configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/custom_hal.hpp.in" "${CMAKE_BINARY_DIR}/custom_hal.hpp" @ONLY) unset(_hal_includes) @@ -652,7 +674,7 @@ if(HAVE_CUDA) set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cufft_LIBRARY}) endif() foreach(p ${CUDA_LIBS_PATH}) - set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} -L${p}) + set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CMAKE_LIBRARY_PATH_FLAG}${p}) endforeach() endif() # ---------------------------------------------------------------------------- diff --git a/apps/visualisation/opencv_visualisation.cpp b/apps/visualisation/opencv_visualisation.cpp index 2c685f521a..75703bd528 100644 --- a/apps/visualisation/opencv_visualisation.cpp +++ b/apps/visualisation/opencv_visualisation.cpp @@ -47,7 +47,7 @@ Software for visualising 
cascade classifier models trained by OpenCV and to gain an understanding of the features used.

USAGE:
-./visualise_models -model <model.xml> -image <ref.png> -data <output folder>
+./opencv_visualisation --model=<model.xml> --image=<ref.png> --data=<video output folder>
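Stepping back to the warp kernels added above: warpAffine and warpPerspective, in both nearest-neighbour and linear flavours, all share one structure. For each BLOCK_SIZE x BLOCK_SIZE tile they first build a table of precomputed source offsets (plus interpolation coefficients in the linear case), with -1 marking out-of-bounds pixels under BORDER_MODE_CONSTANT, and only then hand that table to a shared remap routine. A scalar sketch of the nearest-neighbour, constant-border case (standalone and simplified to whole-image granularity; the tiling, NEON vectorisation, and carotene type aliases are omitted):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Phase 1: build an index map. m is the 2x3 affine matrix laid out as in
    // the patch: src_x = m[0]*x + m[2]*y + m[4], src_y = m[1]*x + m[3]*y + m[5].
    // Out-of-range pixels map to -1 (the BORDER_MODE_CONSTANT convention).
    static void buildAffineNNMap(int sw, int sh, int dw, int dh,
                                 const float m[6], ptrdiff_t srcStride,
                                 std::vector<int32_t> & map)
    {
        map.resize((size_t)dw * dh);
        for (int y = 0; y < dh; ++y)
        {
            float yx = m[2] * y + m[4], yy = m[3] * y + m[5];
            for (int x = 0; x < dw; ++x)
            {
                int sx = (int)std::floor(m[0] * x + yx);
                int sy = (int)std::floor(m[1] * x + yy);
                bool inside = sx >= 0 && sx < sw && sy >= 0 && sy < sh;
                map[(size_t)y * dw + x] = inside ? (int32_t)(sy * srcStride + sx) : -1;
            }
        }
    }

    // Phase 2: "remap" — a gather through the table; -1 selects the border value.
    static void remapNNConst(const uint8_t * src, const std::vector<int32_t> & map,
                             int dw, int dh, uint8_t * dst, ptrdiff_t dstStride,
                             uint8_t borderValue)
    {
        for (int y = 0; y < dh; ++y)
            for (int x = 0; x < dw; ++x)
            {
                int32_t idx = map[(size_t)y * dw + x];
                dst[y * dstStride + x] = idx >= 0 ? src[idx] : borderValue;
            }
    }

Splitting the work this way keeps phase 1, the pure address arithmetic, in a form the NEON code above can evaluate four pixels at a time, while phase 2 degenerates to a simple gather that the remapNearestNeighbor*/remapLinear* helpers can share between the affine and perspective paths.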